1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
24 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /*
33 * DTrace - Dynamic Tracing for Solaris
34 *
35 * This is the implementation of the Solaris Dynamic Tracing framework
36 * (DTrace). The user-visible interface to DTrace is described at length in
37 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
38 * library, the in-kernel DTrace framework, and the DTrace providers are
39 * described in the block comments in the <sys/dtrace.h> header file. The
40 * internal architecture of DTrace is described in the block comments in the
41 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
42 * implementation very much assume mastery of all of these sources; if one has
43 * an unanswered question about the implementation, one should consult them
44 * first.
45 *
46 * The functions here are ordered roughly as follows:
47 *
48 * - Probe context functions
49 * - Probe hashing functions
50 * - Non-probe context utility functions
51 * - Matching functions
52 * - Provider-to-Framework API functions
53 * - Probe management functions
54 * - DIF object functions
55 * - Format functions
56 * - Predicate functions
57 * - ECB functions
58 * - Buffer functions
59 * - Enabling functions
60 * - DOF functions
61 * - Anonymous enabling functions
62 * - Process functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/random.h>
77 #include <sys/systm.h>
78 #include <sys/dtrace_impl.h>
79 #include <sys/param.h>
80 #include <sys/proc_internal.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <miscfs/devfs/devfs.h>
84 #include <sys/malloc.h>
85 #include <sys/kernel_types.h>
86 #include <sys/proc_internal.h>
87 #include <sys/uio_internal.h>
88 #include <sys/kauth.h>
89 #include <vm/pmap.h>
90 #include <sys/user.h>
91 #include <mach/exception_types.h>
92 #include <sys/signalvar.h>
93 #include <mach/task.h>
94 #include <kern/ast.h>
95 #include <kern/hvg_hypercall.h>
96 #include <kern/sched_prim.h>
97 #include <kern/processor.h>
98 #include <kern/task.h>
99 #include <kern/zalloc.h>
100 #include <netinet/in.h>
101 #include <libkern/sysctl.h>
102 #include <sys/kdebug.h>
103 #include <sys/sdt_impl.h>
104
105 #if CONFIG_PERVASIVE_CPI
106 #include <kern/monotonic.h>
107 #include <machine/monotonic.h>
108 #endif /* CONFIG_PERVASIVE_CPI */
109
110 #include "dtrace_xoroshiro128_plus.h"
111
112 #include <IOKit/IOPlatformExpert.h>
113
114 #include <kern/cpu_data.h>
115
116 extern addr64_t kvtophys(vm_offset_t va);
117
118 extern uint32_t pmap_find_phys(void *, uint64_t);
119 extern boolean_t pmap_valid_page(uint32_t);
120 extern void OSKextRegisterKextsWithDTrace(void);
121 extern kmod_info_t g_kernel_kmod_info;
122 extern void commpage_update_dof(boolean_t enabled);
123
124 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
125 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
126
127 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
128
129 extern void dtrace_suspend(void);
130 extern void dtrace_resume(void);
131 extern void dtrace_early_init(void);
132 extern int dtrace_keep_kernel_symbols(void);
133 extern void dtrace_init(void);
134 extern void helper_init(void);
135 extern void fasttrap_init(void);
136
137 static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
138 extern void dtrace_lazy_dofs_destroy(proc_t *);
139 extern void dtrace_postinit(void);
140
141 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
142 extern void dtrace_proc_exec(proc_t*);
143 extern void dtrace_proc_exit(proc_t*);
144
145 /*
146 * DTrace Tunable Variables
147 *
148 * The following variables may be dynamically tuned by using sysctl(8), the
149 * variables being stored in the kern.dtrace namespace. For example:
150 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
151 *
152 * In general, the only variables that one should be tuning this way are those
153 * that affect system-wide DTrace behavior, and for which the default behavior
154 * is undesirable. Most of these variables are tunable on a per-consumer
155 * basis using DTrace options, and need not be tuned on a system-wide basis.
156 * When tuning these variables, avoid pathological values; while some attempt
157 * is made to verify the integrity of these variables, they are not considered
158 * part of the supported interface to DTrace, and they are therefore not
159 * checked comprehensively.
160 */
uint64_t dtrace_buffer_memory_maxsize = 0; /* initialized in dtrace_init */
uint64_t dtrace_buffer_memory_inuse = 0; /* exported read-only via sysctl */
int dtrace_destructive_disallow = 1;
dtrace_optval_t dtrace_nonroot_maxsize = (16 * 1024 * 1024); /* 16 MB */
size_t dtrace_difo_maxsize = (256 * 1024); /* 256 KB */
dtrace_optval_t dtrace_dof_maxsize = (512 * 1024); /* 512 KB */
dtrace_optval_t dtrace_statvar_maxsize = (16 * 1024); /* 16 KB */
dtrace_optval_t dtrace_statvar_maxsize_max = (16 * 10 * 1024); /* 160 KB; upper bound for the above */
size_t dtrace_actions_max = (16 * 1024);
size_t dtrace_retain_max = 1024;
dtrace_optval_t dtrace_helper_actions_max = 32;
dtrace_optval_t dtrace_helper_providers_max = 64;
dtrace_optval_t dtrace_dstate_defsize = (1 * 1024 * 1024); /* 1 MB */
size_t dtrace_strsize_default = 256;
dtrace_optval_t dtrace_strsize_min = 8;
dtrace_optval_t dtrace_strsize_max = 65536;
dtrace_optval_t dtrace_cleanrate_default = 990099000; /* ~1.01 hz (period in ns) */
dtrace_optval_t dtrace_cleanrate_min = 20000000; /* 50 hz */
dtrace_optval_t dtrace_cleanrate_max = (uint64_t)60 * NANOSEC; /* 1/minute */
dtrace_optval_t dtrace_aggrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_statusrate_max = (hrtime_t)10 * NANOSEC; /* 6/minute */
dtrace_optval_t dtrace_switchrate_default = NANOSEC; /* 1 hz */
dtrace_optval_t dtrace_nspec_default = 1;
dtrace_optval_t dtrace_specsize_default = 32 * 1024; /* 32 KB */
dtrace_optval_t dtrace_stackframes_default = 20;
dtrace_optval_t dtrace_ustackframes_default = 20;
dtrace_optval_t dtrace_jstackframes_default = 50;
dtrace_optval_t dtrace_jstackstrsize_default = 512;
dtrace_optval_t dtrace_buflimit_default = 75;
dtrace_optval_t dtrace_buflimit_min = 1;
dtrace_optval_t dtrace_buflimit_max = 99;
size_t dtrace_nprobes_default = 4;
int dtrace_msgdsize_max = 128;
hrtime_t dtrace_chill_max = 500 * (NANOSEC / MILLISEC); /* 500 ms */
hrtime_t dtrace_chill_interval = NANOSEC; /* 1000 ms */
int dtrace_devdepth_max = 32;
int dtrace_err_verbose; /* 0 or 1; settable via kern.dtrace.err_verbose */
hrtime_t dtrace_deadman_interval = NANOSEC; /* 1 s */
hrtime_t dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC; /* 10 s */
hrtime_t dtrace_deadman_user = (hrtime_t)30 * NANOSEC; /* 30 s */
202
203 /*
204 * DTrace External Variables
205 *
206 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
207 * available to DTrace consumers via the backtick (`) syntax. One of these,
208 * dtrace_zero, is made deliberately so: it is provided as a source of
209 * well-known, zero-filled memory. While this variable is not documented,
210 * it is used by some translators as an implementation detail.
211 */
212 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
213 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
214 /*
215 * DTrace Internal Variables
216 */
217 static dev_info_t *dtrace_devi; /* device info */
218 static vmem_t *dtrace_arena; /* probe ID arena */
219 static dtrace_probe_t **dtrace_probes; /* array of all probes */
220 static int dtrace_nprobes; /* number of probes */
221 static dtrace_provider_t *dtrace_provider; /* provider list */
222 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
223 static int dtrace_opens; /* number of opens */
224 static int dtrace_helpers; /* number of helpers */
225 static dtrace_hash_t *dtrace_strings;
226 static dtrace_hash_t *dtrace_byprov; /* probes hashed by provider */
227 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
228 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
229 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
230 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
231 static int dtrace_toxranges; /* number of toxic ranges */
232 static int dtrace_toxranges_max; /* size of toxic range array */
233 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
234 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
235 static kthread_t *dtrace_panicked; /* panicking thread */
236 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
237 static dtrace_genid_t dtrace_probegen; /* current probe generation */
238 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
239 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
240 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
241 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
242
243 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
244
/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a DTrace tunable variable either...
 */
249 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
250 static uint32_t dtrace_wake_clients;
251 static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */
252
253 /*
254 * To save memory, some common memory allocations are given a
255 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
256 * which means it would fall into the kalloc.128 bucket. With
257 * 20k elements allocated, the space saved is substantial.
258 */
259
260 static ZONE_DEFINE_TYPE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t",
261 dtrace_probe_t, ZC_PGZ_USE_GUARDS);
262
263 static ZONE_DEFINE(dtrace_state_pcpu_zone, "dtrace.dtrace_dstate_percpu_t",
264 sizeof(dtrace_dstate_percpu_t), ZC_PERCPU);
265
266 static int dtrace_module_unloaded(struct kmod_info *kmod);
267
268 /*
269 * DTrace Locking
270 * DTrace is protected by three (relatively coarse-grained) locks:
271 *
272 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
273 * including enabling state, probes, ECBs, consumer state, helper state,
274 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
275 * probe context is lock-free -- synchronization is handled via the
276 * dtrace_sync() cross call mechanism.
277 *
278 * (2) dtrace_provider_lock is required when manipulating provider state, or
279 * when provider state must be held constant.
280 *
281 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
282 * when meta provider state must be held constant.
283 *
284 * The lock ordering between these three locks is dtrace_meta_lock before
285 * dtrace_provider_lock before dtrace_lock. (In particular, there are
286 * several places where dtrace_provider_lock is held by the framework as it
287 * calls into the providers -- which then call back into the framework,
288 * grabbing dtrace_lock.)
289 *
290 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
291 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
292 * role as a coarse-grained lock; it is acquired before both of these locks.
293 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
294 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
295 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
296 * acquired _between_ dtrace_provider_lock and dtrace_lock.
297 */
298
299
300 /*
301 * APPLE NOTE:
302 *
303 * For porting purposes, all kmutex_t vars have been changed
304 * to lck_mtx_t, which require explicit initialization.
305 *
306 * kmutex_t becomes lck_mtx_t
307 * mutex_enter() becomes lck_mtx_lock()
308 * mutex_exit() becomes lck_mtx_unlock()
309 *
310 * Lock asserts are changed like this:
311 *
312 * ASSERT(MUTEX_HELD(&cpu_lock));
313 * becomes:
314 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
315 *
316 */
317 static LCK_MTX_DECLARE_ATTR(dtrace_lock,
318 &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */
319 static LCK_MTX_DECLARE_ATTR(dtrace_provider_lock,
320 &dtrace_lck_grp, &dtrace_lck_attr); /* provider state lock */
321 static LCK_MTX_DECLARE_ATTR(dtrace_meta_lock,
322 &dtrace_lck_grp, &dtrace_lck_attr); /* meta-provider state lock */
323 static LCK_RW_DECLARE_ATTR(dtrace_dof_mode_lock,
324 &dtrace_lck_grp, &dtrace_lck_attr); /* dof mode lock */
325
326 /*
327 * DTrace Provider Variables
328 *
329 * These are the variables relating to DTrace as a provider (that is, the
330 * provider of the BEGIN, END, and ERROR probes).
331 */
332 static dtrace_pattr_t dtrace_provider_attr = {
333 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
334 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
335 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
336 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
337 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
338 };
339
340 static void
dtrace_provide_nullop(void * arg,const dtrace_probedesc_t * desc)341 dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
342 {
343 #pragma unused(arg, desc)
344 }
345
/*
 * No-op "provide module" callback, installed as .dtps_provide_module in
 * dtrace_provider_ops. Both arguments are ignored.
 */
static void
dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
{
	(void)arg;
	(void)ctl;
}
351
352 static int
dtrace_enable_nullop(void * arg,dtrace_id_t id,void * parg)353 dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
354 {
355 #pragma unused(arg, id, parg)
356 return (0);
357 }
358
359 static void
dtrace_disable_nullop(void * arg,dtrace_id_t id,void * parg)360 dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
361 {
362 #pragma unused(arg, id, parg)
363 }
364
365 static void
dtrace_suspend_nullop(void * arg,dtrace_id_t id,void * parg)366 dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
367 {
368 #pragma unused(arg, id, parg)
369 }
370
371 static void
dtrace_resume_nullop(void * arg,dtrace_id_t id,void * parg)372 dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
373 {
374 #pragma unused(arg, id, parg)
375 }
376
377 static void
dtrace_destroy_nullop(void * arg,dtrace_id_t id,void * parg)378 dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
379 {
380 #pragma unused(arg, id, parg)
381 }
382
383
384 static dtrace_pops_t dtrace_provider_ops = {
385 .dtps_provide = dtrace_provide_nullop,
386 .dtps_provide_module = dtrace_provide_module_nullop,
387 .dtps_enable = dtrace_enable_nullop,
388 .dtps_disable = dtrace_disable_nullop,
389 .dtps_suspend = dtrace_suspend_nullop,
390 .dtps_resume = dtrace_resume_nullop,
391 .dtps_getargdesc = NULL,
392 .dtps_getargval = NULL,
393 .dtps_usermode = NULL,
394 .dtps_destroy = dtrace_destroy_nullop,
395 };
396
397 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
398 static dtrace_id_t dtrace_probeid_end; /* special END probe */
399 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
400
401 /*
402 * DTrace Helper Tracing Variables
403 */
404 uint32_t dtrace_helptrace_next = 0;
405 uint32_t dtrace_helptrace_nlocals;
406 char *dtrace_helptrace_buffer;
407 size_t dtrace_helptrace_bufsize = 512 * 1024;
408
409 #if DEBUG
410 int dtrace_helptrace_enabled = 1;
411 #else
412 int dtrace_helptrace_enabled = 0;
413 #endif
414
415 #if defined (__arm64__)
416 /*
417 * The ioctl for adding helper DOF is based on the
418 * size of a user_addr_t. We need to recognize both
419 * U32 and U64 as the same action.
420 */
421 #define DTRACEHIOC_ADDDOF_U32 _IOW('h', 4, user32_addr_t)
422 #define DTRACEHIOC_ADDDOF_U64 _IOW('h', 4, user64_addr_t)
423 #endif /* __arm64__ */
424
/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
434 #if DEBUG
435 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
436 static const char *dtrace_errlast;
437 static kthread_t *dtrace_errthread;
438 static LCK_MTX_DECLARE_ATTR(dtrace_errlock, &dtrace_lck_grp, &dtrace_lck_attr);
439 #endif
440
441 /*
442 * DTrace Macros and Constants
443 *
444 * These are various macros that are useful in various spots in the
445 * implementation, along with a few random constants that have no meaning
446 * outside of the implementation. There is no real structure to this cpp
447 * mishmash -- but is there ever?
448 */
449
/*
 * Hash-table accessor macros.
 *
 * "hash" is a pointer to a table descriptor providing dth_getstr,
 * dth_stroffs, dth_nextoffs and dth_prevoffs; "elm" is a pointer to an
 * element stored in that table.
 *
 * Every macro parameter and every expansion is fully parenthesized so that
 * arbitrary expressions (e.g. "&table", "tables + 1") may be passed, and so
 * that the result may be followed by a postfix operator such as "[0]".
 * Note that "hash" is evaluated more than once; do not pass an expression
 * with side effects.
 */

/* Fetch the key string of elm via the table's dth_getstr callback. */
#define	DTRACE_GETSTR(hash, elm)	\
	((hash)->dth_getstr((elm), (hash)->dth_stroffs))

/* Hash the key string of elm with dtrace_hash_str(). */
#define	DTRACE_HASHSTR(hash, elm)	\
	(dtrace_hash_str(DTRACE_GETSTR(hash, elm)))

/* Address of the "next" link stored dth_nextoffs bytes into elm. */
#define	DTRACE_HASHNEXT(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_nextoffs))

/* Address of the "prev" link stored dth_prevoffs bytes into elm. */
#define	DTRACE_HASHPREV(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_prevoffs))

/* Non-zero iff the key strings of lhs and rhs compare equal. */
#define	DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs), \
	    DTRACE_GETSTR(hash, rhs)) == 0)
465
466 #define DTRACE_AGGHASHSIZE_SLEW 17
467
468 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
469
/*
 * The key for a thread-local variable needs to be unique to a single
 * thread over the lifetime of the system, and not overlap with any variable
 * IDs. So we take thread's thread_id, a unique 64-bit number that is never
 * reused after the thread exits, and add DIF_VARIABLE_MAX to it, which
 * guarantees that it won't overlap any variable IDs. We also want to treat
 * running in interrupt context as independent of thread-context. So if
 * interrupts are active, we set bit 63 (the most significant bit of the
 * 64-bit key), otherwise it's cleared.
 *
 * This is necessary (but not sufficient) to assure that global associative
 * arrays never collide with thread-local variables. To guarantee that they
 * cannot collide, we must also define the order for keying dynamic variables.
 *
 * That order is:
 *
 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
 *
 * Because the variable-key and the tls-key are in orthogonal spaces, there is
 * no way for a global variable key signature to match a thread-local key
 * signature.
 *
 * DTRACE_TLS_THRKEY(where) stores the computed key into the lvalue "where".
 *
 * NOTE(review): the macro expands to a bare brace block (not a
 * do { } while (0)), so "if (c) DTRACE_TLS_THRKEY(x); else ..." will not
 * compile. It also declares locals named "intr" and "thr", so "where" must
 * not be an expression that refers to caller variables with those names.
 */
#if defined (__x86_64__) || defined(__arm64__)
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = thread_tid(current_thread()); \
	ASSERT(intr < 2); \
	(where) = ((thr + DIF_VARIABLE_MAX) & (~((uint64_t)1 << 63))) | \
	    ((uint64_t)intr << 63); \
}
#else
#error Unknown architecture
#endif
502
/*
 * Byte-swap helpers.
 *
 * DT_BSWAP_N(x) yields the low N bits of x with their bytes reversed, as a
 * uint64_t. The operand is widened to uint64_t before any left shift, so
 * the macros are well defined for operands of any integer type (the previous
 * form shifted in the operand's own type, which overflowed for signed or
 * 32-bit operands). For uint64_t operands the results are unchanged.
 *
 * The argument is evaluated several times; do not pass an expression with
 * side effects.
 */
#define	DT_BSWAP_8(x)	((uint64_t)(x) & 0xff)
#define	DT_BSWAP_16(x)	((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
#define	DT_BSWAP_32(x)	((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
#define	DT_BSWAP_64(x)	((DT_BSWAP_32(x) << 32) | \
	DT_BSWAP_32((uint64_t)(x) >> 32))

/* Mask selecting the low 32 bits of a 64-bit value. */
#define	DT_MASK_LO	0x00000000FFFFFFFFULL
509
/*
 * Store "what", converted to "type", at byte offset "offset" from the
 * buffer base "tomax".
 *
 * "offset" is parenthesized so that low-precedence expressions (for example
 * a conditional expression) are applied to the offset alone rather than to
 * the sum of base and offset. The trailing semicolon is part of the
 * expansion and is kept for compatibility with existing call sites.
 */
#define	DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)(offset))) = (type)(what);
512
513
/*
 * Verify that "addr" is aligned to MIN(size, 4) bytes. On misalignment,
 * set CPU_DTRACE_BADALIGN in *flags, record addr as the illegal value for
 * the current CPU, and return 0 from the ENCLOSING function.
 *
 * All parameters are parenthesized so that compound expressions may be
 * passed. The expansion is a bare "if" statement (kept for compatibility
 * with existing call sites), so it must not be used as the unbraced body of
 * an if/else.
 */
#define	DTRACE_ALIGNCHECK(addr, size, flags)				\
	if ((addr) & (MIN((size), 4) - 1)) {				\
		*(flags) |= CPU_DTRACE_BADALIGN;			\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = (addr);	\
		return (0);						\
	}
520
/*
 * If "remp" is non-NULL, store through it the number of bytes between
 * "addr" and the end of the region [baseaddr, baseaddr + basesz).
 * A NULL "remp" makes this a no-op.
 */
#define	DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz)		\
do {									\
	if ((remp) == NULL)						\
		break;							\
	*(remp) = ((uintptr_t)(baseaddr) + (basesz)) - (addr);		\
} while (0)
527
528
529 /*
530 * Test whether a range of memory starting at testaddr of size testsz falls
531 * within the range of memory described by addr, sz. We take care to avoid
532 * problems with overflow and underflow of the unsigned quantities, and
533 * disallow all negative sizes. Ranges of size 0 are allowed.
534 */
/*
 * Non-zero iff [testaddr, testaddr + testsz) lies inside
 * [baseaddr, baseaddr + basesz). The three clauses, in order:
 *   1. the start lies inside the base region (a start below baseaddr wraps
 *      to a huge unsigned difference and fails);
 *   2. the end does not run past the end of the base region;
 *   3. testaddr + testsz did not wrap around.
 */
#define	DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz)		\
	((((testaddr) - (baseaddr)) < (basesz)) &&			\
	((((testaddr) + (testsz)) - (baseaddr)) <= (basesz)) &&		\
	(((testaddr) + (testsz)) >= (testaddr)))
539
540 /*
541 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
542 * alloc_sz on the righthand side of the comparison in order to avoid overflow
543 * or underflow in the comparison with it. This is simpler than the INRANGE
544 * check above, because we know that the dtms_scratch_ptr is valid in the
545 * range. Allocations of size zero are allowed.
546 */
/*
 * Non-zero iff alloc_sz bytes fit between the current scratch pointer and
 * the end of the scratch region described by mstate. alloc_sz stays alone
 * on the right-hand side so that it takes no part in any arithmetic.
 */
#define	DTRACE_INSCRATCH(mstate, alloc_sz)				\
	((((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size) -	\
	(mstate)->dtms_scratch_ptr) >= (alloc_sz))
550
#if defined (__x86_64__) || defined (__arm64__)
/*
 * DTRACE_LOADFUNC(bits) defines dtrace_load<bits>(addr), the safe load
 * routine that returns the <bits>-wide value at addr, or 0 after flagging
 * an error in the current CPU's cpuc_dtrace_flags. In order, it:
 *
 *   1. rejects misaligned addresses (DTRACE_ALIGNCHECK -> BADALIGN);
 *   2. rejects loads that overlap any entry of dtrace_toxrange[], using the
 *      canonicalized address (BADADDR, illval = addr);
 *   3. sets CPU_DTRACE_NOFAULT, then requires that the backing page be
 *      valid and that dtrace_nofault_copy<bits>() succeed (BADADDR
 *      otherwise), and finally clears CPU_DTRACE_NOFAULT.
 *
 * NOTE(review): the failing path in step 3 returns with CPU_DTRACE_NOFAULT
 * still set; presumably the caller resets the flags -- confirm.
 * NOTE(review): the toxic-range test uses the canonicalized "caddr" while
 * pmap_find_phys() and dtrace_nofault_copy<bits>() are given the original
 * "addr" -- confirm that this is intended.
 */
#define DTRACE_LOADFUNC(bits) \
/*CSTYLED*/ \
uint##bits##_t dtrace_load##bits(uintptr_t addr); \
\
extern int dtrace_nofault_copy##bits(uintptr_t, uint##bits##_t *); \
\
uint##bits##_t \
dtrace_load##bits(uintptr_t addr) \
{ \
	size_t size = bits / NBBY; \
	/*CSTYLED*/ \
	uint##bits##_t rval = 0; \
	int i; \
	volatile uint16_t *flags = (volatile uint16_t *) \
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags; \
	uintptr_t caddr = vm_memtag_canonicalize_kernel(addr); \
	\
	DTRACE_ALIGNCHECK(addr, size, flags); \
	\
	for (i = 0; i < dtrace_toxranges; i++) { \
		if (caddr >= dtrace_toxrange[i].dtt_limit) \
			continue; \
		\
		if (caddr + size <= dtrace_toxrange[i].dtt_base) \
			continue; \
		\
		/* \
		 * This address falls within a toxic region; return 0. \
		 */ \
		*flags |= CPU_DTRACE_BADADDR; \
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
		return (0); \
	} \
	\
	{ \
		*flags |= CPU_DTRACE_NOFAULT; \
		/*CSTYLED*/ \
		/* \
		 * PR6394061 - avoid device memory that is unpredictably \
		 * mapped and unmapped \
		 */ \
		if (!pmap_valid_page(pmap_find_phys(kernel_pmap, addr)) || \
		    dtrace_nofault_copy##bits(addr, &rval)) { \
			*flags |= CPU_DTRACE_BADADDR; \
			cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr; \
			return (0); \
		} \
		\
		*flags &= ~CPU_DTRACE_NOFAULT; \
	} \
	\
	return (rval); \
}
#else /* all other architectures */
#error Unknown Architecture
#endif
608
609 #ifdef __LP64__
610 #define dtrace_loadptr dtrace_load64
611 #else
612 #define dtrace_loadptr dtrace_load32
613 #endif
614
/* Dynamic-variable hash states. */
#define	DTRACE_DYNHASH_FREE	(0)
#define	DTRACE_DYNHASH_SINK	(1)
#define	DTRACE_DYNHASH_VALID	(2)

/* Match result codes. */
#define	DTRACE_MATCH_FAIL	(-1)
#define	DTRACE_MATCH_NEXT	(0)
#define	DTRACE_MATCH_DONE	(1)

/* Non-zero iff the probe's dtpr_func string is non-empty. */
#define	DTRACE_ANCHORED(probe)	(*(probe)->dtpr_func != '\0')

#define	DTRACE_STATE_ALIGN	(64)
624
/*
 * Translate a set of CPU_DTRACE_* error flags into a single DTRACEFLT_*
 * fault code. When several flags are set, the first match in the order
 * listed below wins (BADADDR first, BADSTACK last); if none of the listed
 * flags is set, the result is DTRACEFLT_UNKNOWN.
 */
#define DTRACE_FLAGS2FLT(flags) \
	(((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
	((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
	((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
	((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
	((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
	((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
	((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
	((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
	((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
	DTRACEFLT_UNKNOWN)
636
/*
 * Non-zero iff the action is a DIF expression whose DIF object has a
 * string return type. The kind is tested first, so dta_difo is only
 * dereferenced for DTRACEACT_DIFEXPR actions.
 */
#define	DTRACEACT_ISSTRING(act)						\
	(DTRACEACT_DIFEXPR == (act)->dta_kind &&			\
	DIF_TYPE_STRING == (act)->dta_difo->dtdo_rtype.dtdt_kind)
640
641
642 static size_t dtrace_strlen(const char *, size_t);
643 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
644 static void dtrace_enabling_provide(dtrace_provider_t *);
645 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
646 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
647 static void dtrace_enabling_matchall(void);
648 static dtrace_state_t *dtrace_anon_grab(void);
649 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
650 dtrace_state_t *, uint64_t, uint64_t);
651 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
652 static void dtrace_buffer_drop(dtrace_buffer_t *);
653 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
654 dtrace_state_t *, dtrace_mstate_t *);
655 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
656 dtrace_optval_t);
657 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
658 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
659 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
660 dtrace_mstate_t *, dtrace_vstate_t *);
661 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
662 dtrace_mstate_t *, dtrace_vstate_t *);
663
664
665 /*
666 * DTrace sysctl handlers
667 *
668 * These declarations and functions are used for a deeper DTrace configuration.
669 * Most of them are not per-consumer basis and may impact the other DTrace
670 * consumers. Correctness may not be supported for all the variables, so you
671 * should be careful about what values you are using.
672 */
673
674 SYSCTL_DECL(_kern_dtrace);
675 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
676
677 static int
678 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
679 {
680 #pragma unused(oidp, arg2)
681 int changed, error;
682 int value = *(int *) arg1;
683
684 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
685 if (error || !changed)
686 return (error);
687
688 if (value != 0 && value != 1)
689 return (ERANGE);
690
691 lck_mtx_lock(&dtrace_lock);
692 dtrace_err_verbose = value;
693 lck_mtx_unlock(&dtrace_lock);
694
695 return (0);
696 }
697
/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurred (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
704 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
705 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
706 &dtrace_err_verbose, 0,
707 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
708
709 static int
710 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
711 {
712 #pragma unused(oidp, arg2, req)
713 int changed, error;
714 uint64_t value = *(uint64_t *) arg1;
715
716 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
717 if (error || !changed)
718 return (error);
719
720 if (value <= dtrace_buffer_memory_inuse)
721 return (ERANGE);
722
723 lck_mtx_lock(&dtrace_lock);
724 dtrace_buffer_memory_maxsize = value;
725 lck_mtx_unlock(&dtrace_lock);
726
727 return (0);
728 }
729
730 /*
731 * kern.dtrace.buffer_memory_maxsize
732 *
733 * Set DTrace maximal size in bytes used by all the consumers' state buffers. By default
734 * the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set a null, a negative value
735 * or a value <= to dtrace_buffer_memory_inuse will result in a failure.
736 */
737 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
738 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
739 &dtrace_buffer_memory_maxsize, 0,
740 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
741
742 /*
743 * kern.dtrace.buffer_memory_inuse
744 *
745 * Current state buffer memory used, in bytes, by all the DTrace consumers.
746 * This value is read-only.
747 */
748 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
749 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
750
751 static int
752 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
753 {
754 #pragma unused(oidp, arg2, req)
755 int changed, error;
756 size_t value = *(size_t*) arg1;
757
758 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
759 if (error || !changed)
760 return (error);
761
762 if (value <= 0)
763 return (ERANGE);
764
765 lck_mtx_lock(&dtrace_lock);
766 dtrace_difo_maxsize = value;
767 lck_mtx_unlock(&dtrace_lock);
768
769 return (0);
770 }
771
772 /*
773 * kern.dtrace.difo_maxsize
774 *
775 * Set the DIFO max size in bytes, check the definition of dtrace_difo_maxsize
776 * to get the default value. Attempting to set a null or negative size will
777 * result in a failure.
778 */
779 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
780 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
781 &dtrace_difo_maxsize, 0,
782 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
783
784 static int
785 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
786 {
787 #pragma unused(oidp, arg2, req)
788 int changed, error;
789 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
790
791 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
792 if (error || !changed)
793 return (error);
794
795 if (value <= 0)
796 return (ERANGE);
797
798 if (value >= dtrace_copy_maxsize())
799 return (ERANGE);
800
801 lck_mtx_lock(&dtrace_lock);
802 dtrace_dof_maxsize = value;
803 lck_mtx_unlock(&dtrace_lock);
804
805 return (0);
806 }
807
/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF max size in bytes, check the definition of dtrace_dof_maxsize to
 * get the default value. Attempting to set a null or negative size, or a size
 * greater than or equal to dtrace_copy_maxsize(), will result in a failure.
 */
815 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
816 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
817 &dtrace_dof_maxsize, 0,
818 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
819
820 static int
821 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
822 {
823 #pragma unused(oidp, arg2, req)
824 int changed, error;
825 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
826
827 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
828 if (error || !changed)
829 return (error);
830
831 if (value <= 0)
832 return (ERANGE);
833 if (value > dtrace_statvar_maxsize_max)
834 return (ERANGE);
835
836 lck_mtx_lock(&dtrace_lock);
837 dtrace_statvar_maxsize = value;
838 lck_mtx_unlock(&dtrace_lock);
839
840 return (0);
841 }
842
/*
 * kern.dtrace.global_maxsize
 *
 * Set the variable max size in bytes; check the definition of
 * dtrace_statvar_maxsize to get the default value.  The handler,
 * sysctl_dtrace_statvar_maxsize(), fails with ERANGE when asked to set a
 * zero or negative size, or a size above dtrace_statvar_maxsize_max.
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
	CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
	&dtrace_statvar_maxsize, 0,
	sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
854
855
/*
 * kern.dtrace.provide_private_probes
 *
 * Reports whether the providers must provide the private probes.  Private
 * probes are always provided, so this read-only sysctl is kept purely for
 * compatibility.
 * NOTE(review): no backing variable is passed (NULL pointer) and the
 * immediate value is 1 -- presumably the sysctl always reads as 1; confirm
 * against the SYSCTL_INT definition in use.
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
	CTLFLAG_RD | CTLFLAG_LOCKED,
	(int *)NULL, 1, "provider must provide the private probes");
865
/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode, as stored in dtrace_dof_mode.
 * This value is read-only (the OID is registered with CTLFLAG_RD).
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
	&dtrace_dof_mode, 0, "dtrace dof mode");
874
875 /*
876 * DTrace Probe Context Functions
877 *
878 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
880 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
881 * As a result, functions called from probe context may only call other DTrace
882 * support functions -- they may not interact at all with the system at large.
883 * (Note that the ASSERT macro is made probe-context safe by redefining it in
884 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
885 * loads are to be performed from probe context, they _must_ be in terms of
886 * the safe dtrace_load*() variants.
887 *
888 * Some functions in this block are not actually called from probe context;
889 * for these functions, there will be a comment above the function reading
890 * "Note: not called from probe context."
891 */
892
/*
 * Assertion-failure handler: panics with the text of the failed assertion
 * (a), the file name (f) and the line number (l).
 */
int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 * NOTE(review): presumably never reached because panic() does not
	 * return -- confirm.
	 */
	return (a[(uintptr_t)f]);
}
904
/*
 * Atomically add one to the given error counter; usable from probe context.
 *
 * Most probe-context counters are per-CPU, but a few rare error conditions
 * share a single counter.  Concurrent increments from several CPUs will hurt
 * scalability, which is acceptable because a correctly constructed enabling
 * is not expected to hit these counters often.
 */
static void
dtrace_error(uint32_t *counter)
{
	uint32_t seen, bumped;

	for (;;) {
		seen = *counter;
		bumped = seen + 1;

		/*
		 * A non-zero counter means "errors were seen", so a wrap
		 * restarts at 1 rather than 0.  The counter is only 32 bits
		 * wide because a 64-bit compare-and-swap is not guaranteed
		 * to be available.  Wrapping is expected to be about as
		 * unlikely as the overflow of predicate cache IDs discussed
		 * in dtrace_predicate_create().
		 */
		if (bumped == 0)
			bumped = 1;

		/* Retry if another CPU changed the counter underneath us. */
		if (dtrace_cas32(counter, seen, bumped) == seen)
			break;
	}
}
943
/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.  The 8-bit variant,
 * dtrace_load8(), is the safe load used by the string and copy helpers
 * later in this file.
 * NOTE(review): DTRACE_LOADFUNC is defined elsewhere; the other generated
 * names are presumably dtrace_load16/32/64 -- confirm.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
952
953 static int
dtrace_inscratch(uintptr_t dest,size_t size,dtrace_mstate_t * mstate)954 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
955 {
956 if (dest < mstate->dtms_scratch_base)
957 return (0);
958
959 if (dest + size < dest)
960 return (0);
961
962 if (dest + size > mstate->dtms_scratch_ptr)
963 return (0);
964
965 return (1);
966 }
967
968 static int
dtrace_canstore_statvar(uint64_t addr,size_t sz,size_t * remain,dtrace_statvar_t ** svars,int nsvars)969 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
970 dtrace_statvar_t **svars, int nsvars)
971 {
972 int i;
973
974 size_t maxglobalsize, maxlocalsize;
975
976 maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
977 maxlocalsize = (maxglobalsize) * NCPU;
978
979 if (nsvars == 0)
980 return (0);
981
982 for (i = 0; i < nsvars; i++) {
983 dtrace_statvar_t *svar = svars[i];
984 uint8_t scope;
985 size_t size;
986
987 if (svar == NULL || (size = svar->dtsv_size) == 0)
988 continue;
989
990 scope = svar->dtsv_var.dtdv_scope;
991
992 /**
993 * We verify that our size is valid in the spirit of providing
994 * defense in depth: we want to prevent attackers from using
995 * DTrace to escalate an orthogonal kernel heap corruption bug
996 * into the ability to store to arbitrary locations in memory.
997 */
998 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
999 (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
1000
1001 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1002 DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1003 svar->dtsv_size);
1004 return (1);
1005 }
1006 }
1007
1008 return (0);
1009 }
1010
/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 *
 * This is dtrace_canstore_remains() with a NULL 'remain' pointer, i.e. for
 * callers that do not need the remaining length; its return value is passed
 * straight through.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}
1023 /*
1024 * Implementation of dtrace_canstore which communicates the upper bound of the
1025 * allowed memory region.
1026 */
1027 static int
dtrace_canstore_remains(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1028 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1029 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1030 {
1031 /*
1032 * First, check to see if the address is in allocated scratch space...
1033 */
1034 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1035 mstate->dtms_scratch_ptr - mstate->dtms_scratch_base)) {
1036 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1037 mstate->dtms_scratch_ptr - mstate->dtms_scratch_base);
1038 return (1);
1039 }
1040 /*
1041 * Now check to see if it's a dynamic variable. This check will pick
1042 * up both thread-local variables and any global dynamically-allocated
1043 * variables.
1044 */
1045 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1046 vstate->dtvs_dynvars.dtds_size)) {
1047 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1048 uintptr_t base = (uintptr_t)dstate->dtds_base +
1049 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1050 uintptr_t chunkoffs;
1051 dtrace_dynvar_t *dvar;
1052
1053 /*
1054 * Before we assume that we can store here, we need to make
1055 * sure that it isn't in our metadata -- storing to our
1056 * dynamic variable metadata would corrupt our state. For
1057 * the range to not include any dynamic variable metadata,
1058 * it must:
1059 *
1060 * (1) Start above the hash table that is at the base of
1061 * the dynamic variable space
1062 *
1063 * (2) Have a starting chunk offset that is beyond the
1064 * dtrace_dynvar_t that is at the base of every chunk
1065 *
1066 * (3) Not span a chunk boundary
1067 *
1068 * (4) Not be in the tuple space of a dynamic variable
1069 *
1070 */
1071 if (addr < base)
1072 return (0);
1073
1074 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1075
1076 if (chunkoffs < sizeof (dtrace_dynvar_t))
1077 return (0);
1078
1079 if (chunkoffs + sz > dstate->dtds_chunksize)
1080 return (0);
1081
1082 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1083
1084 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1085 return (0);
1086
1087 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1088 ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1089 return (0);
1090
1091 return (1);
1092 }
1093
1094 /*
1095 * Finally, check the static local and global variables. These checks
1096 * take the longest, so we perform them last.
1097 */
1098 if (dtrace_canstore_statvar(addr, sz, remain,
1099 vstate->dtvs_locals, vstate->dtvs_nlocals))
1100 return (1);
1101
1102 if (dtrace_canstore_statvar(addr, sz, remain,
1103 vstate->dtvs_globals, vstate->dtvs_nglobals))
1104 return (1);
1105
1106 return (0);
1107 }
1108
1109
/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 *
 * This is dtrace_canload_remains() with a NULL 'remain' pointer; its return
 * value is passed straight through.
 */
int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
    dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}
1125
1126 /*
1127 * Implementation of dtrace_canload which communicates the upper bound of the
1128 * allowed memory region.
1129 */
1130 static int
dtrace_canload_remains(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1131 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1132 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1133 {
1134 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1135
1136 /*
1137 * If we hold the privilege to read from kernel memory, then
1138 * everything is readable.
1139 */
1140 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1141 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1142 return (1);
1143 }
1144
1145 /*
1146 * You can obviously read that which you can store.
1147 */
1148 if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1149 return (1);
1150
1151 /*
1152 * We're allowed to read from our own string table.
1153 */
1154 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1155 mstate->dtms_difo->dtdo_strlen)) {
1156 DTRACE_RANGE_REMAIN(remain, addr,
1157 mstate->dtms_difo->dtdo_strtab,
1158 mstate->dtms_difo->dtdo_strlen);
1159 return (1);
1160 }
1161
1162 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1163 *illval = addr;
1164 return (0);
1165 }
1166
1167 /*
1168 * Convenience routine to check to see if a given string is within a memory
1169 * region in which a load may be issued given the user's privilege level;
1170 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1171 * calls in the event that the user has all privileges.
1172 */
1173 static int
dtrace_strcanload(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1174 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1175 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1176 {
1177 size_t rsize = 0;
1178
1179 /*
1180 * If we hold the privilege to read from kernel memory, then
1181 * everything is readable.
1182 */
1183 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1184 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1185 return (1);
1186 }
1187
1188 /*
1189 * Even if the caller is uninterested in querying the remaining valid
1190 * range, it is required to ensure that the access is allowed.
1191 */
1192 if (remain == NULL) {
1193 remain = &rsize;
1194 }
1195 if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1196 size_t strsz;
1197 /*
1198 * Perform the strlen after determining the length of the
1199 * memory region which is accessible. This prevents timing
1200 * information from being used to find NULs in memory which is
1201 * not accessible to the caller.
1202 */
1203 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1204 MIN(sz, *remain));
1205 if (strsz <= *remain) {
1206 return (1);
1207 }
1208 }
1209
1210 return (0);
1211 }
1212
1213 /*
1214 * Convenience routine to check to see if a given variable is within a memory
1215 * region in which a load may be issued given the user's privilege level.
1216 */
1217 static int
dtrace_vcanload(void * src,dtrace_diftype_t * type,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1218 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1219 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1220 {
1221 size_t sz;
1222 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1223
1224 /*
1225 * Calculate the max size before performing any checks since even
1226 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1227 * return the max length via 'remain'.
1228 */
1229 if (type->dtdt_kind == DIF_TYPE_STRING) {
1230 dtrace_state_t *state = vstate->dtvs_state;
1231
1232 if (state != NULL) {
1233 sz = state->dts_options[DTRACEOPT_STRSIZE];
1234 } else {
1235 /*
1236 * In helper context, we have a NULL state; fall back
1237 * to using the system-wide default for the string size
1238 * in this case.
1239 */
1240 sz = dtrace_strsize_default;
1241 }
1242 } else {
1243 sz = type->dtdt_size;
1244 }
1245
1246 /*
1247 * If we hold the privilege to read from kernel memory, then
1248 * everything is readable.
1249 */
1250 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1251 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1252 return (1);
1253 }
1254
1255 if (type->dtdt_kind == DIF_TYPE_STRING) {
1256 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1257 vstate));
1258 }
1259 return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1260 vstate));
1261 }
1262
/*
 * Minimal ASCII character-classification helpers.  Every macro here may
 * evaluate its argument more than once, so arguments should be free of
 * side effects.
 */
#define	isdigit(ch)	('0' <= (ch) && (ch) <= '9')
#define	islower(ch)	('a' <= (ch) && (ch) <= 'z')
#define	isspace(ch)	((ch) == ' ' || (ch) == '\r' || (ch) == '\n' || \
			(ch) == '\t' || (ch) == '\f')
#define	isxdigit(ch)	(isdigit(ch) || ('a' <= (ch) && (ch) <= 'f') || \
			('A' <= (ch) && (ch) <= 'F'))
#define	lisalnum(x)	\
	(isdigit(x) || ('a' <= (x) && (x) <= 'z') || ('A' <= (x) && (x) <= 'Z'))

/*
 * Numeric value of an alphanumeric character: '0'-'9' map to 0-9, and
 * letters of either case map to 10-35.  The result is only meaningful when
 * lisalnum(x) holds.
 */
#define	DIGIT(x)	\
	(isdigit(x) ? (x) - '0' : (islower(x) ? (x) - 'a' + 10 : (x) - 'A' + 10))
1274
1275 /*
1276 * Convert a string to a signed integer using safe loads.
1277 */
1278 static int64_t
dtrace_strtoll(char * input,int base,size_t limit)1279 dtrace_strtoll(char *input, int base, size_t limit)
1280 {
1281 uintptr_t pos = (uintptr_t)input;
1282 int64_t val = 0;
1283 int x;
1284 boolean_t neg = B_FALSE;
1285 char c, cc, ccc;
1286 uintptr_t end = pos + limit;
1287
1288 /*
1289 * Consume any whitespace preceding digits.
1290 */
1291 while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
1292 pos++;
1293
1294 /*
1295 * Handle an explicit sign if one is present.
1296 */
1297 if (c == '-' || c == '+') {
1298 if (c == '-')
1299 neg = B_TRUE;
1300 c = dtrace_load8(++pos);
1301 }
1302
1303 /*
1304 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
1305 * if present.
1306 */
1307 if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
1308 cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
1309 pos += 2;
1310 c = ccc;
1311 }
1312
1313 /*
1314 * Read in contiguous digits until the first non-digit character.
1315 */
1316 for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
1317 c = dtrace_load8(++pos))
1318 val = val * base + x;
1319
1320 return (neg ? -val : val);
1321 }
1322
1323
1324 /*
1325 * Compare two strings using safe loads.
1326 */
1327 static int
dtrace_strncmp(const char * s1,const char * s2,size_t limit)1328 dtrace_strncmp(const char *s1, const char *s2, size_t limit)
1329 {
1330 uint8_t c1, c2;
1331 volatile uint16_t *flags;
1332
1333 if (s1 == s2 || limit == 0)
1334 return (0);
1335
1336 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1337
1338 do {
1339 if (s1 == NULL) {
1340 c1 = '\0';
1341 } else {
1342 c1 = dtrace_load8((uintptr_t)s1++);
1343 }
1344
1345 if (s2 == NULL) {
1346 c2 = '\0';
1347 } else {
1348 c2 = dtrace_load8((uintptr_t)s2++);
1349 }
1350
1351 if (c1 != c2)
1352 return (c1 - c2);
1353 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1354
1355 return (0);
1356 }
1357
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter specifies a maximum length to ensure completion.
 *
 * Returns the number of bytes preceding the first NUL, or lim if no NUL is
 * found within the first lim bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	/*
	 * The counter must be as wide as lim: with a narrower counter a
	 * limit above its range could never be reached and the returned
	 * length would be truncated.
	 */
	size_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}
1374
1375 /*
1376 * Check if an address falls within a toxic region.
1377 */
1378 static int
dtrace_istoxic(uintptr_t kaddr,size_t size)1379 dtrace_istoxic(uintptr_t kaddr, size_t size)
1380 {
1381 uintptr_t taddr, tsize;
1382 int i;
1383
1384 for (i = 0; i < dtrace_toxranges; i++) {
1385 taddr = dtrace_toxrange[i].dtt_base;
1386 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1387
1388 if (kaddr - taddr < tsize) {
1389 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1390 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1391 return (1);
1392 }
1393
1394 if (taddr - kaddr < size) {
1395 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1396 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1397 return (1);
1398 }
1399 }
1400
1401 return (0);
1402 }
1403
/*
 * Copy len bytes from src to dst.  src is untrusted memory named by the DIF
 * program and is therefore read with safe loads; dst is DTrace-managed
 * memory and is written directly.  As with standard bcopy, overlapping
 * regions are handled by choosing the copy direction.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	uint8_t *to = dst;
	const uint8_t *from = src;

	if (len == 0)
		return;

	if (to <= from) {
		/* Destination is not above the source: copy forwards. */
		for (; len != 0; len--)
			*to++ = dtrace_load8((uintptr_t)from++);
		return;
	}

	/* Destination is above the source: copy backwards from the end. */
	to += len;
	from += len;

	for (; len != 0; len--)
		*--to = dtrace_load8((uintptr_t)--from);
}
1431
/*
 * Copy from src to dst until len bytes have been copied or a NUL byte has
 * been copied, whichever comes first.  src is untrusted memory named by the
 * DIF program and is read with safe loads; dst is DTrace-managed memory and
 * is written directly.  Unlike dtrace_bcopy(), overlapping regions are not
 * handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	uint8_t *out = dst;
	const uint8_t *in = src;
	uint8_t ch;

	for (; len != 0; len--) {
		ch = dtrace_load8((uintptr_t)in++);
		*out++ = ch;

		/* The terminator itself is copied before stopping. */
		if (ch == '\0')
			break;
	}
}
1451
1452 /*
1453 * Copy src to dst, deriving the size and type from the specified (BYREF)
1454 * variable type. The src is assumed to be unsafe memory specified by the DIF
1455 * program. The dst is assumed to be DTrace variable memory that is of the
1456 * specified type; we assume that we can store to directly.
1457 */
1458 static void
dtrace_vcopy(void * src,void * dst,dtrace_diftype_t * type,size_t limit)1459 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1460 {
1461 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1462
1463 if (type->dtdt_kind == DIF_TYPE_STRING) {
1464 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1465 } else {
1466 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1467 }
1468 }
1469
1470 /*
1471 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1472 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1473 * safe memory that we can access directly because it is managed by DTrace.
1474 */
1475 static int
dtrace_bcmp(const void * s1,const void * s2,size_t len)1476 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1477 {
1478 volatile uint16_t *flags;
1479
1480 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1481
1482 if (s1 == s2)
1483 return (0);
1484
1485 if (s1 == NULL || s2 == NULL)
1486 return (1);
1487
1488 if (s1 != s2 && len != 0) {
1489 const uint8_t *ps1 = s1;
1490 const uint8_t *ps2 = s2;
1491
1492 do {
1493 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1494 return (1);
1495 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1496 }
1497 return (0);
1498 }
1499
/*
 * Clear len bytes starting at dst, one byte at a time.  Only for memory
 * that is managed by DTrace and therefore safe to store to directly.
 */
static void
dtrace_bzero(void *dst, size_t len)
{
	uint8_t *p = dst;

	while (len != 0) {
		*p++ = 0;
		len--;
	}
}
1512
/*
 * Add two 128-bit values, each held as two 64-bit words with word 0 the low
 * half and word 1 the high half, and store the result in sum.  The result
 * is fully computed before sum is written, so sum may alias either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	const uint64_t lo = addend1[0] + addend2[0];
	/* The low-word sum wrapped if it is smaller than either input. */
	const uint64_t carry = (lo < addend1[0] || lo < addend2[0]) ? 1 : 0;
	const uint64_t hi = addend1[1] + addend2[1] + carry;

	sum[0] = lo;
	sum[1] = hi;
}
1525
/*
 * Shift the 128-bit value in a by b bits, in place.  a[0] is the low 64-bit
 * word and a[1] the high one.  If b is positive, shift left; if b is
 * negative, shift right (logically, filling with zeros).  A shift of 128
 * bits or more in either direction yields zero.
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	unsigned int n;

	if (b == 0)
		return;

	/* Magnitude of the shift, computed without negating INT_MIN. */
	n = (b < 0) ? 0U - (unsigned int)b : (unsigned int)b;

	if (n >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		if (n >= 64) {
			a[0] = a[1] >> (n - 64);
			a[1] = 0;
		} else {
			/*
			 * The low n bits of the high word become the top n
			 * bits of the low word.  Here 1 <= n <= 63, so both
			 * shift counts are in range.
			 */
			a[0] = (a[0] >> n) | (a[1] << (64 - n));
			a[1] >>= n;
		}
	} else {
		if (n >= 64) {
			a[1] = a[0] << (n - 64);
			a[0] = 0;
		} else {
			/*
			 * The top n bits of the low word become the low n
			 * bits of the high word.
			 */
			a[1] = (a[1] << n) | (a[0] >> (64 - n));
			a[0] <<= n;
		}
	}
}
1562
1563 /*
1564 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1565 * use native multiplication on those, and then re-combine into the
1566 * resulting 128-bit value.
1567 *
1568 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1569 * hi1 * hi2 << 64 +
1570 * hi1 * lo2 << 32 +
1571 * hi2 * lo1 << 32 +
1572 * lo1 * lo2
1573 */
1574 static void
dtrace_multiply_128(uint64_t factor1,uint64_t factor2,uint64_t * product)1575 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1576 {
1577 uint64_t hi1, hi2, lo1, lo2;
1578 uint64_t tmp[2];
1579
1580 hi1 = factor1 >> 32;
1581 hi2 = factor2 >> 32;
1582
1583 lo1 = factor1 & DT_MASK_LO;
1584 lo2 = factor2 & DT_MASK_LO;
1585
1586 product[0] = lo1 * lo2;
1587 product[1] = hi1 * hi2;
1588
1589 tmp[0] = hi1 * lo2;
1590 tmp[1] = 0;
1591 dtrace_shift_128(tmp, 32);
1592 dtrace_add_128(product, tmp, product);
1593
1594 tmp[0] = hi2 * lo1;
1595 tmp[1] = 0;
1596 dtrace_shift_128(tmp, 32);
1597 dtrace_add_128(product, tmp, product);
1598 }
1599
1600 /*
1601 * This privilege check should be used by actions and subroutines to
1602 * verify that the user credentials of the process that enabled the
1603 * invoking ECB match the target credentials
1604 */
1605 static int
dtrace_priv_proc_common_user(dtrace_state_t * state)1606 dtrace_priv_proc_common_user(dtrace_state_t *state)
1607 {
1608 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1609
1610 /*
1611 * We should always have a non-NULL state cred here, since if cred
1612 * is null (anonymous tracing), we fast-path bypass this routine.
1613 */
1614 ASSERT(s_cr != NULL);
1615
1616 if ((cr = dtrace_CRED()) != NULL &&
1617 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1618 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1619 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1620 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1621 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1622 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1623 return (1);
1624
1625 return (0);
1626 }
1627
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 *
 * In this implementation the check always succeeds (returns 1); only the
 * assertion on the state credential is performed.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
#pragma unused(cr, s_cr, state) /* __APPLE__ */

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	return 1; /* APPLE NOTE: Darwin doesn't do zones. */
}
1647
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 *
 * In this implementation the check always succeeds (returns 1).
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	return 1; /* Darwin omits "No Core Dump" flag. */
}
1657
1658 static int
dtrace_priv_proc_destructive(dtrace_state_t * state)1659 dtrace_priv_proc_destructive(dtrace_state_t *state)
1660 {
1661 int action = state->dts_cred.dcr_action;
1662
1663 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1664 goto bad;
1665
1666 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1667 goto bad;
1668
1669 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1670 dtrace_priv_proc_common_zone(state) == 0)
1671 goto bad;
1672
1673 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1674 dtrace_priv_proc_common_user(state) == 0)
1675 goto bad;
1676
1677 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1678 dtrace_priv_proc_common_nocd() == 0)
1679 goto bad;
1680
1681 return (1);
1682
1683 bad:
1684 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1685
1686 return (0);
1687 }
1688
1689 static int
dtrace_priv_proc_control(dtrace_state_t * state)1690 dtrace_priv_proc_control(dtrace_state_t *state)
1691 {
1692 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1693 goto bad;
1694
1695 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1696 goto bad;
1697
1698 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1699 return (1);
1700
1701 if (dtrace_priv_proc_common_zone(state) &&
1702 dtrace_priv_proc_common_user(state) &&
1703 dtrace_priv_proc_common_nocd())
1704 return (1);
1705
1706 bad:
1707 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1708
1709 return (0);
1710 }
1711
1712 static int
dtrace_priv_proc(dtrace_state_t * state)1713 dtrace_priv_proc(dtrace_state_t *state)
1714 {
1715 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1716 goto bad;
1717
1718 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1719 goto bad;
1720
1721 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1722 return (1);
1723
1724 bad:
1725 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1726
1727 return (0);
1728 }
1729
1730 /*
1731 * The P_LNOATTACH check is an Apple specific check.
1732 * We need a version of dtrace_priv_proc() that omits
1733 * that check for PID and EXECNAME accesses
1734 */
1735 static int
dtrace_priv_proc_relaxed(dtrace_state_t * state)1736 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1737 {
1738
1739 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1740 return (1);
1741
1742 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1743
1744 return (0);
1745 }
1746
1747 static int
dtrace_priv_kernel(dtrace_state_t * state)1748 dtrace_priv_kernel(dtrace_state_t *state)
1749 {
1750 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1751 goto bad;
1752
1753 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1754 return (1);
1755
1756 bad:
1757 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1758
1759 return (0);
1760 }
1761
1762 static int
dtrace_priv_kernel_destructive(dtrace_state_t * state)1763 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1764 {
1765 if (dtrace_is_restricted())
1766 goto bad;
1767
1768 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1769 return (1);
1770
1771 bad:
1772 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1773
1774 return (0);
1775 }
1776
1777 /*
1778 * Note: not called from probe context. This function is called
1779 * asynchronously (and at a regular interval) from outside of probe context to
1780 * clean the dirty dynamic variable lists on all CPUs. Dynamic variable
1781 * cleaning is explained in detail in <sys/dtrace_impl.h>.
1782 */
1783 static void
dtrace_dynvar_clean(dtrace_dstate_t * dstate)1784 dtrace_dynvar_clean(dtrace_dstate_t *dstate)
1785 {
1786 dtrace_dynvar_t *dirty;
1787 int work = 0;
1788
1789 zpercpu_foreach(dcpu, dstate->dtds_percpu) {
1790 ASSERT(dcpu->dtdsc_rinsing == NULL);
1791
1792 /*
1793 * If the dirty list is NULL, there is no dirty work to do.
1794 */
1795 if (dcpu->dtdsc_dirty == NULL)
1796 continue;
1797
1798 /*
1799 * If the clean list is non-NULL, then we're not going to do
1800 * any work for this CPU -- it means that there has not been
1801 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
1802 * since the last time we cleaned house.
1803 */
1804 if (dcpu->dtdsc_clean != NULL)
1805 continue;
1806
1807 work = 1;
1808
1809 /*
1810 * Atomically move the dirty list aside.
1811 */
1812 do {
1813 dirty = dcpu->dtdsc_dirty;
1814
1815 /*
1816 * Before we zap the dirty list, set the rinsing list.
1817 * (This allows for a potential assertion in
1818 * dtrace_dynvar(): if a free dynamic variable appears
1819 * on a hash chain, either the dirty list or the
1820 * rinsing list for some CPU must be non-NULL.)
1821 */
1822 dcpu->dtdsc_rinsing = dirty;
1823 dtrace_membar_producer();
1824 } while (dtrace_casptr(&dcpu->dtdsc_dirty,
1825 dirty, NULL) != dirty);
1826 }
1827
1828 if (!work) {
1829 /*
1830 * We have no work to do; we can simply return.
1831 */
1832 return;
1833 }
1834
1835 dtrace_sync();
1836
1837 zpercpu_foreach(dcpu, dstate->dtds_percpu) {
1838 if (dcpu->dtdsc_rinsing == NULL)
1839 continue;
1840
1841 /*
1842 * We are now guaranteed that no hash chain contains a pointer
1843 * into this dirty list; we can make it clean.
1844 */
1845 ASSERT(dcpu->dtdsc_clean == NULL);
1846 dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
1847 dcpu->dtdsc_rinsing = NULL;
1848 }
1849
1850 /*
1851 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
1852 * sure that all CPUs have seen all of the dtdsc_clean pointers.
1853 * This prevents a race whereby a CPU incorrectly decides that
1854 * the state should be something other than DTRACE_DSTATE_CLEAN
1855 * after dtrace_dynvar_clean() has completed.
1856 */
1857 dtrace_sync();
1858
1859 dstate->dtds_state = DTRACE_DSTATE_CLEAN;
1860 }
1861
1862 /*
1863 * Depending on the value of the op parameter, this function looks-up,
1864 * allocates or deallocates an arbitrarily-keyed dynamic variable. If an
1865 * allocation is requested, this function will return a pointer to a
1866 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
1867 * variable can be allocated. If NULL is returned, the appropriate counter
1868 * will be incremented.
1869 */
1870 static dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t * dstate,uint_t nkeys,dtrace_key_t * key,size_t dsize,dtrace_dynvar_op_t op,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1871 dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
1872 dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
1873 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1874 {
1875 uint64_t hashval = DTRACE_DYNHASH_VALID;
1876 dtrace_dynhash_t *hash = dstate->dtds_hash;
1877 dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
1878 processorid_t me = CPU->cpu_id, cpu = me;
1879 dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, me);
1880 size_t bucket, ksize;
1881 size_t chunksize = dstate->dtds_chunksize;
1882 uintptr_t kdata, lock, nstate;
1883 uint_t i;
1884
1885 ASSERT(nkeys != 0);
1886
1887 /*
1888 * Hash the key. As with aggregations, we use Jenkins' "One-at-a-time"
1889 * algorithm. For the by-value portions, we perform the algorithm in
1890 * 16-bit chunks (as opposed to 8-bit chunks). This speeds things up a
1891 * bit, and seems to have only a minute effect on distribution. For
1892 * the by-reference data, we perform "One-at-a-time" iterating (safely)
1893 * over each referenced byte. It's painful to do this, but it's much
1894 * better than pathological hash distribution. The efficacy of the
1895 * hashing algorithm (and a comparison with other algorithms) may be
1896 * found by running the ::dtrace_dynstat MDB dcmd.
1897 */
1898 for (i = 0; i < nkeys; i++) {
1899 if (key[i].dttk_size == 0) {
1900 uint64_t val = key[i].dttk_value;
1901
1902 hashval += (val >> 48) & 0xffff;
1903 hashval += (hashval << 10);
1904 hashval ^= (hashval >> 6);
1905
1906 hashval += (val >> 32) & 0xffff;
1907 hashval += (hashval << 10);
1908 hashval ^= (hashval >> 6);
1909
1910 hashval += (val >> 16) & 0xffff;
1911 hashval += (hashval << 10);
1912 hashval ^= (hashval >> 6);
1913
1914 hashval += val & 0xffff;
1915 hashval += (hashval << 10);
1916 hashval ^= (hashval >> 6);
1917 } else {
1918 /*
1919 * This is incredibly painful, but it beats the hell
1920 * out of the alternative.
1921 */
1922 uint64_t j, size = key[i].dttk_size;
1923 uintptr_t base = (uintptr_t)key[i].dttk_value;
1924
1925 if (!dtrace_canload(base, size, mstate, vstate))
1926 break;
1927
1928 for (j = 0; j < size; j++) {
1929 hashval += dtrace_load8(base + j);
1930 hashval += (hashval << 10);
1931 hashval ^= (hashval >> 6);
1932 }
1933 }
1934 }
1935
1936 if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
1937 return (NULL);
1938
1939 hashval += (hashval << 3);
1940 hashval ^= (hashval >> 11);
1941 hashval += (hashval << 15);
1942
1943 /*
1944 * There is a remote chance (ideally, 1 in 2^31) that our hashval
1945 * comes out to be one of our two sentinel hash values. If this
1946 * actually happens, we set the hashval to be a value known to be a
1947 * non-sentinel value.
1948 */
1949 if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
1950 hashval = DTRACE_DYNHASH_VALID;
1951
1952 /*
1953 * Yes, it's painful to do a divide here. If the cycle count becomes
1954 * important here, tricks can be pulled to reduce it. (However, it's
1955 * critical that hash collisions be kept to an absolute minimum;
1956 * they're much more painful than a divide.) It's better to have a
1957 * solution that generates few collisions and still keeps things
1958 * relatively simple.
1959 */
1960 bucket = hashval % dstate->dtds_hashsize;
1961
1962 if (op == DTRACE_DYNVAR_DEALLOC) {
1963 volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;
1964
1965 for (;;) {
1966 while ((lock = *lockp) & 1)
1967 continue;
1968
1969 if (dtrace_casptr((void *)(uintptr_t)lockp,
1970 (void *)lock, (void *)(lock + 1)) == (void *)lock)
1971 break;
1972 }
1973
1974 dtrace_membar_producer();
1975 }
1976
1977 top:
1978 prev = NULL;
1979 lock = hash[bucket].dtdh_lock;
1980
1981 dtrace_membar_consumer();
1982
1983 start = hash[bucket].dtdh_chain;
1984 ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
1985 start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
1986 op != DTRACE_DYNVAR_DEALLOC));
1987
1988 for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
1989 dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
1990 dtrace_key_t *dkey = &dtuple->dtt_key[0];
1991
1992 if (dvar->dtdv_hashval != hashval) {
1993 if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
1994 /*
1995 * We've reached the sink, and therefore the
1996 * end of the hash chain; we can kick out of
1997 * the loop knowing that we have seen a valid
1998 * snapshot of state.
1999 */
2000 ASSERT(dvar->dtdv_next == NULL);
2001 ASSERT(dvar == &dtrace_dynhash_sink);
2002 break;
2003 }
2004
2005 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
2006 /*
2007 * We've gone off the rails: somewhere along
2008 * the line, one of the members of this hash
2009 * chain was deleted. Note that we could also
2010 * detect this by simply letting this loop run
2011 * to completion, as we would eventually hit
2012 * the end of the dirty list. However, we
2013 * want to avoid running the length of the
2014 * dirty list unnecessarily (it might be quite
2015 * long), so we catch this as early as
2016 * possible by detecting the hash marker. In
2017 * this case, we simply set dvar to NULL and
2018 * break; the conditional after the loop will
2019 * send us back to top.
2020 */
2021 dvar = NULL;
2022 break;
2023 }
2024
2025 goto next;
2026 }
2027
2028 if (dtuple->dtt_nkeys != nkeys)
2029 goto next;
2030
2031 for (i = 0; i < nkeys; i++, dkey++) {
2032 if (dkey->dttk_size != key[i].dttk_size)
2033 goto next; /* size or type mismatch */
2034
2035 if (dkey->dttk_size != 0) {
2036 if (dtrace_bcmp(
2037 (void *)(uintptr_t)key[i].dttk_value,
2038 (void *)(uintptr_t)dkey->dttk_value,
2039 dkey->dttk_size))
2040 goto next;
2041 } else {
2042 if (dkey->dttk_value != key[i].dttk_value)
2043 goto next;
2044 }
2045 }
2046
2047 if (op != DTRACE_DYNVAR_DEALLOC)
2048 return (dvar);
2049
2050 ASSERT(dvar->dtdv_next == NULL ||
2051 dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);
2052
2053 if (prev != NULL) {
2054 ASSERT(hash[bucket].dtdh_chain != dvar);
2055 ASSERT(start != dvar);
2056 ASSERT(prev->dtdv_next == dvar);
2057 prev->dtdv_next = dvar->dtdv_next;
2058 } else {
2059 if (dtrace_casptr(&hash[bucket].dtdh_chain,
2060 start, dvar->dtdv_next) != start) {
2061 /*
2062 * We have failed to atomically swing the
2063 * hash table head pointer, presumably because
2064 * of a conflicting allocation on another CPU.
2065 * We need to reread the hash chain and try
2066 * again.
2067 */
2068 goto top;
2069 }
2070 }
2071
2072 dtrace_membar_producer();
2073
2074 /*
2075 * Now set the hash value to indicate that it's free.
2076 */
2077 ASSERT(hash[bucket].dtdh_chain != dvar);
2078 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2079
2080 dtrace_membar_producer();
2081
2082 /*
2083 * Set the next pointer to point at the dirty list, and
2084 * atomically swing the dirty pointer to the newly freed dvar.
2085 */
2086 do {
2087 next = dcpu->dtdsc_dirty;
2088 dvar->dtdv_next = next;
2089 } while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);
2090
2091 /*
2092 * Finally, unlock this hash bucket.
2093 */
2094 ASSERT(hash[bucket].dtdh_lock == lock);
2095 ASSERT(lock & 1);
2096 hash[bucket].dtdh_lock++;
2097
2098 return (NULL);
2099 next:
2100 prev = dvar;
2101 continue;
2102 }
2103
2104 if (dvar == NULL) {
2105 /*
2106 * If dvar is NULL, it is because we went off the rails:
2107 * one of the elements that we traversed in the hash chain
2108 * was deleted while we were traversing it. In this case,
2109 * we assert that we aren't doing a dealloc (deallocs lock
2110 * the hash bucket to prevent themselves from racing with
2111 * one another), and retry the hash chain traversal.
2112 */
2113 ASSERT(op != DTRACE_DYNVAR_DEALLOC);
2114 goto top;
2115 }
2116
2117 if (op != DTRACE_DYNVAR_ALLOC) {
2118 /*
2119 * If we are not to allocate a new variable, we want to
2120 * return NULL now. Before we return, check that the value
2121 * of the lock word hasn't changed. If it has, we may have
2122 * seen an inconsistent snapshot.
2123 */
2124 if (op == DTRACE_DYNVAR_NOALLOC) {
2125 if (hash[bucket].dtdh_lock != lock)
2126 goto top;
2127 } else {
2128 ASSERT(op == DTRACE_DYNVAR_DEALLOC);
2129 ASSERT(hash[bucket].dtdh_lock == lock);
2130 ASSERT(lock & 1);
2131 hash[bucket].dtdh_lock++;
2132 }
2133
2134 return (NULL);
2135 }
2136
2137 /*
2138 * We need to allocate a new dynamic variable. The size we need is the
2139 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
2140 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
2141 * the size of any referred-to data (dsize). We then round the final
2142 * size up to the chunksize for allocation.
2143 */
2144 for (ksize = 0, i = 0; i < nkeys; i++)
2145 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
2146
2147 /*
2148 * This should be pretty much impossible, but could happen if, say,
2149 * strange DIF specified the tuple. Ideally, this should be an
2150 * assertion and not an error condition -- but that requires that the
2151 * chunksize calculation in dtrace_difo_chunksize() be absolutely
2152 * bullet-proof. (That is, it must not be able to be fooled by
2153 * malicious DIF.) Given the lack of backwards branches in DIF,
2154 * solving this would presumably not amount to solving the Halting
2155 * Problem -- but it still seems awfully hard.
2156 */
2157 if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
2158 ksize + dsize > chunksize) {
2159 dcpu->dtdsc_drops++;
2160 return (NULL);
2161 }
2162
2163 nstate = DTRACE_DSTATE_EMPTY;
2164
2165 do {
2166 retry:
2167 free = dcpu->dtdsc_free;
2168
2169 if (free == NULL) {
2170 dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
2171 void *rval;
2172
2173 if (clean == NULL) {
2174 /*
2175 * We're out of dynamic variable space on
2176 * this CPU. Unless we have tried all CPUs,
2177 * we'll try to allocate from a different
2178 * CPU.
2179 */
2180 switch (dstate->dtds_state) {
2181 case DTRACE_DSTATE_CLEAN: {
2182 void *sp = &dstate->dtds_state;
2183
2184 if (++cpu >= (int)NCPU)
2185 cpu = 0;
2186
2187 if (dcpu->dtdsc_dirty != NULL &&
2188 nstate == DTRACE_DSTATE_EMPTY)
2189 nstate = DTRACE_DSTATE_DIRTY;
2190
2191 if (dcpu->dtdsc_rinsing != NULL)
2192 nstate = DTRACE_DSTATE_RINSING;
2193
2194 dcpu = zpercpu_get_cpu(dstate->dtds_percpu, cpu);
2195
2196 if (cpu != me)
2197 goto retry;
2198
2199 (void) dtrace_cas32(sp,
2200 DTRACE_DSTATE_CLEAN, nstate);
2201
2202 /*
2203 * To increment the correct bean
2204 * counter, take another lap.
2205 */
2206 goto retry;
2207 }
2208
2209 case DTRACE_DSTATE_DIRTY:
2210 dcpu->dtdsc_dirty_drops++;
2211 break;
2212
2213 case DTRACE_DSTATE_RINSING:
2214 dcpu->dtdsc_rinsing_drops++;
2215 break;
2216
2217 case DTRACE_DSTATE_EMPTY:
2218 dcpu->dtdsc_drops++;
2219 break;
2220 }
2221
2222 DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
2223 return (NULL);
2224 }
2225
2226 /*
2227 * The clean list appears to be non-empty. We want to
2228 * move the clean list to the free list; we start by
2229 * moving the clean pointer aside.
2230 */
2231 if (dtrace_casptr(&dcpu->dtdsc_clean,
2232 clean, NULL) != clean) {
2233 /*
2234 * We are in one of two situations:
2235 *
2236 * (a) The clean list was switched to the
2237 * free list by another CPU.
2238 *
2239 * (b) The clean list was added to by the
2240 * cleansing cyclic.
2241 *
2242 * In either of these situations, we can
2243 * just reattempt the free list allocation.
2244 */
2245 goto retry;
2246 }
2247
2248 ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);
2249
2250 /*
2251 * Now we'll move the clean list to the free list.
2252 * It's impossible for this to fail: the only way
2253 * the free list can be updated is through this
2254 * code path, and only one CPU can own the clean list.
2255 * Thus, it would only be possible for this to fail if
2256 * this code were racing with dtrace_dynvar_clean().
2257 * (That is, if dtrace_dynvar_clean() updated the clean
2258 * list, and we ended up racing to update the free
2259 * list.) This race is prevented by the dtrace_sync()
2260 * in dtrace_dynvar_clean() -- which flushes the
2261 * owners of the clean lists out before resetting
2262 * the clean lists.
2263 */
2264 rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
2265 ASSERT(rval == NULL);
2266 goto retry;
2267 }
2268
2269 dvar = free;
2270 new_free = dvar->dtdv_next;
2271 } while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);
2272
2273 /*
2274 * We have now allocated a new chunk. We copy the tuple keys into the
2275 * tuple array and copy any referenced key data into the data space
2276 * following the tuple array. As we do this, we relocate dttk_value
2277 * in the final tuple to point to the key data address in the chunk.
2278 */
2279 kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
2280 dvar->dtdv_data = (void *)(kdata + ksize);
2281 dvar->dtdv_tuple.dtt_nkeys = nkeys;
2282
2283 for (i = 0; i < nkeys; i++) {
2284 dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
2285 size_t kesize = key[i].dttk_size;
2286
2287 if (kesize != 0) {
2288 dtrace_bcopy(
2289 (const void *)(uintptr_t)key[i].dttk_value,
2290 (void *)kdata, kesize);
2291 dkey->dttk_value = kdata;
2292 kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
2293 } else {
2294 dkey->dttk_value = key[i].dttk_value;
2295 }
2296
2297 dkey->dttk_size = kesize;
2298 }
2299
2300 ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
2301 dvar->dtdv_hashval = hashval;
2302 dvar->dtdv_next = start;
2303
2304 if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
2305 return (dvar);
2306
2307 /*
2308 * The cas has failed. Either another CPU is adding an element to
2309 * this hash chain, or another CPU is deleting an element from this
2310 * hash chain. The simplest way to deal with both of these cases
2311 * (though not necessarily the most efficient) is to free our
2312 * allocated block and tail-call ourselves. Note that the free is
2313 * to the dirty list and _not_ to the free list. This is to prevent
2314 * races with allocators, above.
2315 */
2316 dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;
2317
2318 dtrace_membar_producer();
2319
2320 do {
2321 free = dcpu->dtdsc_dirty;
2322 dvar->dtdv_next = free;
2323 } while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);
2324
2325 return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
2326 }
2327
/*
 * min() aggregating action:  replace the stored value with nval when nval,
 * interpreted as a signed 64-bit quantity, is smaller.  The arg parameter
 * is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	int64_t current = (int64_t)*oval;
	int64_t candidate = (int64_t)nval;

	if (candidate < current)
		*oval = nval;
}
2336
/*
 * max() aggregating action:  replace the stored value with nval when nval,
 * interpreted as a signed 64-bit quantity, is larger.  The arg parameter is
 * unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	int64_t current = (int64_t)*oval;
	int64_t candidate = (int64_t)nval;

	if (candidate > current)
		*oval = nval;
}
2345
2346 static void
dtrace_aggregate_quantize(uint64_t * quanta,uint64_t nval,uint64_t incr)2347 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2348 {
2349 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2350 int64_t val = (int64_t)nval;
2351
2352 if (val < 0) {
2353 for (i = 0; i < zero; i++) {
2354 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2355 quanta[i] += incr;
2356 return;
2357 }
2358 }
2359 } else {
2360 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2361 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2362 quanta[i - 1] += incr;
2363 return;
2364 }
2365 }
2366
2367 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2368 return;
2369 }
2370
2371 ASSERT(0);
2372 }
2373
/*
 * lquantize() aggregating action.  The first word of lquanta[] encodes the
 * base, step and number of levels; the counters follow it:  counter 0 takes
 * values below the base (underflow), counters 1..levels take the linear
 * levels, and counter levels + 1 takes everything at or beyond the last
 * level (overflow).  nval is truncated to a signed 32-bit value and incr is
 * added to the counter that covers it.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval;
	int64_t level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	/*
	 * The distance from the base is computed in 64 bits:  with a
	 * negative base and a large value, a 32-bit subtraction overflows
	 * and can yield a negative level, which would pass the range check
	 * below and index in front of the counter array.
	 */
	level = ((int64_t)val - (int64_t)base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}
2406
/*
 * Return the index of the log/linear bucket that holds value, for a
 * quantization with the given factor, low and high orders of magnitude, and
 * nsteps linear steps per order of magnitude.
 *
 * Bucket 0 holds every value below factor^low.  Each order of magnitude from
 * low through high is then divided into equal-width buckets, and the final
 * index returned is the top bucket for values at or beyond
 * factor^(high + 1).
 *
 * Callers are expected to pass factor >= 2.  If factor^(order + 1) cannot be
 * represented in 64 bits, the walk stops at the current order of magnitude:
 * every value that reaches it is counted in that magnitude's last bucket.
 */
static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
    int16_t nsteps, int64_t value)
{
	int64_t this = 1, last, next;
	int base = 1, order;

	for (order = 0; order < low; ++order)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < this)
		return (0);

	last = this;

	for (this *= factor; order <= high; ++order) {
		int nbuckets = this > nsteps ? nsteps : this;
		int overflow;

		/*
		 * We should not generally get log/linear quantizations with
		 * a high magnitude that allows 64 bits to overflow, but we
		 * protect against it nonetheless.  The check is made
		 * _before_ multiplying:  signed overflow is undefined, and a
		 * wrapped product is not necessarily smaller than its
		 * operand (7^23, for example, wraps to a positive value
		 * larger than 7^22), so testing the product afterwards is
		 * not reliable.
		 */
		overflow = (factor > 0 && this > INT64_MAX / factor);
		next = overflow ? this : this * factor;

		if (overflow || next < this)
			value = this - 1;

		/*
		 * If our value lies within this order of magnitude,
		 * determine its position by taking the offset within
		 * the order of magnitude, dividing by the bucket
		 * width, and adding to our (accumulated) base.
		 */
		if (value < this)
			return (base + (value - last) / (this / nbuckets));

		base += nbuckets - (nbuckets / factor);
		last = this;
		this = next;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return (base);
}
2462
/*
 * llquantize() aggregating action.  The first word of llquanta[] encodes
 * the factor, low and high magnitudes and step count; the counters follow
 * it.  incr is added to the counter selected for nval by
 * dtrace_aggregate_llquantize_bucket().
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *llquanta++;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
	int bucket;

	bucket = dtrace_aggregate_llquantize_bucket(factor, low, high,
	    nsteps, nval);
	llquanta[bucket] += incr;
}
2474
/*
 * avg() aggregating action:  data[0] counts the values seen and data[1]
 * accumulates their sum.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	uint64_t *count = &data[0];
	uint64_t *total = &data[1];

	*count += 1;
	*total += nval;
}
2483
/*
 * stddev() aggregating action:  data[0] counts the values seen, data[1]
 * accumulates their sum, and data[2..3] accumulate the sum of their squares
 * as a 128-bit quantity.  The arg parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	uint64_t magnitude;
	uint64_t square[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 *	data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic on the absolute value of nval.
	 * The absolute value is formed with unsigned negation:  negating
	 * INT64_MIN as a signed quantity would itself overflow.
	 */
	if ((int64_t)nval < 0)
		magnitude = (uint64_t)0 - nval;
	else
		magnitude = nval;

	dtrace_multiply_128(magnitude, magnitude, square);
	dtrace_add_128(data + 2, square, data + 2);
}
2509
/*
 * count() aggregating action:  bump the stored counter by one.  Neither
 * nval nor arg is used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
	(*oval)++;
}
2517
/*
 * sum() aggregating action:  add nval to the stored total.  The arg
 * parameter is unused.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	*oval = *oval + nval;
}
2525
2526 /*
2527 * Aggregate given the tuple in the principal data buffer, and the aggregating
2528 * action denoted by the specified dtrace_aggregation_t. The aggregation
2529 * buffer is specified as the buf parameter. This routine does not return
2530 * failure; if there is no space in the aggregation buffer, the data will be
2531 * dropped, and a corresponding counter incremented.
2532 */
2533 __attribute__((noinline))
2534 static void
dtrace_aggregate(dtrace_aggregation_t * agg,dtrace_buffer_t * dbuf,intptr_t offset,dtrace_buffer_t * buf,uint64_t expr,uint64_t arg)2535 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2536 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2537 {
2538 #pragma unused(arg)
2539 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2540 uint32_t i, ndx, size, fsize;
2541 uint32_t align = sizeof (uint64_t) - 1;
2542 dtrace_aggbuffer_t *agb;
2543 dtrace_aggkey_t *key;
2544 uint32_t hashval = 0, limit, isstr;
2545 caddr_t tomax, data, kdata;
2546 dtrace_actkind_t action;
2547 dtrace_action_t *act;
2548 uintptr_t offs;
2549
2550 if (buf == NULL)
2551 return;
2552
2553 if (!agg->dtag_hasarg) {
2554 /*
2555 * Currently, only quantize() and lquantize() take additional
2556 * arguments, and they have the same semantics: an increment
2557 * value that defaults to 1 when not present. If additional
2558 * aggregating actions take arguments, the setting of the
2559 * default argument value will presumably have to become more
2560 * sophisticated...
2561 */
2562 arg = 1;
2563 }
2564
2565 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2566 size = rec->dtrd_offset - agg->dtag_base;
2567 fsize = size + rec->dtrd_size;
2568
2569 ASSERT(dbuf->dtb_tomax != NULL);
2570 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2571
2572 if ((tomax = buf->dtb_tomax) == NULL) {
2573 dtrace_buffer_drop(buf);
2574 return;
2575 }
2576
2577 /*
2578 * The metastructure is always at the bottom of the buffer.
2579 */
2580 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2581 sizeof (dtrace_aggbuffer_t));
2582
2583 if (buf->dtb_offset == 0) {
2584 /*
2585 * We just kludge up approximately 1/8th of the size to be
2586 * buckets. If this guess ends up being routinely
2587 * off-the-mark, we may need to dynamically readjust this
2588 * based on past performance.
2589 */
2590 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2591
2592 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2593 (uintptr_t)tomax || hashsize == 0) {
2594 /*
2595 * We've been given a ludicrously small buffer;
2596 * increment our drop count and leave.
2597 */
2598 dtrace_buffer_drop(buf);
2599 return;
2600 }
2601
2602 /*
2603 * And now, a pathetic attempt to try to get a an odd (or
2604 * perchance, a prime) hash size for better hash distribution.
2605 */
2606 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2607 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2608
2609 agb->dtagb_hashsize = hashsize;
2610 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2611 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2612 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2613
2614 for (i = 0; i < agb->dtagb_hashsize; i++)
2615 agb->dtagb_hash[i] = NULL;
2616 }
2617
2618 ASSERT(agg->dtag_first != NULL);
2619 ASSERT(agg->dtag_first->dta_intuple);
2620
2621 /*
2622 * Calculate the hash value based on the key. Note that we _don't_
2623 * include the aggid in the hashing (but we will store it as part of
2624 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2625 * algorithm: a simple, quick algorithm that has no known funnels, and
2626 * gets good distribution in practice. The efficacy of the hashing
2627 * algorithm (and a comparison with other algorithms) may be found by
2628 * running the ::dtrace_aggstat MDB dcmd.
2629 */
2630 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2631 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2632 limit = i + act->dta_rec.dtrd_size;
2633 ASSERT(limit <= size);
2634 isstr = DTRACEACT_ISSTRING(act);
2635
2636 for (; i < limit; i++) {
2637 hashval += data[i];
2638 hashval += (hashval << 10);
2639 hashval ^= (hashval >> 6);
2640
2641 if (isstr && data[i] == '\0')
2642 break;
2643 }
2644 }
2645
2646 hashval += (hashval << 3);
2647 hashval ^= (hashval >> 11);
2648 hashval += (hashval << 15);
2649
2650 /*
2651 * Yes, the divide here is expensive -- but it's generally the least
2652 * of the performance issues given the amount of data that we iterate
2653 * over to compute hash values, compare data, etc.
2654 */
2655 ndx = hashval % agb->dtagb_hashsize;
2656
2657 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2658 ASSERT((caddr_t)key >= tomax);
2659 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2660
2661 if (hashval != key->dtak_hashval || key->dtak_size != size)
2662 continue;
2663
2664 kdata = key->dtak_data;
2665 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2666
2667 for (act = agg->dtag_first; act->dta_intuple;
2668 act = act->dta_next) {
2669 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2670 limit = i + act->dta_rec.dtrd_size;
2671 ASSERT(limit <= size);
2672 isstr = DTRACEACT_ISSTRING(act);
2673
2674 for (; i < limit; i++) {
2675 if (kdata[i] != data[i])
2676 goto next;
2677
2678 if (isstr && data[i] == '\0')
2679 break;
2680 }
2681 }
2682
2683 if (action != key->dtak_action) {
2684 /*
2685 * We are aggregating on the same value in the same
2686 * aggregation with two different aggregating actions.
2687 * (This should have been picked up in the compiler,
2688 * so we may be dealing with errant or devious DIF.)
2689 * This is an error condition; we indicate as much,
2690 * and return.
2691 */
2692 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2693 return;
2694 }
2695
2696 /*
2697 * This is a hit: we need to apply the aggregator to
2698 * the value at this key.
2699 */
2700 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2701 return;
2702 next:
2703 continue;
2704 }
2705
2706 /*
2707 * We didn't find it. We need to allocate some zero-filled space,
2708 * link it into the hash table appropriately, and apply the aggregator
2709 * to the (zero-filled) value.
2710 */
2711 offs = buf->dtb_offset;
2712 while (offs & (align - 1))
2713 offs += sizeof (uint32_t);
2714
2715 /*
2716 * If we don't have enough room to both allocate a new key _and_
2717 * its associated data, increment the drop count and return.
2718 */
2719 if ((uintptr_t)tomax + offs + fsize >
2720 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2721 dtrace_buffer_drop(buf);
2722 return;
2723 }
2724
2725 /*CONSTCOND*/
2726 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2727 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2728 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2729
2730 key->dtak_data = kdata = tomax + offs;
2731 buf->dtb_offset = offs + fsize;
2732
2733 /*
2734 * Now copy the data across.
2735 */
2736 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2737
2738 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2739 kdata[i] = data[i];
2740
2741 /*
2742 * Because strings are not zeroed out by default, we need to iterate
2743 * looking for actions that store strings, and we need to explicitly
2744 * pad these strings out with zeroes.
2745 */
2746 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2747 int nul;
2748
2749 if (!DTRACEACT_ISSTRING(act))
2750 continue;
2751
2752 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2753 limit = i + act->dta_rec.dtrd_size;
2754 ASSERT(limit <= size);
2755
2756 for (nul = 0; i < limit; i++) {
2757 if (nul) {
2758 kdata[i] = '\0';
2759 continue;
2760 }
2761
2762 if (data[i] != '\0')
2763 continue;
2764
2765 nul = 1;
2766 }
2767 }
2768
2769 for (i = size; i < fsize; i++)
2770 kdata[i] = 0;
2771
2772 key->dtak_hashval = hashval;
2773 key->dtak_size = size;
2774 key->dtak_action = action;
2775 key->dtak_next = agb->dtagb_hash[ndx];
2776 agb->dtagb_hash[ndx] = key;
2777
2778 /*
2779 * Finally, apply the aggregator.
2780 */
2781 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2782 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2783 }
2784
2785 /*
2786 * Given consumer state, this routine finds a speculation in the INACTIVE
2787 * state and transitions it into the ACTIVE state. If there is no speculation
2788 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2789 * incremented -- it is up to the caller to take appropriate action.
2790 */
2791 static int
dtrace_speculation(dtrace_state_t * state)2792 dtrace_speculation(dtrace_state_t *state)
2793 {
2794 int i = 0;
2795 dtrace_speculation_state_t current;
2796 uint32_t *stat = &state->dts_speculations_unavail, count;
2797
2798 while (i < state->dts_nspeculations) {
2799 dtrace_speculation_t *spec = &state->dts_speculations[i];
2800
2801 current = spec->dtsp_state;
2802
2803 if (current != DTRACESPEC_INACTIVE) {
2804 if (current == DTRACESPEC_COMMITTINGMANY ||
2805 current == DTRACESPEC_COMMITTING ||
2806 current == DTRACESPEC_DISCARDING)
2807 stat = &state->dts_speculations_busy;
2808 i++;
2809 continue;
2810 }
2811
2812 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2813 current, DTRACESPEC_ACTIVE) == current)
2814 return (i + 1);
2815 }
2816
2817 /*
2818 * We couldn't find a speculation. If we found as much as a single
2819 * busy speculation buffer, we'll attribute this failure as "busy"
2820 * instead of "unavail".
2821 */
2822 do {
2823 count = *stat;
2824 } while (dtrace_cas32(stat, count, count + 1) != count);
2825
2826 return (0);
2827 }
2828
2829 /*
2830 * This routine commits an active speculation. If the specified speculation
2831 * is not in a valid state to perform a commit(), this routine will silently do
2832 * nothing. The state of the specified speculation is transitioned according
2833 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2834 */
2835 static void
dtrace_speculation_commit(dtrace_state_t * state,processorid_t cpu,dtrace_specid_t which)2836 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2837 dtrace_specid_t which)
2838 {
2839 dtrace_speculation_t *spec;
2840 dtrace_buffer_t *src, *dest;
2841 uintptr_t daddr, saddr, dlimit, slimit;
2842 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2843 intptr_t offs;
2844 uint64_t timestamp;
2845
2846 if (which == 0)
2847 return;
2848
2849 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2850 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2851 return;
2852 }
2853
2854 spec = &state->dts_speculations[which - 1];
2855 src = &spec->dtsp_buffer[cpu];
2856 dest = &state->dts_buffer[cpu];
2857
2858 do {
2859 current = spec->dtsp_state;
2860
2861 if (current == DTRACESPEC_COMMITTINGMANY)
2862 break;
2863
2864 switch (current) {
2865 case DTRACESPEC_INACTIVE:
2866 case DTRACESPEC_DISCARDING:
2867 return;
2868
2869 case DTRACESPEC_COMMITTING:
2870 /*
2871 * This is only possible if we are (a) commit()'ing
2872 * without having done a prior speculate() on this CPU
2873 * and (b) racing with another commit() on a different
2874 * CPU. There's nothing to do -- we just assert that
2875 * our offset is 0.
2876 */
2877 ASSERT(src->dtb_offset == 0);
2878 return;
2879
2880 case DTRACESPEC_ACTIVE:
2881 new = DTRACESPEC_COMMITTING;
2882 break;
2883
2884 case DTRACESPEC_ACTIVEONE:
2885 /*
2886 * This speculation is active on one CPU. If our
2887 * buffer offset is non-zero, we know that the one CPU
2888 * must be us. Otherwise, we are committing on a
2889 * different CPU from the speculate(), and we must
2890 * rely on being asynchronously cleaned.
2891 */
2892 if (src->dtb_offset != 0) {
2893 new = DTRACESPEC_COMMITTING;
2894 break;
2895 }
2896 OS_FALLTHROUGH;
2897
2898 case DTRACESPEC_ACTIVEMANY:
2899 new = DTRACESPEC_COMMITTINGMANY;
2900 break;
2901
2902 default:
2903 ASSERT(0);
2904 }
2905 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2906 current, new) != current);
2907
2908 /*
2909 * We have set the state to indicate that we are committing this
2910 * speculation. Now reserve the necessary space in the destination
2911 * buffer.
2912 */
2913 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2914 sizeof (uint64_t), state, NULL)) < 0) {
2915 dtrace_buffer_drop(dest);
2916 goto out;
2917 }
2918
2919 /*
2920 * We have sufficient space to copy the speculative buffer into the
2921 * primary buffer. First, modify the speculative buffer, filling
2922 * in the timestamp of all entries with the current time. The data
2923 * must have the commit() time rather than the time it was traced,
2924 * so that all entries in the primary buffer are in timestamp order.
2925 */
2926 timestamp = dtrace_gethrtime();
2927 saddr = (uintptr_t)src->dtb_tomax;
2928 slimit = saddr + src->dtb_offset;
2929 while (saddr < slimit) {
2930 size_t size;
2931 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2932
2933 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2934 saddr += sizeof (dtrace_epid_t);
2935 continue;
2936 }
2937
2938 ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2939 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2940
2941 ASSERT(saddr + size <= slimit);
2942 ASSERT(size >= sizeof(dtrace_rechdr_t));
2943 ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2944
2945 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2946
2947 saddr += size;
2948 }
2949
2950 /*
2951 * Copy the buffer across. (Note that this is a
2952 * highly suboptimal bcopy(); in the unlikely event that this becomes
2953 * a serious performance issue, a high-performance DTrace-specific
2954 * bcopy() should obviously be invented.)
2955 */
2956 daddr = (uintptr_t)dest->dtb_tomax + offs;
2957 dlimit = daddr + src->dtb_offset;
2958 saddr = (uintptr_t)src->dtb_tomax;
2959
2960 /*
2961 * First, the aligned portion.
2962 */
2963 while (dlimit - daddr >= sizeof (uint64_t)) {
2964 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2965
2966 daddr += sizeof (uint64_t);
2967 saddr += sizeof (uint64_t);
2968 }
2969
2970 /*
2971 * Now any left-over bit...
2972 */
2973 while (dlimit - daddr)
2974 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2975
2976 /*
2977 * Finally, commit the reserved space in the destination buffer.
2978 */
2979 dest->dtb_offset = offs + src->dtb_offset;
2980
2981 out:
2982 /*
2983 * If we're lucky enough to be the only active CPU on this speculation
2984 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2985 */
2986 if (current == DTRACESPEC_ACTIVE ||
2987 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2988 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2989 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2990 #pragma unused(rval) /* __APPLE__ */
2991
2992 ASSERT(rval == DTRACESPEC_COMMITTING);
2993 }
2994
2995 src->dtb_offset = 0;
2996 src->dtb_xamot_drops += src->dtb_drops;
2997 src->dtb_drops = 0;
2998 }
2999
3000 /*
3001 * This routine discards an active speculation. If the specified speculation
3002 * is not in a valid state to perform a discard(), this routine will silently
3003 * do nothing. The state of the specified speculation is transitioned
3004 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3005 */
3006 __attribute__((noinline))
3007 static void
dtrace_speculation_discard(dtrace_state_t * state,processorid_t cpu,dtrace_specid_t which)3008 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3009 dtrace_specid_t which)
3010 {
3011 dtrace_speculation_t *spec;
3012 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3013 dtrace_buffer_t *buf;
3014
3015 if (which == 0)
3016 return;
3017
3018 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3019 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3020 return;
3021 }
3022
3023 spec = &state->dts_speculations[which - 1];
3024 buf = &spec->dtsp_buffer[cpu];
3025
3026 do {
3027 current = spec->dtsp_state;
3028
3029 switch (current) {
3030 case DTRACESPEC_INACTIVE:
3031 case DTRACESPEC_COMMITTINGMANY:
3032 case DTRACESPEC_COMMITTING:
3033 case DTRACESPEC_DISCARDING:
3034 return;
3035
3036 case DTRACESPEC_ACTIVE:
3037 case DTRACESPEC_ACTIVEMANY:
3038 new = DTRACESPEC_DISCARDING;
3039 break;
3040
3041 case DTRACESPEC_ACTIVEONE:
3042 if (buf->dtb_offset != 0) {
3043 new = DTRACESPEC_INACTIVE;
3044 } else {
3045 new = DTRACESPEC_DISCARDING;
3046 }
3047 break;
3048
3049 default:
3050 ASSERT(0);
3051 }
3052 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3053 current, new) != current);
3054
3055 buf->dtb_offset = 0;
3056 buf->dtb_drops = 0;
3057 }
3058
3059 /*
3060 * Note: not called from probe context. This function is called
3061 * asynchronously from cross call context to clean any speculations that are
3062 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3063 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3064 * speculation.
3065 */
3066 static void
dtrace_speculation_clean_here(dtrace_state_t * state)3067 dtrace_speculation_clean_here(dtrace_state_t *state)
3068 {
3069 dtrace_icookie_t cookie;
3070 processorid_t cpu = CPU->cpu_id;
3071 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3072 dtrace_specid_t i;
3073
3074 cookie = dtrace_interrupt_disable();
3075
3076 if (dest->dtb_tomax == NULL) {
3077 dtrace_interrupt_enable(cookie);
3078 return;
3079 }
3080
3081 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3082 dtrace_speculation_t *spec = &state->dts_speculations[i];
3083 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3084
3085 if (src->dtb_tomax == NULL)
3086 continue;
3087
3088 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3089 src->dtb_offset = 0;
3090 continue;
3091 }
3092
3093 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3094 continue;
3095
3096 if (src->dtb_offset == 0)
3097 continue;
3098
3099 dtrace_speculation_commit(state, cpu, i + 1);
3100 }
3101
3102 dtrace_interrupt_enable(cookie);
3103 }
3104
3105 /*
3106 * Note: not called from probe context. This function is called
3107 * asynchronously (and at a regular interval) to clean any speculations that
3108 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3109 * is work to be done, it cross calls all CPUs to perform that work;
3110 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
3111 * INACTIVE state until they have been cleaned by all CPUs.
3112 */
3113 static void
dtrace_speculation_clean(dtrace_state_t * state)3114 dtrace_speculation_clean(dtrace_state_t *state)
3115 {
3116 int work = 0;
3117 uint32_t rv;
3118 dtrace_specid_t i;
3119
3120 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3121 dtrace_speculation_t *spec = &state->dts_speculations[i];
3122
3123 ASSERT(!spec->dtsp_cleaning);
3124
3125 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3126 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3127 continue;
3128
3129 work++;
3130 spec->dtsp_cleaning = 1;
3131 }
3132
3133 if (!work)
3134 return;
3135
3136 dtrace_xcall(DTRACE_CPUALL,
3137 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3138
3139 /*
3140 * We now know that all CPUs have committed or discarded their
3141 * speculation buffers, as appropriate. We can now set the state
3142 * to inactive.
3143 */
3144 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3145 dtrace_speculation_t *spec = &state->dts_speculations[i];
3146 dtrace_speculation_state_t current, new;
3147
3148 if (!spec->dtsp_cleaning)
3149 continue;
3150
3151 current = spec->dtsp_state;
3152 ASSERT(current == DTRACESPEC_DISCARDING ||
3153 current == DTRACESPEC_COMMITTINGMANY);
3154
3155 new = DTRACESPEC_INACTIVE;
3156
3157 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3158 ASSERT(rv == current);
3159 spec->dtsp_cleaning = 0;
3160 }
3161 }
3162
3163 /*
3164 * Called as part of a speculate() to get the speculative buffer associated
3165 * with a given speculation. Returns NULL if the specified speculation is not
3166 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3167 * the active CPU is not the specified CPU -- the speculation will be
3168 * atomically transitioned into the ACTIVEMANY state.
3169 */
3170 __attribute__((noinline))
3171 static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t * state,processorid_t cpuid,dtrace_specid_t which)3172 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3173 dtrace_specid_t which)
3174 {
3175 dtrace_speculation_t *spec;
3176 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3177 dtrace_buffer_t *buf;
3178
3179 if (which == 0)
3180 return (NULL);
3181
3182 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3183 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3184 return (NULL);
3185 }
3186
3187 spec = &state->dts_speculations[which - 1];
3188 buf = &spec->dtsp_buffer[cpuid];
3189
3190 do {
3191 current = spec->dtsp_state;
3192
3193 switch (current) {
3194 case DTRACESPEC_INACTIVE:
3195 case DTRACESPEC_COMMITTINGMANY:
3196 case DTRACESPEC_DISCARDING:
3197 return (NULL);
3198
3199 case DTRACESPEC_COMMITTING:
3200 ASSERT(buf->dtb_offset == 0);
3201 return (NULL);
3202
3203 case DTRACESPEC_ACTIVEONE:
3204 /*
3205 * This speculation is currently active on one CPU.
3206 * Check the offset in the buffer; if it's non-zero,
3207 * that CPU must be us (and we leave the state alone).
3208 * If it's zero, assume that we're starting on a new
3209 * CPU -- and change the state to indicate that the
3210 * speculation is active on more than one CPU.
3211 */
3212 if (buf->dtb_offset != 0)
3213 return (buf);
3214
3215 new = DTRACESPEC_ACTIVEMANY;
3216 break;
3217
3218 case DTRACESPEC_ACTIVEMANY:
3219 return (buf);
3220
3221 case DTRACESPEC_ACTIVE:
3222 new = DTRACESPEC_ACTIVEONE;
3223 break;
3224
3225 default:
3226 ASSERT(0);
3227 }
3228 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3229 current, new) != current);
3230
3231 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3232 return (buf);
3233 }
3234
3235 /*
3236 * Return a string. In the event that the user lacks the privilege to access
3237 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3238 * don't fail access checking.
3239 *
3240 * dtrace_dif_variable() uses this routine as a helper for various
3241 * builtin values such as 'execname' and 'probefunc.'
3242 */
3243 static
3244 uintptr_t
dtrace_dif_varstr(uintptr_t addr,dtrace_state_t * state,dtrace_mstate_t * mstate)3245 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3246 dtrace_mstate_t *mstate)
3247 {
3248 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3249 uintptr_t ret;
3250 size_t strsz;
3251
3252 /*
3253 * The easy case: this probe is allowed to read all of memory, so
3254 * we can just return this as a vanilla pointer.
3255 */
3256 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3257 return (addr);
3258
3259 /*
3260 * This is the tougher case: we copy the string in question from
3261 * kernel memory into scratch memory and return it that way: this
3262 * ensures that we won't trip up when access checking tests the
3263 * BYREF return value.
3264 */
3265 strsz = dtrace_strlen((char *)addr, size) + 1;
3266
3267 if (mstate->dtms_scratch_ptr + strsz >
3268 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3269 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3270 return (0);
3271 }
3272
3273 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3274 strsz);
3275 ret = mstate->dtms_scratch_ptr;
3276 mstate->dtms_scratch_ptr += strsz;
3277 return (ret);
3278 }
3279
3280 /*
3281 * This function implements the DIF emulator's variable lookups. The emulator
3282 * passes a reserved variable identifier and optional built-in array index.
3283 */
3284 static uint64_t
dtrace_dif_variable(dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t v,uint64_t ndx)3285 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3286 uint64_t ndx)
3287 {
3288 /*
3289 * If we're accessing one of the uncached arguments, we'll turn this
3290 * into a reference in the args array.
3291 */
3292 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3293 ndx = v - DIF_VAR_ARG0;
3294 v = DIF_VAR_ARGS;
3295 }
3296
3297 switch (v) {
3298 case DIF_VAR_ARGS:
3299 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3300 if (ndx >= sizeof (mstate->dtms_arg) /
3301 sizeof (mstate->dtms_arg[0])) {
3302 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3303 dtrace_vstate_t *vstate = &state->dts_vstate;
3304 dtrace_provider_t *pv;
3305 uint64_t val;
3306 int argndx = ndx;
3307
3308 if (argndx < 0) {
3309 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3310 return (0);
3311 }
3312
3313 pv = mstate->dtms_probe->dtpr_provider;
3314 if (pv->dtpv_pops.dtps_getargval != NULL)
3315 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3316 mstate->dtms_probe->dtpr_id,
3317 mstate->dtms_probe->dtpr_arg, argndx, aframes);
3318 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3319 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && argndx == 5) {
3320 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3321 }
3322
3323 else
3324 val = dtrace_getarg(argndx, aframes, mstate, vstate);
3325
3326 /*
3327 * This is regrettably required to keep the compiler
3328 * from tail-optimizing the call to dtrace_getarg().
3329 * The condition always evaluates to true, but the
3330 * compiler has no way of figuring that out a priori.
3331 * (None of this would be necessary if the compiler
3332 * could be relied upon to _always_ tail-optimize
3333 * the call to dtrace_getarg() -- but it can't.)
3334 */
3335 if (mstate->dtms_probe != NULL)
3336 return (val);
3337
3338 ASSERT(0);
3339 }
3340
3341 return (mstate->dtms_arg[ndx]);
3342
3343 case DIF_VAR_UREGS: {
3344 thread_t thread;
3345
3346 if (!dtrace_priv_proc(state))
3347 return (0);
3348
3349 if ((thread = current_thread()) == NULL) {
3350 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3351 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3352 return (0);
3353 }
3354
3355 return (dtrace_getreg(find_user_regs(thread), ndx));
3356 }
3357
3358 case DIF_VAR_VMREGS: {
3359 uint64_t rval;
3360
3361 if (!dtrace_priv_kernel(state))
3362 return (0);
3363
3364 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3365
3366 rval = dtrace_getvmreg(ndx);
3367
3368 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3369
3370 return (rval);
3371 }
3372
3373 case DIF_VAR_CURTHREAD:
3374 if (!dtrace_priv_kernel(state))
3375 return (0);
3376
3377 return ((uint64_t)(uintptr_t)current_thread());
3378
3379 case DIF_VAR_TIMESTAMP:
3380 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3381 mstate->dtms_timestamp = dtrace_gethrtime();
3382 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3383 }
3384 return (mstate->dtms_timestamp);
3385
3386 case DIF_VAR_VTIMESTAMP:
3387 ASSERT(dtrace_vtime_references != 0);
3388 return (dtrace_get_thread_vtime(current_thread()));
3389
3390 case DIF_VAR_WALLTIMESTAMP:
3391 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3392 mstate->dtms_walltimestamp = dtrace_gethrestime();
3393 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3394 }
3395 return (mstate->dtms_walltimestamp);
3396
3397 case DIF_VAR_MACHTIMESTAMP:
3398 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3399 mstate->dtms_machtimestamp = mach_absolute_time();
3400 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3401 }
3402 return (mstate->dtms_machtimestamp);
3403
3404 case DIF_VAR_MACHCTIMESTAMP:
3405 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHCTIMESTAMP)) {
3406 mstate->dtms_machctimestamp = mach_continuous_time();
3407 mstate->dtms_present |= DTRACE_MSTATE_MACHCTIMESTAMP;
3408 }
3409 return (mstate->dtms_machctimestamp);
3410
3411
3412 case DIF_VAR_CPU:
3413 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3414
3415 case DIF_VAR_IPL:
3416 if (!dtrace_priv_kernel(state))
3417 return (0);
3418 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3419 mstate->dtms_ipl = dtrace_getipl();
3420 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3421 }
3422 return (mstate->dtms_ipl);
3423
3424 case DIF_VAR_EPID:
3425 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3426 return (mstate->dtms_epid);
3427
3428 case DIF_VAR_ID:
3429 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3430 return (mstate->dtms_probe->dtpr_id);
3431
3432 case DIF_VAR_STACKDEPTH:
3433 if (!dtrace_priv_kernel(state))
3434 return (0);
3435 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3436 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3437
3438 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3439 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3440 }
3441 return (mstate->dtms_stackdepth);
3442
3443 case DIF_VAR_USTACKDEPTH:
3444 if (!dtrace_priv_proc(state))
3445 return (0);
3446 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3447 /*
3448 * See comment in DIF_VAR_PID.
3449 */
3450 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3451 CPU_ON_INTR(CPU)) {
3452 mstate->dtms_ustackdepth = 0;
3453 } else {
3454 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3455 mstate->dtms_ustackdepth =
3456 dtrace_getustackdepth();
3457 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3458 }
3459 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3460 }
3461 return (mstate->dtms_ustackdepth);
3462
3463 case DIF_VAR_CALLER:
3464 if (!dtrace_priv_kernel(state))
3465 return (0);
3466 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3467 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3468
3469 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3470 /*
3471 * If this is an unanchored probe, we are
3472 * required to go through the slow path:
3473 * dtrace_caller() only guarantees correct
3474 * results for anchored probes.
3475 */
3476 pc_t caller[2];
3477
3478 dtrace_getpcstack(caller, 2, aframes,
3479 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3480 mstate->dtms_caller = caller[1];
3481 } else if ((mstate->dtms_caller =
3482 dtrace_caller(aframes)) == (uintptr_t)-1) {
3483 /*
3484 * We have failed to do this the quick way;
3485 * we must resort to the slower approach of
3486 * calling dtrace_getpcstack().
3487 */
3488 pc_t caller;
3489
3490 dtrace_getpcstack(&caller, 1, aframes, NULL);
3491 mstate->dtms_caller = caller;
3492 }
3493
3494 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3495 }
3496 return (mstate->dtms_caller);
3497
3498 case DIF_VAR_UCALLER:
3499 if (!dtrace_priv_proc(state))
3500 return (0);
3501
3502 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3503 uint64_t ustack[3];
3504
3505 /*
3506 * dtrace_getupcstack() fills in the first uint64_t
3507 * with the current PID. The second uint64_t will
3508 * be the program counter at user-level. The third
3509 * uint64_t will contain the caller, which is what
3510 * we're after.
3511 */
3512 ustack[2] = 0;
3513 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3514 dtrace_getupcstack(ustack, 3);
3515 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3516 mstate->dtms_ucaller = ustack[2];
3517 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3518 }
3519
3520 return (mstate->dtms_ucaller);
3521
3522 case DIF_VAR_PROBEPROV:
3523 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3524 return (dtrace_dif_varstr(
3525 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3526 state, mstate));
3527
3528 case DIF_VAR_PROBEMOD:
3529 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3530 return (dtrace_dif_varstr(
3531 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3532 state, mstate));
3533
3534 case DIF_VAR_PROBEFUNC:
3535 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3536 return (dtrace_dif_varstr(
3537 (uintptr_t)mstate->dtms_probe->dtpr_func,
3538 state, mstate));
3539
3540 case DIF_VAR_PROBENAME:
3541 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3542 return (dtrace_dif_varstr(
3543 (uintptr_t)mstate->dtms_probe->dtpr_name,
3544 state, mstate));
3545
3546 case DIF_VAR_PID:
3547 if (!dtrace_priv_proc_relaxed(state))
3548 return (0);
3549
3550 /*
3551 * Note that we are assuming that an unanchored probe is
3552 * always due to a high-level interrupt. (And we're assuming
3553 * that there is only a single high level interrupt.)
3554 */
3555 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3556 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3557 return 0;
3558
3559 return ((uint64_t)dtrace_proc_selfpid());
3560
3561 case DIF_VAR_PPID:
3562 if (!dtrace_priv_proc_relaxed(state))
3563 return (0);
3564
3565 /*
3566 * See comment in DIF_VAR_PID.
3567 */
3568 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3569 return (0);
3570
3571 return ((uint64_t)dtrace_proc_selfppid());
3572
3573 case DIF_VAR_TID:
3574 /* We do not need to check for null current_thread() */
3575 return thread_tid(current_thread()); /* globally unique */
3576
3577 case DIF_VAR_PTHREAD_SELF:
3578 if (!dtrace_priv_proc(state))
3579 return (0);
3580
3581 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3582 return 0;
3583
3584 case DIF_VAR_DISPATCHQADDR:
3585 if (!dtrace_priv_proc(state))
3586 return (0);
3587
3588 /* We do not need to check for null current_thread() */
3589 return thread_dispatchqaddr(current_thread());
3590
3591 case DIF_VAR_EXECNAME:
3592 {
3593 char *xname = (char *)mstate->dtms_scratch_ptr;
3594 const char *pname = proc_best_name(curproc);
3595 size_t scratch_size = sizeof(proc_name_t);
3596
3597 /* The scratch allocation's lifetime is that of the clause. */
3598 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3599 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3600 return 0;
3601 }
3602
3603 if (!dtrace_priv_proc_relaxed(state))
3604 return (0);
3605
3606 mstate->dtms_scratch_ptr += scratch_size;
3607 strlcpy(xname, pname, scratch_size);
3608
3609 return ((uint64_t)(uintptr_t)xname);
3610 }
3611
3612
3613 case DIF_VAR_ZONENAME:
3614 {
3615 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3616 char *zname = (char *)mstate->dtms_scratch_ptr;
3617 size_t scratch_size = 6 + 1;
3618
3619 if (!dtrace_priv_proc(state))
3620 return (0);
3621
3622 /* The scratch allocation's lifetime is that of the clause. */
3623 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3624 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3625 return 0;
3626 }
3627
3628 mstate->dtms_scratch_ptr += scratch_size;
3629
3630 /* The kernel does not provide zonename, it will always return 'global'. */
3631 strlcpy(zname, "global", scratch_size);
3632
3633 return ((uint64_t)(uintptr_t)zname);
3634 }
3635
3636 #if CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS
3637 case DIF_VAR_CPUINSTRS:
3638 return mt_cur_cpu_instrs();
3639
3640 case DIF_VAR_CPUCYCLES:
3641 return mt_cur_cpu_cycles();
3642
3643 case DIF_VAR_VINSTRS: {
3644 struct recount_usage usage = { 0 };
3645 recount_current_thread_usage(&usage);
3646 return recount_usage_instructions(&usage);
3647 }
3648
3649 case DIF_VAR_VCYCLES: {
3650 struct recount_usage usage = { 0 };
3651 recount_current_thread_usage(&usage);
3652 return recount_usage_cycles(&usage);
3653 }
3654
3655 #else /* CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS */
3656 case DIF_VAR_CPUINSTRS:
3657 case DIF_VAR_CPUCYCLES:
3658 case DIF_VAR_VINSTRS:
3659 case DIF_VAR_VCYCLES:
3660 return 0;
3661 #endif /* !CONFIG_PERVASIVE_CPI || !CONFIG_CPU_COUNTERS */
3662
3663 case DIF_VAR_UID:
3664 if (!dtrace_priv_proc_relaxed(state))
3665 return (0);
3666
3667 /*
3668 * See comment in DIF_VAR_PID.
3669 */
3670 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3671 return (0);
3672
3673 return ((uint64_t) dtrace_proc_selfruid());
3674
3675 case DIF_VAR_GID:
3676 if (!dtrace_priv_proc(state))
3677 return (0);
3678
3679 /*
3680 * See comment in DIF_VAR_PID.
3681 */
3682 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3683 return (0);
3684
3685 if (dtrace_CRED() != NULL)
3686 /* Credential does not require lazy initialization. */
3687 return ((uint64_t)kauth_getgid());
3688 else {
3689 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3690 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3691 return -1ULL;
3692 }
3693
3694 case DIF_VAR_ERRNO: {
3695 uthread_t uthread = current_uthread();
3696 if (!dtrace_priv_proc(state))
3697 return (0);
3698
3699 /*
3700 * See comment in DIF_VAR_PID.
3701 */
3702 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3703 return (0);
3704
3705 if (uthread)
3706 return (uint64_t)uthread->t_dtrace_errno;
3707 else {
3708 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3709 return -1ULL;
3710 }
3711 }
3712
3713 default:
3714 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3715 return (0);
3716 }
3717 }
3718
/*
 * States of the dtrace_json() parsing state machine.  Numbering starts at 1.
 */
typedef enum dtrace_json_state {
	DTRACE_JSON_REST = 1,			/* before the top-level value */
	DTRACE_JSON_OBJECT = 2,			/* looking for the next key */
	DTRACE_JSON_STRING = 3,			/* inside a String */
	DTRACE_JSON_STRING_ESCAPE = 4,		/* just after a backslash */
	DTRACE_JSON_STRING_ESCAPE_UNICODE = 5,	/* in the digits of a \u */
	DTRACE_JSON_COLON = 6,			/* expecting key/value ':' */
	DTRACE_JSON_COMMA = 7,			/* expecting separating ',' */
	DTRACE_JSON_VALUE = 8,			/* detecting the value type */
	DTRACE_JSON_IDENTIFIER = 9,		/* true, false or null */
	DTRACE_JSON_NUMBER = 10,		/* Number literal */
	DTRACE_JSON_NUMBER_FRAC = 11,		/* Number, fraction part */
	DTRACE_JSON_NUMBER_EXP = 12,		/* Number, exponent part */
	DTRACE_JSON_COLLECT_OBJECT = 13		/* collecting Object/Array */
} dtrace_json_state_t;
3734
3735 /*
3736 * This function possesses just enough knowledge about JSON to extract a single
3737 * value from a JSON string and store it in the scratch buffer. It is able
3738 * to extract nested object values, and members of arrays by index.
3739 *
3740 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3741 * be looked up as we descend into the object tree. e.g.
3742 *
3743 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3744 * with nelems = 5.
3745 *
3746 * The run time of this function must be bounded above by strsize to limit the
3747 * amount of work done in probe context. As such, it is implemented as a
3748 * simple state machine, reading one character at a time using safe loads
3749 * until we find the requested element, hit a parsing error or run off the
3750 * end of the object or string.
3751 *
3752 * As there is no way for a subroutine to return an error without interrupting
3753 * clause execution, we simply return NULL in the event of a missing key or any
3754 * other error condition. Each NULL return in this function is commented with
3755 * the error condition it represents -- parsing or otherwise.
3756 *
3757 * The set of states for the state machine closely matches the JSON
3758 * specification (http://json.org/). Briefly:
3759 *
3760 * DTRACE_JSON_REST:
3761 * Skip whitespace until we find either a top-level Object, moving
3762 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3763 *
3764 * DTRACE_JSON_OBJECT:
3765 * Locate the next key String in an Object. Sets a flag to denote
3766 * the next String as a key string and moves to DTRACE_JSON_STRING.
3767 *
3768 * DTRACE_JSON_COLON:
3769 * Skip whitespace until we find the colon that separates key Strings
3770 * from their values. Once found, move to DTRACE_JSON_VALUE.
3771 *
3772 * DTRACE_JSON_VALUE:
3773 * Detects the type of the next value (String, Number, Identifier, Object
3774 * or Array) and routes to the states that process that type. Here we also
3775 * deal with the element selector list if we are requested to traverse down
3776 * into the object tree.
3777 *
3778 * DTRACE_JSON_COMMA:
3779 * Skip whitespace until we find the comma that separates key-value pairs
3780 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3781 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3782 * states return to this state at the end of their value, unless otherwise
3783 * noted.
3784 *
3785 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3786 * Processes a Number literal from the JSON, including any exponent
3787 * component that may be present. Numbers are returned as strings, which
3788 * may be passed to strtoll() if an integer is required.
3789 *
3790 * DTRACE_JSON_IDENTIFIER:
3791 * Processes a "true", "false" or "null" literal in the JSON.
3792 *
3793 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3794 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3795 * Processes a String literal from the JSON, whether the String denotes
3796 * a key, a value or part of a larger Object. Handles all escape sequences
3797 * present in the specification, including four-digit unicode characters,
3798 * but merely includes the escape sequence without converting it to the
3799 * actual escaped character. If the String is flagged as a key, we
3800 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3801 *
3802 * DTRACE_JSON_COLLECT_OBJECT:
3803 * This state collects an entire Object (or Array), correctly handling
3804 * embedded strings. If the full element selector list matches this nested
3805 * object, we return the Object in full as a string. If not, we use this
3806 * state to skip to the next value at this level and continue processing.
3807 */
3808 static char *
dtrace_json(uint64_t size,uintptr_t json,char * elemlist,int nelems,char * dest)3809 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3810 char *dest)
3811 {
3812 dtrace_json_state_t state = DTRACE_JSON_REST;
3813 int64_t array_elem = INT64_MIN;
3814 int64_t array_pos = 0;
3815 uint8_t escape_unicount = 0;
3816 boolean_t string_is_key = B_FALSE;
3817 boolean_t collect_object = B_FALSE;
3818 boolean_t found_key = B_FALSE;
3819 boolean_t in_array = B_FALSE;
3820 uint32_t braces = 0, brackets = 0;
3821 char *elem = elemlist;
3822 char *dd = dest;
3823 uintptr_t cur;
3824
3825 for (cur = json; cur < json + size; cur++) {
3826 char cc = dtrace_load8(cur);
3827 if (cc == '\0')
3828 return (NULL);
3829
3830 switch (state) {
3831 case DTRACE_JSON_REST:
3832 if (isspace(cc))
3833 break;
3834
3835 if (cc == '{') {
3836 state = DTRACE_JSON_OBJECT;
3837 break;
3838 }
3839
3840 if (cc == '[') {
3841 in_array = B_TRUE;
3842 array_pos = 0;
3843 array_elem = dtrace_strtoll(elem, 10, size);
3844 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3845 state = DTRACE_JSON_VALUE;
3846 break;
3847 }
3848
3849 /*
3850 * ERROR: expected to find a top-level object or array.
3851 */
3852 return (NULL);
3853 case DTRACE_JSON_OBJECT:
3854 if (isspace(cc))
3855 break;
3856
3857 if (cc == '"') {
3858 state = DTRACE_JSON_STRING;
3859 string_is_key = B_TRUE;
3860 break;
3861 }
3862
3863 /*
3864 * ERROR: either the object did not start with a key
3865 * string, or we've run off the end of the object
3866 * without finding the requested key.
3867 */
3868 return (NULL);
3869 case DTRACE_JSON_STRING:
3870 if (cc == '\\') {
3871 *dd++ = '\\';
3872 state = DTRACE_JSON_STRING_ESCAPE;
3873 break;
3874 }
3875
3876 if (cc == '"') {
3877 if (collect_object) {
3878 /*
3879 * We don't reset the dest here, as
3880 * the string is part of a larger
3881 * object being collected.
3882 */
3883 *dd++ = cc;
3884 collect_object = B_FALSE;
3885 state = DTRACE_JSON_COLLECT_OBJECT;
3886 break;
3887 }
3888 *dd = '\0';
3889 dd = dest; /* reset string buffer */
3890 if (string_is_key) {
3891 if (dtrace_strncmp(dest, elem,
3892 size) == 0)
3893 found_key = B_TRUE;
3894 } else if (found_key) {
3895 if (nelems > 1) {
3896 /*
3897 * We expected an object, not
3898 * this string.
3899 */
3900 return (NULL);
3901 }
3902 return (dest);
3903 }
3904 state = string_is_key ? DTRACE_JSON_COLON :
3905 DTRACE_JSON_COMMA;
3906 string_is_key = B_FALSE;
3907 break;
3908 }
3909
3910 *dd++ = cc;
3911 break;
3912 case DTRACE_JSON_STRING_ESCAPE:
3913 *dd++ = cc;
3914 if (cc == 'u') {
3915 escape_unicount = 0;
3916 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3917 } else {
3918 state = DTRACE_JSON_STRING;
3919 }
3920 break;
3921 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3922 if (!isxdigit(cc)) {
3923 /*
3924 * ERROR: invalid unicode escape, expected
3925 * four valid hexadecimal digits.
3926 */
3927 return (NULL);
3928 }
3929
3930 *dd++ = cc;
3931 if (++escape_unicount == 4)
3932 state = DTRACE_JSON_STRING;
3933 break;
3934 case DTRACE_JSON_COLON:
3935 if (isspace(cc))
3936 break;
3937
3938 if (cc == ':') {
3939 state = DTRACE_JSON_VALUE;
3940 break;
3941 }
3942
3943 /*
3944 * ERROR: expected a colon.
3945 */
3946 return (NULL);
3947 case DTRACE_JSON_COMMA:
3948 if (isspace(cc))
3949 break;
3950
3951 if (cc == ',') {
3952 if (in_array) {
3953 state = DTRACE_JSON_VALUE;
3954 if (++array_pos == array_elem)
3955 found_key = B_TRUE;
3956 } else {
3957 state = DTRACE_JSON_OBJECT;
3958 }
3959 break;
3960 }
3961
3962 /*
3963 * ERROR: either we hit an unexpected character, or
3964 * we reached the end of the object or array without
3965 * finding the requested key.
3966 */
3967 return (NULL);
3968 case DTRACE_JSON_IDENTIFIER:
3969 if (islower(cc)) {
3970 *dd++ = cc;
3971 break;
3972 }
3973
3974 *dd = '\0';
3975 dd = dest; /* reset string buffer */
3976
3977 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3978 dtrace_strncmp(dest, "false", 6) == 0 ||
3979 dtrace_strncmp(dest, "null", 5) == 0) {
3980 if (found_key) {
3981 if (nelems > 1) {
3982 /*
3983 * ERROR: We expected an object,
3984 * not this identifier.
3985 */
3986 return (NULL);
3987 }
3988 return (dest);
3989 } else {
3990 cur--;
3991 state = DTRACE_JSON_COMMA;
3992 break;
3993 }
3994 }
3995
3996 /*
3997 * ERROR: we did not recognise the identifier as one
3998 * of those in the JSON specification.
3999 */
4000 return (NULL);
4001 case DTRACE_JSON_NUMBER:
4002 if (cc == '.') {
4003 *dd++ = cc;
4004 state = DTRACE_JSON_NUMBER_FRAC;
4005 break;
4006 }
4007
4008 if (cc == 'x' || cc == 'X') {
4009 /*
4010 * ERROR: specification explicitly excludes
4011 * hexadecimal or octal numbers.
4012 */
4013 return (NULL);
4014 }
4015
4016 OS_FALLTHROUGH;
4017 case DTRACE_JSON_NUMBER_FRAC:
4018 if (cc == 'e' || cc == 'E') {
4019 *dd++ = cc;
4020 state = DTRACE_JSON_NUMBER_EXP;
4021 break;
4022 }
4023
4024 if (cc == '+' || cc == '-') {
4025 /*
4026 * ERROR: expect sign as part of exponent only.
4027 */
4028 return (NULL);
4029 }
4030 OS_FALLTHROUGH;
4031 case DTRACE_JSON_NUMBER_EXP:
4032 if (isdigit(cc) || cc == '+' || cc == '-') {
4033 *dd++ = cc;
4034 break;
4035 }
4036
4037 *dd = '\0';
4038 dd = dest; /* reset string buffer */
4039 if (found_key) {
4040 if (nelems > 1) {
4041 /*
4042 * ERROR: We expected an object, not
4043 * this number.
4044 */
4045 return (NULL);
4046 }
4047 return (dest);
4048 }
4049
4050 cur--;
4051 state = DTRACE_JSON_COMMA;
4052 break;
4053 case DTRACE_JSON_VALUE:
4054 if (isspace(cc))
4055 break;
4056
4057 if (cc == '{' || cc == '[') {
4058 if (nelems > 1 && found_key) {
4059 in_array = cc == '[' ? B_TRUE : B_FALSE;
4060 /*
4061 * If our element selector directs us
4062 * to descend into this nested object,
4063 * then move to the next selector
4064 * element in the list and restart the
4065 * state machine.
4066 */
4067 while (*elem != '\0')
4068 elem++;
4069 elem++; /* skip the inter-element NUL */
4070 nelems--;
4071 dd = dest;
4072 if (in_array) {
4073 state = DTRACE_JSON_VALUE;
4074 array_pos = 0;
4075 array_elem = dtrace_strtoll(
4076 elem, 10, size);
4077 found_key = array_elem == 0 ?
4078 B_TRUE : B_FALSE;
4079 } else {
4080 found_key = B_FALSE;
4081 state = DTRACE_JSON_OBJECT;
4082 }
4083 break;
4084 }
4085
4086 /*
4087 * Otherwise, we wish to either skip this
4088 * nested object or return it in full.
4089 */
4090 if (cc == '[')
4091 brackets = 1;
4092 else
4093 braces = 1;
4094 *dd++ = cc;
4095 state = DTRACE_JSON_COLLECT_OBJECT;
4096 break;
4097 }
4098
4099 if (cc == '"') {
4100 state = DTRACE_JSON_STRING;
4101 break;
4102 }
4103
4104 if (islower(cc)) {
4105 /*
4106 * Here we deal with true, false and null.
4107 */
4108 *dd++ = cc;
4109 state = DTRACE_JSON_IDENTIFIER;
4110 break;
4111 }
4112
4113 if (cc == '-' || isdigit(cc)) {
4114 *dd++ = cc;
4115 state = DTRACE_JSON_NUMBER;
4116 break;
4117 }
4118
4119 /*
4120 * ERROR: unexpected character at start of value.
4121 */
4122 return (NULL);
4123 case DTRACE_JSON_COLLECT_OBJECT:
4124 if (cc == '\0')
4125 /*
4126 * ERROR: unexpected end of input.
4127 */
4128 return (NULL);
4129
4130 *dd++ = cc;
4131 if (cc == '"') {
4132 collect_object = B_TRUE;
4133 state = DTRACE_JSON_STRING;
4134 break;
4135 }
4136
4137 if (cc == ']') {
4138 if (brackets-- == 0) {
4139 /*
4140 * ERROR: unbalanced brackets.
4141 */
4142 return (NULL);
4143 }
4144 } else if (cc == '}') {
4145 if (braces-- == 0) {
4146 /*
4147 * ERROR: unbalanced braces.
4148 */
4149 return (NULL);
4150 }
4151 } else if (cc == '{') {
4152 braces++;
4153 } else if (cc == '[') {
4154 brackets++;
4155 }
4156
4157 if (brackets == 0 && braces == 0) {
4158 if (found_key) {
4159 *dd = '\0';
4160 return (dest);
4161 }
4162 dd = dest; /* reset string buffer */
4163 state = DTRACE_JSON_COMMA;
4164 }
4165 break;
4166 }
4167 }
4168 return (NULL);
4169 }
4170
4171 /*
4172 * Emulate the execution of DTrace DIF subroutines invoked by the call opcode.
4173 * Notice that we don't bother validating the proper number of arguments or
4174 * their types in the tuple stack. Such validation isn't needed: our load
4175 * safety makes all argument interpretation safe -- the worst that can
4176 * happen is that a bogus program can obtain bogus results.
4177 */
4178 static void
dtrace_dif_subr(uint_t subr,uint_t rd,uint64_t * regs,dtrace_key_t * tupregs,int nargs,dtrace_mstate_t * mstate,dtrace_state_t * state)4179 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4180 dtrace_key_t *tupregs, int nargs,
4181 dtrace_mstate_t *mstate, dtrace_state_t *state)
4182 {
4183 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4184 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4185 dtrace_vstate_t *vstate = &state->dts_vstate;
4186
4187 #if !defined(__APPLE__)
4188 union {
4189 mutex_impl_t mi;
4190 uint64_t mx;
4191 } m;
4192
4193 union {
4194 krwlock_t ri;
4195 uintptr_t rw;
4196 } r;
4197 #else
4198 /* FIXME: awaits lock/mutex work */
4199 #endif /* __APPLE__ */
4200
4201 switch (subr) {
4202 case DIF_SUBR_RAND:
4203 regs[rd] = dtrace_xoroshiro128_plus_next(
4204 state->dts_rstate[CPU->cpu_id]);
4205 break;
4206
4207 #if !defined(__APPLE__)
4208 case DIF_SUBR_MUTEX_OWNED:
4209 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4210 mstate, vstate)) {
4211 regs[rd] = 0;
4212 break;
4213 }
4214
4215 m.mx = dtrace_load64(tupregs[0].dttk_value);
4216 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4217 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4218 else
4219 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4220 break;
4221
4222 case DIF_SUBR_MUTEX_OWNER:
4223 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4224 mstate, vstate)) {
4225 regs[rd] = 0;
4226 break;
4227 }
4228
4229 m.mx = dtrace_load64(tupregs[0].dttk_value);
4230 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4231 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4232 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4233 else
4234 regs[rd] = 0;
4235 break;
4236
4237 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4238 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4239 mstate, vstate)) {
4240 regs[rd] = 0;
4241 break;
4242 }
4243
4244 m.mx = dtrace_load64(tupregs[0].dttk_value);
4245 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4246 break;
4247
4248 case DIF_SUBR_MUTEX_TYPE_SPIN:
4249 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4250 mstate, vstate)) {
4251 regs[rd] = 0;
4252 break;
4253 }
4254
4255 m.mx = dtrace_load64(tupregs[0].dttk_value);
4256 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4257 break;
4258
4259 case DIF_SUBR_RW_READ_HELD: {
4260 uintptr_t tmp;
4261
4262 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4263 mstate, vstate)) {
4264 regs[rd] = 0;
4265 break;
4266 }
4267
4268 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4269 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4270 break;
4271 }
4272
4273 case DIF_SUBR_RW_WRITE_HELD:
4274 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4275 mstate, vstate)) {
4276 regs[rd] = 0;
4277 break;
4278 }
4279
4280 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4281 regs[rd] = _RW_WRITE_HELD(&r.ri);
4282 break;
4283
4284 case DIF_SUBR_RW_ISWRITER:
4285 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4286 mstate, vstate)) {
4287 regs[rd] = 0;
4288 break;
4289 }
4290
4291 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4292 regs[rd] = _RW_ISWRITER(&r.ri);
4293 break;
4294 #else
4295 /* FIXME: awaits lock/mutex work */
4296 #endif /* __APPLE__ */
4297
4298 case DIF_SUBR_BCOPY: {
4299 /*
4300 * We need to be sure that the destination is in the scratch
4301 * region -- no other region is allowed.
4302 */
4303 uintptr_t src = tupregs[0].dttk_value;
4304 uintptr_t dest = tupregs[1].dttk_value;
4305 size_t size = tupregs[2].dttk_value;
4306
4307 if (!dtrace_inscratch(dest, size, mstate)) {
4308 *flags |= CPU_DTRACE_BADADDR;
4309 *illval = regs[rd];
4310 break;
4311 }
4312
4313 if (!dtrace_canload(src, size, mstate, vstate)) {
4314 regs[rd] = 0;
4315 break;
4316 }
4317
4318 dtrace_bcopy((void *)src, (void *)dest, size);
4319 break;
4320 }
4321
4322 case DIF_SUBR_ALLOCA:
4323 case DIF_SUBR_COPYIN: {
4324 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4325 uint64_t size =
4326 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4327 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4328
4329 /*
4330 * Check whether the user can access kernel memory
4331 */
4332 if (dtrace_priv_kernel(state) == 0) {
4333 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4334 regs[rd] = 0;
4335 break;
4336 }
4337 /*
4338 * This action doesn't require any credential checks since
4339 * probes will not activate in user contexts to which the
4340 * enabling user does not have permissions.
4341 */
4342
4343 /*
4344 * Rounding up the user allocation size could have overflowed
4345 * a large, bogus allocation (like -1ULL) to 0.
4346 */
4347 if (scratch_size < size ||
4348 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4349 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4350 regs[rd] = 0;
4351 break;
4352 }
4353
4354 if (subr == DIF_SUBR_COPYIN) {
4355 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4356 if (dtrace_priv_proc(state))
4357 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4358 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4359 }
4360
4361 mstate->dtms_scratch_ptr += scratch_size;
4362 regs[rd] = dest;
4363 break;
4364 }
4365
4366 case DIF_SUBR_COPYINTO: {
4367 uint64_t size = tupregs[1].dttk_value;
4368 uintptr_t dest = tupregs[2].dttk_value;
4369
4370 /*
4371 * This action doesn't require any credential checks since
4372 * probes will not activate in user contexts to which the
4373 * enabling user does not have permissions.
4374 */
4375 if (!dtrace_inscratch(dest, size, mstate)) {
4376 *flags |= CPU_DTRACE_BADADDR;
4377 *illval = regs[rd];
4378 break;
4379 }
4380
4381 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4382 if (dtrace_priv_proc(state))
4383 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4384 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4385 break;
4386 }
4387
4388 case DIF_SUBR_COPYINSTR: {
4389 uintptr_t dest = mstate->dtms_scratch_ptr;
4390 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4391
4392 if (nargs > 1 && tupregs[1].dttk_value < size)
4393 size = tupregs[1].dttk_value + 1;
4394
4395 /*
4396 * This action doesn't require any credential checks since
4397 * probes will not activate in user contexts to which the
4398 * enabling user does not have permissions.
4399 */
4400 if (!DTRACE_INSCRATCH(mstate, size)) {
4401 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4402 regs[rd] = 0;
4403 break;
4404 }
4405
4406 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4407 if (dtrace_priv_proc(state))
4408 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4409 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4410
4411 ((char *)dest)[size - 1] = '\0';
4412 mstate->dtms_scratch_ptr += size;
4413 regs[rd] = dest;
4414 break;
4415 }
4416
4417 case DIF_SUBR_MSGSIZE:
4418 case DIF_SUBR_MSGDSIZE: {
4419 /* Darwin does not implement SysV streams messages */
4420 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4421 regs[rd] = 0;
4422 break;
4423 }
4424
4425 case DIF_SUBR_PROGENYOF: {
4426 pid_t pid = tupregs[0].dttk_value;
4427 struct proc *p = current_proc();
4428 int rval = 0, lim = nprocs;
4429
4430 while(p && (lim-- > 0)) {
4431 pid_t ppid;
4432
4433 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4434 if (*flags & CPU_DTRACE_FAULT)
4435 break;
4436
4437 if (ppid == pid) {
4438 rval = 1;
4439 break;
4440 }
4441
4442 if (ppid == 0)
4443 break; /* Can't climb process tree any further. */
4444
4445 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4446 #if __has_feature(ptrauth_calls)
4447 p = ptrauth_strip(p, ptrauth_key_process_independent_data);
4448 #endif
4449 if (*flags & CPU_DTRACE_FAULT)
4450 break;
4451 }
4452
4453 regs[rd] = rval;
4454 break;
4455 }
4456
4457 case DIF_SUBR_SPECULATION:
4458 regs[rd] = dtrace_speculation(state);
4459 break;
4460
4461
4462 case DIF_SUBR_COPYOUT: {
4463 uintptr_t kaddr = tupregs[0].dttk_value;
4464 user_addr_t uaddr = tupregs[1].dttk_value;
4465 uint64_t size = tupregs[2].dttk_value;
4466
4467 if (!dtrace_destructive_disallow &&
4468 dtrace_priv_proc_control(state) &&
4469 !dtrace_istoxic(kaddr, size) &&
4470 dtrace_canload(kaddr, size, mstate, vstate)) {
4471 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4472 dtrace_copyout(kaddr, uaddr, size, flags);
4473 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4474 }
4475 break;
4476 }
4477
4478 case DIF_SUBR_COPYOUTSTR: {
4479 uintptr_t kaddr = tupregs[0].dttk_value;
4480 user_addr_t uaddr = tupregs[1].dttk_value;
4481 uint64_t size = tupregs[2].dttk_value;
4482 size_t lim;
4483
4484 if (!dtrace_destructive_disallow &&
4485 dtrace_priv_proc_control(state) &&
4486 !dtrace_istoxic(kaddr, size) &&
4487 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4488 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4489 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4490 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4491 }
4492 break;
4493 }
4494
4495 case DIF_SUBR_STRLEN: {
4496 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4497 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4498 size_t lim;
4499
4500 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4501 regs[rd] = 0;
4502 break;
4503 }
4504
4505 regs[rd] = dtrace_strlen((char *)addr, lim);
4506
4507 break;
4508 }
4509
4510 case DIF_SUBR_STRCHR:
4511 case DIF_SUBR_STRRCHR: {
4512 /*
4513 * We're going to iterate over the string looking for the
4514 * specified character. We will iterate until we have reached
4515 * the string length or we have found the character. If this
4516 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4517 * of the specified character instead of the first.
4518 */
4519 uintptr_t addr = tupregs[0].dttk_value;
4520 uintptr_t addr_limit;
4521 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4522 size_t lim;
4523 char c, target = (char)tupregs[1].dttk_value;
4524
4525 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4526 regs[rd] = 0;
4527 break;
4528 }
4529 addr_limit = addr + lim;
4530
4531 for (regs[rd] = 0; addr < addr_limit; addr++) {
4532 if ((c = dtrace_load8(addr)) == target) {
4533 regs[rd] = addr;
4534
4535 if (subr == DIF_SUBR_STRCHR)
4536 break;
4537 }
4538
4539 if (c == '\0')
4540 break;
4541 }
4542
4543 break;
4544 }
4545
4546 case DIF_SUBR_STRSTR:
4547 case DIF_SUBR_INDEX:
4548 case DIF_SUBR_RINDEX: {
4549 /*
4550 * We're going to iterate over the string looking for the
4551 * specified string. We will iterate until we have reached
4552 * the string length or we have found the string. (Yes, this
4553 * is done in the most naive way possible -- but considering
4554 * that the string we're searching for is likely to be
4555 * relatively short, the complexity of Rabin-Karp or similar
4556 * hardly seems merited.)
4557 */
4558 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4559 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4560 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4561 size_t len = dtrace_strlen(addr, size);
4562 size_t sublen = dtrace_strlen(substr, size);
4563 char *limit = addr + len, *orig = addr;
4564 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4565 int inc = 1;
4566
4567 regs[rd] = notfound;
4568
4569 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4570 regs[rd] = 0;
4571 break;
4572 }
4573
4574 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4575 vstate)) {
4576 regs[rd] = 0;
4577 break;
4578 }
4579
4580 /*
4581 * strstr() and index()/rindex() have similar semantics if
4582 * both strings are the empty string: strstr() returns a
4583 * pointer to the (empty) string, and index() and rindex()
4584 * both return index 0 (regardless of any position argument).
4585 */
4586 if (sublen == 0 && len == 0) {
4587 if (subr == DIF_SUBR_STRSTR)
4588 regs[rd] = (uintptr_t)addr;
4589 else
4590 regs[rd] = 0;
4591 break;
4592 }
4593
4594 if (subr != DIF_SUBR_STRSTR) {
4595 if (subr == DIF_SUBR_RINDEX) {
4596 limit = orig - 1;
4597 addr += len;
4598 inc = -1;
4599 }
4600
4601 /*
4602 * Both index() and rindex() take an optional position
4603 * argument that denotes the starting position.
4604 */
4605 if (nargs == 3) {
4606 int64_t pos = (int64_t)tupregs[2].dttk_value;
4607
4608 /*
4609 * If the position argument to index() is
4610 * negative, Perl implicitly clamps it at
4611 * zero. This semantic is a little surprising
4612 * given the special meaning of negative
4613 * positions to similar Perl functions like
4614 * substr(), but it appears to reflect a
4615 * notion that index() can start from a
4616 * negative index and increment its way up to
4617 * the string. Given this notion, Perl's
4618 * rindex() is at least self-consistent in
4619 * that it implicitly clamps positions greater
4620 * than the string length to be the string
4621 * length. Where Perl completely loses
4622 * coherence, however, is when the specified
4623 * substring is the empty string (""). In
4624 * this case, even if the position is
4625 * negative, rindex() returns 0 -- and even if
4626 * the position is greater than the length,
4627 * index() returns the string length. These
4628 * semantics violate the notion that index()
4629 * should never return a value less than the
4630 * specified position and that rindex() should
4631 * never return a value greater than the
4632 * specified position. (One assumes that
4633 * these semantics are artifacts of Perl's
4634 * implementation and not the results of
4635 * deliberate design -- it beggars belief that
4636 * even Larry Wall could desire such oddness.)
4637 * While in the abstract one would wish for
4638 * consistent position semantics across
4639 * substr(), index() and rindex() -- or at the
4640 * very least self-consistent position
4641 * semantics for index() and rindex() -- we
4642 * instead opt to keep with the extant Perl
4643 * semantics, in all their broken glory. (Do
4644 * we have more desire to maintain Perl's
4645 * semantics than Perl does? Probably.)
4646 */
4647 if (subr == DIF_SUBR_RINDEX) {
4648 if (pos < 0) {
4649 if (sublen == 0)
4650 regs[rd] = 0;
4651 break;
4652 }
4653
4654 if ((size_t)pos > len)
4655 pos = len;
4656 } else {
4657 if (pos < 0)
4658 pos = 0;
4659
4660 if ((size_t)pos >= len) {
4661 if (sublen == 0)
4662 regs[rd] = len;
4663 break;
4664 }
4665 }
4666
4667 addr = orig + pos;
4668 }
4669 }
4670
4671 for (regs[rd] = notfound; addr != limit; addr += inc) {
4672 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4673 if (subr != DIF_SUBR_STRSTR) {
4674 /*
4675 * As D index() and rindex() are
4676 * modeled on Perl (and not on awk),
4677 * we return a zero-based (and not a
4678 * one-based) index. (For you Perl
4679 * weenies: no, we're not going to add
4680 * $[ -- and shouldn't you be at a con
4681 * or something?)
4682 */
4683 regs[rd] = (uintptr_t)(addr - orig);
4684 break;
4685 }
4686
4687 ASSERT(subr == DIF_SUBR_STRSTR);
4688 regs[rd] = (uintptr_t)addr;
4689 break;
4690 }
4691 }
4692
4693 break;
4694 }
4695
4696 case DIF_SUBR_STRTOK: {
4697 uintptr_t addr = tupregs[0].dttk_value;
4698 uintptr_t tokaddr = tupregs[1].dttk_value;
4699 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4700 uintptr_t limit, toklimit;
4701 size_t clim;
4702 char *dest = (char *)mstate->dtms_scratch_ptr;
4703 uint8_t c='\0', tokmap[32]; /* 256 / 8 */
4704 uint64_t i = 0;
4705
4706 /*
4707 * Check both the token buffer and (later) the input buffer,
4708 * since both could be non-scratch addresses.
4709 */
4710 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4711 regs[rd] = 0;
4712 break;
4713 }
4714 toklimit = tokaddr + clim;
4715
4716 if (!DTRACE_INSCRATCH(mstate, size)) {
4717 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4718 regs[rd] = 0;
4719 break;
4720 }
4721
4722 if (addr == 0) {
4723 /*
4724 * If the address specified is NULL, we use our saved
4725 * strtok pointer from the mstate. Note that this
4726 * means that the saved strtok pointer is _only_
4727 * valid within multiple enablings of the same probe --
4728 * it behaves like an implicit clause-local variable.
4729 */
4730 addr = mstate->dtms_strtok;
4731 limit = mstate->dtms_strtok_limit;
4732 } else {
4733 /*
4734 * If the user-specified address is non-NULL we must
4735 * access check it. This is the only time we have
4736 * a chance to do so, since this address may reside
4737 * in the string table of this clause-- future calls
4738 * (when we fetch addr from mstate->dtms_strtok)
4739 * would fail this access check.
4740 */
4741 if (!dtrace_strcanload(addr, size, &clim, mstate,
4742 vstate)) {
4743 regs[rd] = 0;
4744 break;
4745 }
4746 limit = addr + clim;
4747 }
4748
4749 /*
4750 * First, zero the token map, and then process the token
4751 * string -- setting a bit in the map for every character
4752 * found in the token string.
4753 */
4754 for (i = 0; i < (int)sizeof (tokmap); i++)
4755 tokmap[i] = 0;
4756
4757 for (; tokaddr < toklimit; tokaddr++) {
4758 if ((c = dtrace_load8(tokaddr)) == '\0')
4759 break;
4760
4761 ASSERT((c >> 3) < sizeof (tokmap));
4762 tokmap[c >> 3] |= (1 << (c & 0x7));
4763 }
4764
4765 for (; addr < limit; addr++) {
4766 /*
4767 * We're looking for a character that is _not_
4768 * contained in the token string.
4769 */
4770 if ((c = dtrace_load8(addr)) == '\0')
4771 break;
4772
4773 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4774 break;
4775 }
4776
4777 if (c == '\0') {
4778 /*
4779 * We reached the end of the string without finding
4780 * any character that was not in the token string.
4781 * We return NULL in this case, and we set the saved
4782 * address to NULL as well.
4783 */
4784 regs[rd] = 0;
4785 mstate->dtms_strtok = 0;
4786 mstate->dtms_strtok_limit = 0;
4787 break;
4788 }
4789
4790 /*
4791 * From here on, we're copying into the destination string.
4792 */
4793 for (i = 0; addr < limit && i < size - 1; addr++) {
4794 if ((c = dtrace_load8(addr)) == '\0')
4795 break;
4796
4797 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4798 break;
4799
4800 ASSERT(i < size);
4801 dest[i++] = c;
4802 }
4803
4804 ASSERT(i < size);
4805 dest[i] = '\0';
4806 regs[rd] = (uintptr_t)dest;
4807 mstate->dtms_scratch_ptr += size;
4808 mstate->dtms_strtok = addr;
4809 mstate->dtms_strtok_limit = limit;
4810 break;
4811 }
4812
4813 case DIF_SUBR_SUBSTR: {
4814 uintptr_t s = tupregs[0].dttk_value;
4815 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4816 char *d = (char *)mstate->dtms_scratch_ptr;
4817 int64_t index = (int64_t)tupregs[1].dttk_value;
4818 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4819 size_t len = dtrace_strlen((char *)s, size);
4820 int64_t i = 0;
4821
4822 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4823 regs[rd] = 0;
4824 break;
4825 }
4826
4827 if (!DTRACE_INSCRATCH(mstate, size)) {
4828 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4829 regs[rd] = 0;
4830 break;
4831 }
4832
4833 if (nargs <= 2)
4834 remaining = (int64_t)size;
4835
4836 if (index < 0) {
4837 index += len;
4838
4839 if (index < 0 && index + remaining > 0) {
4840 remaining += index;
4841 index = 0;
4842 }
4843 }
4844
4845 if ((size_t)index >= len || index < 0) {
4846 remaining = 0;
4847 } else if (remaining < 0) {
4848 remaining += len - index;
4849 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4850 remaining = size - index;
4851 }
4852
4853 for (i = 0; i < remaining; i++) {
4854 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4855 break;
4856 }
4857
4858 d[i] = '\0';
4859
4860 mstate->dtms_scratch_ptr += size;
4861 regs[rd] = (uintptr_t)d;
4862 break;
4863 }
4864
4865 case DIF_SUBR_GETMAJOR:
4866 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4867 break;
4868
4869 case DIF_SUBR_GETMINOR:
4870 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4871 break;
4872
4873 case DIF_SUBR_DDI_PATHNAME: {
4874 /* APPLE NOTE: currently unsupported on Darwin */
4875 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4876 regs[rd] = 0;
4877 break;
4878 }
4879
4880 case DIF_SUBR_STRJOIN: {
4881 char *d = (char *)mstate->dtms_scratch_ptr;
4882 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4883 uintptr_t s1 = tupregs[0].dttk_value;
4884 uintptr_t s2 = tupregs[1].dttk_value;
4885 uint64_t i = 0, j = 0;
4886 size_t lim1, lim2;
4887 char c;
4888
4889 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4890 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4891 regs[rd] = 0;
4892 break;
4893 }
4894
4895 if (!DTRACE_INSCRATCH(mstate, size)) {
4896 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4897 regs[rd] = 0;
4898 break;
4899 }
4900
4901 for (;;) {
4902 if (i >= size) {
4903 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4904 regs[rd] = 0;
4905 break;
4906 }
4907 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4908 if ((d[i++] = c) == '\0') {
4909 i--;
4910 break;
4911 }
4912 }
4913
4914 for (;;) {
4915 if (i >= size) {
4916 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4917 regs[rd] = 0;
4918 break;
4919 }
4920 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4921 if ((d[i++] = c) == '\0')
4922 break;
4923 }
4924
4925 if (i < size) {
4926 mstate->dtms_scratch_ptr += i;
4927 regs[rd] = (uintptr_t)d;
4928 }
4929
4930 break;
4931 }
4932
4933 case DIF_SUBR_STRTOLL: {
4934 uintptr_t s = tupregs[0].dttk_value;
4935 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4936 size_t lim;
4937 int base = 10;
4938
4939 if (nargs > 1) {
4940 if ((base = tupregs[1].dttk_value) <= 1 ||
4941 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4942 *flags |= CPU_DTRACE_ILLOP;
4943 break;
4944 }
4945 }
4946
4947 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
4948 regs[rd] = INT64_MIN;
4949 break;
4950 }
4951
4952 regs[rd] = dtrace_strtoll((char *)s, base, lim);
4953 break;
4954 }
4955
4956 case DIF_SUBR_LLTOSTR: {
4957 int64_t i = (int64_t)tupregs[0].dttk_value;
4958 uint64_t val, digit;
4959 uint64_t size = 65; /* enough room for 2^64 in binary */
4960 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4961 int base = 10;
4962
4963 if (nargs > 1) {
4964 if ((base = tupregs[1].dttk_value) <= 1 ||
4965 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4966 *flags |= CPU_DTRACE_ILLOP;
4967 break;
4968 }
4969 }
4970
4971 val = (base == 10 && i < 0) ? i * -1 : i;
4972
4973 if (!DTRACE_INSCRATCH(mstate, size)) {
4974 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4975 regs[rd] = 0;
4976 break;
4977 }
4978
4979 for (*end-- = '\0'; val; val /= base) {
4980 if ((digit = val % base) <= '9' - '0') {
4981 *end-- = '0' + digit;
4982 } else {
4983 *end-- = 'a' + (digit - ('9' - '0') - 1);
4984 }
4985 }
4986
4987 if (i == 0 && base == 16)
4988 *end-- = '0';
4989
4990 if (base == 16)
4991 *end-- = 'x';
4992
4993 if (i == 0 || base == 8 || base == 16)
4994 *end-- = '0';
4995
4996 if (i < 0 && base == 10)
4997 *end-- = '-';
4998
4999 regs[rd] = (uintptr_t)end + 1;
5000 mstate->dtms_scratch_ptr += size;
5001 break;
5002 }
5003
5004 case DIF_SUBR_HTONS:
5005 case DIF_SUBR_NTOHS:
5006 #ifdef _BIG_ENDIAN
5007 regs[rd] = (uint16_t)tupregs[0].dttk_value;
5008 #else
5009 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5010 #endif
5011 break;
5012
5013
5014 case DIF_SUBR_HTONL:
5015 case DIF_SUBR_NTOHL:
5016 #ifdef _BIG_ENDIAN
5017 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5018 #else
5019 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5020 #endif
5021 break;
5022
5023
5024 case DIF_SUBR_HTONLL:
5025 case DIF_SUBR_NTOHLL:
5026 #ifdef _BIG_ENDIAN
5027 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5028 #else
5029 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5030 #endif
5031 break;
5032
5033
5034 case DIF_SUBR_DIRNAME:
5035 case DIF_SUBR_BASENAME: {
5036 char *dest = (char *)mstate->dtms_scratch_ptr;
5037 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5038 uintptr_t src = tupregs[0].dttk_value;
5039 int i, j, len = dtrace_strlen((char *)src, size);
5040 int lastbase = -1, firstbase = -1, lastdir = -1;
5041 int start, end;
5042
5043 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5044 regs[rd] = 0;
5045 break;
5046 }
5047
5048 if (!DTRACE_INSCRATCH(mstate, size)) {
5049 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5050 regs[rd] = 0;
5051 break;
5052 }
5053
5054 /*
5055 * The basename and dirname for a zero-length string is
5056 * defined to be "."
5057 */
5058 if (len == 0) {
5059 len = 1;
5060 src = (uintptr_t)".";
5061 }
5062
5063 /*
5064 * Start from the back of the string, moving back toward the
5065 * front until we see a character that isn't a slash. That
5066 * character is the last character in the basename.
5067 */
5068 for (i = len - 1; i >= 0; i--) {
5069 if (dtrace_load8(src + i) != '/')
5070 break;
5071 }
5072
5073 if (i >= 0)
5074 lastbase = i;
5075
5076 /*
5077 * Starting from the last character in the basename, move
5078 * towards the front until we find a slash. The character
5079 * that we processed immediately before that is the first
5080 * character in the basename.
5081 */
5082 for (; i >= 0; i--) {
5083 if (dtrace_load8(src + i) == '/')
5084 break;
5085 }
5086
5087 if (i >= 0)
5088 firstbase = i + 1;
5089
5090 /*
5091 * Now keep going until we find a non-slash character. That
5092 * character is the last character in the dirname.
5093 */
5094 for (; i >= 0; i--) {
5095 if (dtrace_load8(src + i) != '/')
5096 break;
5097 }
5098
5099 if (i >= 0)
5100 lastdir = i;
5101
5102 ASSERT(!(lastbase == -1 && firstbase != -1));
5103 ASSERT(!(firstbase == -1 && lastdir != -1));
5104
5105 if (lastbase == -1) {
5106 /*
5107 * We didn't find a non-slash character. We know that
5108 * the length is non-zero, so the whole string must be
5109 * slashes. In either the dirname or the basename
5110 * case, we return '/'.
5111 */
5112 ASSERT(firstbase == -1);
5113 firstbase = lastbase = lastdir = 0;
5114 }
5115
5116 if (firstbase == -1) {
5117 /*
5118 * The entire string consists only of a basename
5119 * component. If we're looking for dirname, we need
5120 * to change our string to be just "."; if we're
5121 * looking for a basename, we'll just set the first
5122 * character of the basename to be 0.
5123 */
5124 if (subr == DIF_SUBR_DIRNAME) {
5125 ASSERT(lastdir == -1);
5126 src = (uintptr_t)".";
5127 lastdir = 0;
5128 } else {
5129 firstbase = 0;
5130 }
5131 }
5132
5133 if (subr == DIF_SUBR_DIRNAME) {
5134 if (lastdir == -1) {
5135 /*
5136 * We know that we have a slash in the name --
5137 * or lastdir would be set to 0, above. And
5138 * because lastdir is -1, we know that this
5139 * slash must be the first character. (That
5140 * is, the full string must be of the form
5141 * "/basename".) In this case, the last
5142 * character of the directory name is 0.
5143 */
5144 lastdir = 0;
5145 }
5146
5147 start = 0;
5148 end = lastdir;
5149 } else {
5150 ASSERT(subr == DIF_SUBR_BASENAME);
5151 ASSERT(firstbase != -1 && lastbase != -1);
5152 start = firstbase;
5153 end = lastbase;
5154 }
5155
5156 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5157 dest[j] = dtrace_load8(src + i);
5158
5159 dest[j] = '\0';
5160 regs[rd] = (uintptr_t)dest;
5161 mstate->dtms_scratch_ptr += size;
5162 break;
5163 }
5164
5165 case DIF_SUBR_CLEANPATH: {
5166 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5167 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5168 uintptr_t src = tupregs[0].dttk_value;
5169 size_t lim;
5170 size_t i = 0, j = 0;
5171
5172 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5173 regs[rd] = 0;
5174 break;
5175 }
5176
5177 if (!DTRACE_INSCRATCH(mstate, size)) {
5178 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5179 regs[rd] = 0;
5180 break;
5181 }
5182
5183 /*
5184 * Move forward, loading each character.
5185 */
5186 do {
5187 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5188 next:
5189 if ((uint64_t)(j + 5) >= size) /* 5 = strlen("/..c\0") */
5190 break;
5191
5192 if (c != '/') {
5193 dest[j++] = c;
5194 continue;
5195 }
5196
5197 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5198
5199 if (c == '/') {
5200 /*
5201 * We have two slashes -- we can just advance
5202 * to the next character.
5203 */
5204 goto next;
5205 }
5206
5207 if (c != '.') {
5208 /*
5209 * This is not "." and it's not ".." -- we can
5210 * just store the "/" and this character and
5211 * drive on.
5212 */
5213 dest[j++] = '/';
5214 dest[j++] = c;
5215 continue;
5216 }
5217
5218 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5219
5220 if (c == '/') {
5221 /*
5222 * This is a "/./" component. We're not going
5223 * to store anything in the destination buffer;
5224 * we're just going to go to the next component.
5225 */
5226 goto next;
5227 }
5228
5229 if (c != '.') {
5230 /*
5231 * This is not ".." -- we can just store the
5232 * "/." and this character and continue
5233 * processing.
5234 */
5235 dest[j++] = '/';
5236 dest[j++] = '.';
5237 dest[j++] = c;
5238 continue;
5239 }
5240
5241 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5242
5243 if (c != '/' && c != '\0') {
5244 /*
5245 * This is not ".." -- it's "..[mumble]".
5246 * We'll store the "/.." and this character
5247 * and continue processing.
5248 */
5249 dest[j++] = '/';
5250 dest[j++] = '.';
5251 dest[j++] = '.';
5252 dest[j++] = c;
5253 continue;
5254 }
5255
5256 /*
5257 * This is "/../" or "/..\0". We need to back up
5258 * our destination pointer until we find a "/".
5259 */
5260 i--;
5261 while (j != 0 && dest[--j] != '/')
5262 continue;
5263
5264 if (c == '\0')
5265 dest[++j] = '/';
5266 } while (c != '\0');
5267
5268 dest[j] = '\0';
5269 regs[rd] = (uintptr_t)dest;
5270 mstate->dtms_scratch_ptr += size;
5271 break;
5272 }
5273
5274 case DIF_SUBR_INET_NTOA:
5275 case DIF_SUBR_INET_NTOA6:
5276 case DIF_SUBR_INET_NTOP: {
5277 size_t size;
5278 int af, argi, i;
5279 char *base, *end;
5280
5281 if (subr == DIF_SUBR_INET_NTOP) {
5282 af = (int)tupregs[0].dttk_value;
5283 argi = 1;
5284 } else {
5285 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5286 argi = 0;
5287 }
5288
5289 if (af == AF_INET) {
5290 #if !defined(__APPLE__)
5291 ipaddr_t ip4;
5292 #else
5293 uint32_t ip4;
5294 #endif /* __APPLE__ */
5295 uint8_t *ptr8, val;
5296
5297 /*
5298 * Safely load the IPv4 address.
5299 */
5300 #if !defined(__APPLE__)
5301 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5302 #else
5303 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5304 mstate, vstate)) {
5305 regs[rd] = 0;
5306 break;
5307 }
5308
5309 dtrace_bcopy(
5310 (void *)(uintptr_t)tupregs[argi].dttk_value,
5311 (void *)(uintptr_t)&ip4, sizeof (ip4));
5312 #endif /* __APPLE__ */
5313 /*
5314 * Check an IPv4 string will fit in scratch.
5315 */
5316 #if !defined(__APPLE__)
5317 size = INET_ADDRSTRLEN;
5318 #else
5319 size = MAX_IPv4_STR_LEN;
5320 #endif /* __APPLE__ */
5321 if (!DTRACE_INSCRATCH(mstate, size)) {
5322 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5323 regs[rd] = 0;
5324 break;
5325 }
5326 base = (char *)mstate->dtms_scratch_ptr;
5327 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5328
5329 /*
5330 * Stringify as a dotted decimal quad.
5331 */
5332 *end-- = '\0';
5333 ptr8 = (uint8_t *)&ip4;
5334 for (i = 3; i >= 0; i--) {
5335 val = ptr8[i];
5336
5337 if (val == 0) {
5338 *end-- = '0';
5339 } else {
5340 for (; val; val /= 10) {
5341 *end-- = '0' + (val % 10);
5342 }
5343 }
5344
5345 if (i > 0)
5346 *end-- = '.';
5347 }
5348 ASSERT(end + 1 >= base);
5349
5350 } else if (af == AF_INET6) {
5351 #if defined(__APPLE__)
5352 #define _S6_un __u6_addr
5353 #define _S6_u8 __u6_addr8
5354 #endif /* __APPLE__ */
5355 struct in6_addr ip6;
5356 int firstzero, tryzero, numzero, v6end;
5357 uint16_t val;
5358 const char digits[] = "0123456789abcdef";
5359
5360 /*
5361 * Stringify using RFC 1884 convention 2 - 16 bit
5362 * hexadecimal values with a zero-run compression.
5363 * Lower case hexadecimal digits are used.
5364 * eg, fe80::214:4fff:fe0b:76c8.
5365 * The IPv4 embedded form is returned for inet_ntop,
5366 * just the IPv4 string is returned for inet_ntoa6.
5367 */
5368
5369 if (!dtrace_canload(tupregs[argi].dttk_value,
5370 sizeof(struct in6_addr), mstate, vstate)) {
5371 regs[rd] = 0;
5372 break;
5373 }
5374
5375 /*
5376 * Safely load the IPv6 address.
5377 */
5378 dtrace_bcopy(
5379 (void *)(uintptr_t)tupregs[argi].dttk_value,
5380 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5381
5382 /*
5383 * Check an IPv6 string will fit in scratch.
5384 */
5385 size = INET6_ADDRSTRLEN;
5386 if (!DTRACE_INSCRATCH(mstate, size)) {
5387 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5388 regs[rd] = 0;
5389 break;
5390 }
5391 base = (char *)mstate->dtms_scratch_ptr;
5392 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5393 *end-- = '\0';
5394
5395 /*
5396 * Find the longest run of 16 bit zero values
5397 * for the single allowed zero compression - "::".
5398 */
5399 firstzero = -1;
5400 tryzero = -1;
5401 numzero = 1;
5402 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5403 if (ip6._S6_un._S6_u8[i] == 0 &&
5404 tryzero == -1 && i % 2 == 0) {
5405 tryzero = i;
5406 continue;
5407 }
5408
5409 if (tryzero != -1 &&
5410 (ip6._S6_un._S6_u8[i] != 0 ||
5411 i == sizeof (struct in6_addr) - 1)) {
5412
5413 if (i - tryzero <= numzero) {
5414 tryzero = -1;
5415 continue;
5416 }
5417
5418 firstzero = tryzero;
5419 numzero = i - i % 2 - tryzero;
5420 tryzero = -1;
5421
5422 if (ip6._S6_un._S6_u8[i] == 0 &&
5423 i == sizeof (struct in6_addr) - 1)
5424 numzero += 2;
5425 }
5426 }
5427 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5428
5429 /*
5430 * Check for an IPv4 embedded address.
5431 */
5432 v6end = sizeof (struct in6_addr) - 2;
5433 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5434 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5435 for (i = sizeof (struct in6_addr) - 1;
5436 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5437 ASSERT(end >= base);
5438
5439 val = ip6._S6_un._S6_u8[i];
5440
5441 if (val == 0) {
5442 *end-- = '0';
5443 } else {
5444 for (; val; val /= 10) {
5445 *end-- = '0' + val % 10;
5446 }
5447 }
5448
5449 if (i > (int)DTRACE_V4MAPPED_OFFSET)
5450 *end-- = '.';
5451 }
5452
5453 if (subr == DIF_SUBR_INET_NTOA6)
5454 goto inetout;
5455
5456 /*
5457 * Set v6end to skip the IPv4 address that
5458 * we have already stringified.
5459 */
5460 v6end = 10;
5461 }
5462
5463 /*
5464 * Build the IPv6 string by working through the
5465 * address in reverse.
5466 */
5467 for (i = v6end; i >= 0; i -= 2) {
5468 ASSERT(end >= base);
5469
5470 if (i == firstzero + numzero - 2) {
5471 *end-- = ':';
5472 *end-- = ':';
5473 i -= numzero - 2;
5474 continue;
5475 }
5476
5477 if (i < 14 && i != firstzero - 2)
5478 *end-- = ':';
5479
5480 val = (ip6._S6_un._S6_u8[i] << 8) +
5481 ip6._S6_un._S6_u8[i + 1];
5482
5483 if (val == 0) {
5484 *end-- = '0';
5485 } else {
5486 for (; val; val /= 16) {
5487 *end-- = digits[val % 16];
5488 }
5489 }
5490 }
5491 ASSERT(end + 1 >= base);
5492
5493 #if defined(__APPLE__)
5494 #undef _S6_un
5495 #undef _S6_u8
5496 #endif /* __APPLE__ */
5497 } else {
5498 /*
5499 * The user didn't use AF_INET or AF_INET6.
5500 */
5501 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5502 regs[rd] = 0;
5503 break;
5504 }
5505
5506 inetout: regs[rd] = (uintptr_t)end + 1;
5507 mstate->dtms_scratch_ptr += size;
5508 break;
5509 }
5510
5511 case DIF_SUBR_JSON: {
5512 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5513 uintptr_t json = tupregs[0].dttk_value;
5514 size_t jsonlen = dtrace_strlen((char *)json, size);
5515 uintptr_t elem = tupregs[1].dttk_value;
5516 size_t elemlen = dtrace_strlen((char *)elem, size);
5517
5518 char *dest = (char *)mstate->dtms_scratch_ptr;
5519 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5520 char *ee = elemlist;
5521 int nelems = 1;
5522 uintptr_t cur;
5523
5524 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5525 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5526 regs[rd] = 0;
5527 break;
5528 }
5529
5530 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5531 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5532 regs[rd] = 0;
5533 break;
5534 }
5535
5536 /*
5537 * Read the element selector and split it up into a packed list
5538 * of strings.
5539 */
5540 for (cur = elem; cur < elem + elemlen; cur++) {
5541 char cc = dtrace_load8(cur);
5542
5543 if (cur == elem && cc == '[') {
5544 /*
5545 * If the first element selector key is
5546 * actually an array index then ignore the
5547 * bracket.
5548 */
5549 continue;
5550 }
5551
5552 if (cc == ']')
5553 continue;
5554
5555 if (cc == '.' || cc == '[') {
5556 nelems++;
5557 cc = '\0';
5558 }
5559
5560 *ee++ = cc;
5561 }
5562 *ee++ = '\0';
5563
5564 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5565 nelems, dest)) != 0)
5566 mstate->dtms_scratch_ptr += jsonlen + 1;
5567 break;
5568 }
5569
5570 case DIF_SUBR_TOUPPER:
5571 case DIF_SUBR_TOLOWER: {
5572 uintptr_t src = tupregs[0].dttk_value;
5573 char *dest = (char *)mstate->dtms_scratch_ptr;
5574 char lower, upper, base, c;
5575 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5576 size_t len = dtrace_strlen((char*) src, size);
5577 size_t i = 0;
5578
5579 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5580 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5581 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5582
5583 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5584 regs[rd] = 0;
5585 break;
5586 }
5587
5588 if (!DTRACE_INSCRATCH(mstate, size)) {
5589 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5590 regs[rd] = 0;
5591 break;
5592 }
5593
5594 for (i = 0; i < size - 1; ++i) {
5595 if ((c = dtrace_load8(src + i)) == '\0')
5596 break;
5597 if (c >= lower && c <= upper)
5598 c = base + (c - lower);
5599 dest[i] = c;
5600 }
5601
5602 ASSERT(i < size);
5603
5604 dest[i] = '\0';
5605 regs[rd] = (uintptr_t) dest;
5606 mstate->dtms_scratch_ptr += size;
5607
5608 break;
5609 }
5610
5611 case DIF_SUBR_STRIP:
5612 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5613 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5614 break;
5615 }
5616 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5617 (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5618 break;
5619
5620 #if defined(__APPLE__)
5621 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5622 if (!dtrace_priv_kernel(state)) {
5623 regs[rd] = 0;
5624 } else {
5625 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5626 }
5627
5628 break;
5629 }
5630
5631 case DIF_SUBR_KDEBUG_TRACE: {
5632 uint32_t debugid;
5633 uintptr_t args[4] = {0};
5634 int i;
5635
5636 if (nargs < 2 || nargs > 5) {
5637 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5638 break;
5639 }
5640
5641 if (dtrace_destructive_disallow ||
5642 !dtrace_priv_kernel_destructive(state)) {
5643 return;
5644 }
5645
5646 debugid = tupregs[0].dttk_value;
5647 for (i = 0; i < nargs - 1; i++)
5648 args[i] = tupregs[i + 1].dttk_value;
5649
5650 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5651
5652 break;
5653 }
5654
5655 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5656 if (nargs != 3) {
5657 break;
5658 }
5659
5660 if (dtrace_destructive_disallow ||
5661 !dtrace_priv_kernel_destructive(state)) {
5662 return;
5663 }
5664
5665 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5666 uint32_t debugid = tupregs[0].dttk_value;
5667 uint64_t str_id = tupregs[1].dttk_value;
5668 uintptr_t src = tupregs[2].dttk_value;
5669 size_t lim;
5670 char buf[size];
5671 char* str = NULL;
5672
5673 if (src != (uintptr_t)0) {
5674 str = buf;
5675 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5676 break;
5677 }
5678 dtrace_strcpy((void*)src, buf, size);
5679 }
5680
5681 (void)kernel_debug_string(debugid, &str_id, str);
5682 regs[rd] = str_id;
5683
5684 break;
5685 }
5686
5687 case DIF_SUBR_MTONS:
5688 absolutetime_to_nanoseconds(tupregs[0].dttk_value, ®s[rd]);
5689
5690 break;
5691 case DIF_SUBR_PHYSMEM_READ: {
5692 #if DEBUG || DEVELOPMENT
5693 if (dtrace_destructive_disallow ||
5694 !dtrace_priv_kernel_destructive(state)) {
5695 return;
5696 }
5697 regs[rd] = dtrace_physmem_read(tupregs[0].dttk_value,
5698 tupregs[1].dttk_value);
5699 #else
5700 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5701 #endif /* DEBUG || DEVELOPMENT */
5702 break;
5703 }
5704 case DIF_SUBR_PHYSMEM_WRITE: {
5705 #if DEBUG || DEVELOPMENT
5706 if (dtrace_destructive_disallow ||
5707 !dtrace_priv_kernel_destructive(state)) {
5708 return;
5709 }
5710
5711 dtrace_physmem_write(tupregs[0].dttk_value,
5712 tupregs[1].dttk_value, (size_t)tupregs[2].dttk_value);
5713 #else
5714 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5715 #endif /* DEBUG || DEVELOPMENT */
5716 break;
5717 }
5718
5719 case DIF_SUBR_KVTOPHYS: {
5720 #if DEBUG || DEVELOPMENT
5721 regs[rd] = kvtophys(tupregs[0].dttk_value);
5722 #else
5723 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5724 #endif /* DEBUG || DEVELOPMENT */
5725 break;
5726 }
5727
5728 case DIF_SUBR_LIVEDUMP: {
5729 #if DEBUG || DEVELOPMENT
5730 if (dtrace_destructive_disallow ||
5731 !dtrace_priv_kernel_destructive(state)) {
5732 break;
5733 }
5734
5735 /* For the moment, there is only one type of livedump. */
5736 if (nargs != 1 || tupregs[0].dttk_value != 0) {
5737 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5738 break;
5739 }
5740
5741 char *dest = (char *)mstate->dtms_scratch_ptr;
5742 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5743
5744 if (!DTRACE_INSCRATCH(mstate, size)) {
5745 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5746 regs[rd] = 0;
5747 break;
5748 }
5749
5750 dtrace_livedump(dest, size);
5751 regs[rd] = (uintptr_t) dest;
5752 mstate->dtms_scratch_ptr += strlen(dest) + 1;
5753 #else
5754 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5755 #endif /* DEBUG || DEVELOPMENT */
5756 break;
5757 }
5758 #endif /* defined(__APPLE__) */
5759
5760 }
5761 }
5762
5763 /*
5764 * Emulate the execution of DTrace IR instructions specified by the given
5765 * DIF object. This function is deliberately void of assertions as all of
5766 * the necessary checks are handled by a call to dtrace_difo_validate().
5767 */
5768 static uint64_t
dtrace_dif_emulate(dtrace_difo_t * difo,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate,dtrace_state_t * state)5769 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5770 dtrace_vstate_t *vstate, dtrace_state_t *state)
5771 {
5772 const dif_instr_t *text = difo->dtdo_buf;
5773 const uint_t textlen = difo->dtdo_len;
5774 const char *strtab = difo->dtdo_strtab;
5775 const uint64_t *inttab = difo->dtdo_inttab;
5776
5777 uint64_t rval = 0;
5778 dtrace_statvar_t *svar;
5779 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5780 dtrace_difv_t *v;
5781 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5782 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5783
5784 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5785 uint64_t regs[DIF_DIR_NREGS];
5786 uint64_t *tmp;
5787
5788 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5789 int64_t cc_r;
5790 uint_t pc = 0, id, opc = 0;
5791 uint8_t ttop = 0;
5792 dif_instr_t instr;
5793 uint_t r1, r2, rd;
5794
5795 /*
5796 * We stash the current DIF object into the machine state: we need it
5797 * for subsequent access checking.
5798 */
5799 mstate->dtms_difo = difo;
5800
5801 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5802
5803 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5804 opc = pc;
5805
5806 instr = text[pc++];
5807 r1 = DIF_INSTR_R1(instr);
5808 r2 = DIF_INSTR_R2(instr);
5809 rd = DIF_INSTR_RD(instr);
5810
5811 switch (DIF_INSTR_OP(instr)) {
5812 case DIF_OP_OR:
5813 regs[rd] = regs[r1] | regs[r2];
5814 break;
5815 case DIF_OP_XOR:
5816 regs[rd] = regs[r1] ^ regs[r2];
5817 break;
5818 case DIF_OP_AND:
5819 regs[rd] = regs[r1] & regs[r2];
5820 break;
5821 case DIF_OP_SLL:
5822 regs[rd] = regs[r1] << regs[r2];
5823 break;
5824 case DIF_OP_SRL:
5825 regs[rd] = regs[r1] >> regs[r2];
5826 break;
5827 case DIF_OP_SUB:
5828 regs[rd] = regs[r1] - regs[r2];
5829 break;
5830 case DIF_OP_ADD:
5831 regs[rd] = regs[r1] + regs[r2];
5832 break;
5833 case DIF_OP_MUL:
5834 regs[rd] = regs[r1] * regs[r2];
5835 break;
5836 case DIF_OP_SDIV:
5837 if (regs[r2] == 0) {
5838 regs[rd] = 0;
5839 *flags |= CPU_DTRACE_DIVZERO;
5840 } else {
5841 regs[rd] = (int64_t)regs[r1] /
5842 (int64_t)regs[r2];
5843 }
5844 break;
5845
5846 case DIF_OP_UDIV:
5847 if (regs[r2] == 0) {
5848 regs[rd] = 0;
5849 *flags |= CPU_DTRACE_DIVZERO;
5850 } else {
5851 regs[rd] = regs[r1] / regs[r2];
5852 }
5853 break;
5854
5855 case DIF_OP_SREM:
5856 if (regs[r2] == 0) {
5857 regs[rd] = 0;
5858 *flags |= CPU_DTRACE_DIVZERO;
5859 } else {
5860 regs[rd] = (int64_t)regs[r1] %
5861 (int64_t)regs[r2];
5862 }
5863 break;
5864
5865 case DIF_OP_UREM:
5866 if (regs[r2] == 0) {
5867 regs[rd] = 0;
5868 *flags |= CPU_DTRACE_DIVZERO;
5869 } else {
5870 regs[rd] = regs[r1] % regs[r2];
5871 }
5872 break;
5873
5874 case DIF_OP_NOT:
5875 regs[rd] = ~regs[r1];
5876 break;
5877 case DIF_OP_MOV:
5878 regs[rd] = regs[r1];
5879 break;
5880 case DIF_OP_CMP:
5881 cc_r = regs[r1] - regs[r2];
5882 cc_n = cc_r < 0;
5883 cc_z = cc_r == 0;
5884 cc_v = 0;
5885 cc_c = regs[r1] < regs[r2];
5886 break;
5887 case DIF_OP_TST:
5888 cc_n = cc_v = cc_c = 0;
5889 cc_z = regs[r1] == 0;
5890 break;
5891 case DIF_OP_BA:
5892 pc = DIF_INSTR_LABEL(instr);
5893 break;
5894 case DIF_OP_BE:
5895 if (cc_z)
5896 pc = DIF_INSTR_LABEL(instr);
5897 break;
5898 case DIF_OP_BNE:
5899 if (cc_z == 0)
5900 pc = DIF_INSTR_LABEL(instr);
5901 break;
5902 case DIF_OP_BG:
5903 if ((cc_z | (cc_n ^ cc_v)) == 0)
5904 pc = DIF_INSTR_LABEL(instr);
5905 break;
5906 case DIF_OP_BGU:
5907 if ((cc_c | cc_z) == 0)
5908 pc = DIF_INSTR_LABEL(instr);
5909 break;
5910 case DIF_OP_BGE:
5911 if ((cc_n ^ cc_v) == 0)
5912 pc = DIF_INSTR_LABEL(instr);
5913 break;
5914 case DIF_OP_BGEU:
5915 if (cc_c == 0)
5916 pc = DIF_INSTR_LABEL(instr);
5917 break;
5918 case DIF_OP_BL:
5919 if (cc_n ^ cc_v)
5920 pc = DIF_INSTR_LABEL(instr);
5921 break;
5922 case DIF_OP_BLU:
5923 if (cc_c)
5924 pc = DIF_INSTR_LABEL(instr);
5925 break;
5926 case DIF_OP_BLE:
5927 if (cc_z | (cc_n ^ cc_v))
5928 pc = DIF_INSTR_LABEL(instr);
5929 break;
5930 case DIF_OP_BLEU:
5931 if (cc_c | cc_z)
5932 pc = DIF_INSTR_LABEL(instr);
5933 break;
5934 case DIF_OP_RLDSB:
5935 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5936 *flags |= CPU_DTRACE_KPRIV;
5937 *illval = regs[r1];
5938 break;
5939 }
5940 OS_FALLTHROUGH;
5941 case DIF_OP_LDSB:
5942 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5943 break;
5944 case DIF_OP_RLDSH:
5945 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5946 *flags |= CPU_DTRACE_KPRIV;
5947 *illval = regs[r1];
5948 break;
5949 }
5950 OS_FALLTHROUGH;
5951 case DIF_OP_LDSH:
5952 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5953 break;
5954 case DIF_OP_RLDSW:
5955 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5956 *flags |= CPU_DTRACE_KPRIV;
5957 *illval = regs[r1];
5958 break;
5959 }
5960 OS_FALLTHROUGH;
5961 case DIF_OP_LDSW:
5962 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5963 break;
5964 case DIF_OP_RLDUB:
5965 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5966 *flags |= CPU_DTRACE_KPRIV;
5967 *illval = regs[r1];
5968 break;
5969 }
5970 OS_FALLTHROUGH;
5971 case DIF_OP_LDUB:
5972 regs[rd] = dtrace_load8(regs[r1]);
5973 break;
5974 case DIF_OP_RLDUH:
5975 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5976 *flags |= CPU_DTRACE_KPRIV;
5977 *illval = regs[r1];
5978 break;
5979 }
5980 OS_FALLTHROUGH;
5981 case DIF_OP_LDUH:
5982 regs[rd] = dtrace_load16(regs[r1]);
5983 break;
5984 case DIF_OP_RLDUW:
5985 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5986 *flags |= CPU_DTRACE_KPRIV;
5987 *illval = regs[r1];
5988 break;
5989 }
5990 OS_FALLTHROUGH;
5991 case DIF_OP_LDUW:
5992 regs[rd] = dtrace_load32(regs[r1]);
5993 break;
5994 case DIF_OP_RLDX:
5995 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5996 *flags |= CPU_DTRACE_KPRIV;
5997 *illval = regs[r1];
5998 break;
5999 }
6000 OS_FALLTHROUGH;
6001 case DIF_OP_LDX:
6002 regs[rd] = dtrace_load64(regs[r1]);
6003 break;
6004 /*
6005 * A Darwin 32-bit kernel may fetch from 64-bit user space, so
6006 * do not cast regs[] to uintptr_t in the user-load opcodes:
6007 * DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB,
6008 * DIF_OP_ULDUH, DIF_OP_ULDUW and DIF_OP_ULDX.
6009 */
6010 case DIF_OP_ULDSB:
6011 regs[rd] = (int8_t)
6012 dtrace_fuword8(regs[r1]);
6013 break;
6014 case DIF_OP_ULDSH:
6015 regs[rd] = (int16_t)
6016 dtrace_fuword16(regs[r1]);
6017 break;
6018 case DIF_OP_ULDSW:
6019 regs[rd] = (int32_t)
6020 dtrace_fuword32(regs[r1]);
6021 break;
6022 case DIF_OP_ULDUB:
6023 regs[rd] =
6024 dtrace_fuword8(regs[r1]);
6025 break;
6026 case DIF_OP_ULDUH:
6027 regs[rd] =
6028 dtrace_fuword16(regs[r1]);
6029 break;
6030 case DIF_OP_ULDUW:
6031 regs[rd] =
6032 dtrace_fuword32(regs[r1]);
6033 break;
6034 case DIF_OP_ULDX:
6035 regs[rd] =
6036 dtrace_fuword64(regs[r1]);
6037 break;
6038 case DIF_OP_RET:
6039 rval = regs[rd];
6040 pc = textlen;
6041 break;
6042 case DIF_OP_NOP:
6043 break;
6044 case DIF_OP_SETX:
6045 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6046 break;
6047 case DIF_OP_SETS:
6048 regs[rd] = (uint64_t)(uintptr_t)
6049 (strtab + DIF_INSTR_STRING(instr));
6050 break;
6051 case DIF_OP_SCMP: {
6052 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6053 uintptr_t s1 = regs[r1];
6054 uintptr_t s2 = regs[r2];
6055 size_t lim1 = sz, lim2 = sz;
6056
6057 if (s1 != 0 &&
6058 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
6059 break;
6060 if (s2 != 0 &&
6061 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
6062 break;
6063
6064 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
6065 MIN(lim1, lim2));
6066
6067 cc_n = cc_r < 0;
6068 cc_z = cc_r == 0;
6069 cc_v = cc_c = 0;
6070 break;
6071 }
6072 case DIF_OP_LDGA:
6073 regs[rd] = dtrace_dif_variable(mstate, state,
6074 r1, regs[r2]);
6075 break;
6076 case DIF_OP_LDGS:
6077 id = DIF_INSTR_VAR(instr);
6078
6079 if (id >= DIF_VAR_OTHER_UBASE) {
6080 uintptr_t a;
6081
6082 id -= DIF_VAR_OTHER_UBASE;
6083 svar = vstate->dtvs_globals[id];
6084 ASSERT(svar != NULL);
6085 v = &svar->dtsv_var;
6086
6087 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6088 regs[rd] = svar->dtsv_data;
6089 break;
6090 }
6091
6092 a = (uintptr_t)svar->dtsv_data;
6093
6094 if (*(uint8_t *)a == UINT8_MAX) {
6095 /*
6096 * If the 0th byte is set to UINT8_MAX
6097 * then this is to be treated as a
6098 * reference to a NULL variable.
6099 */
6100 regs[rd] = 0;
6101 } else {
6102 regs[rd] = a + sizeof (uint64_t);
6103 }
6104
6105 break;
6106 }
6107
6108 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6109 break;
6110
6111 case DIF_OP_STGS:
6112 id = DIF_INSTR_VAR(instr);
6113
6114 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6115 id -= DIF_VAR_OTHER_UBASE;
6116
6117 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6118 svar = vstate->dtvs_globals[id];
6119 ASSERT(svar != NULL);
6120 v = &svar->dtsv_var;
6121
6122 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6123 uintptr_t a = (uintptr_t)svar->dtsv_data;
6124 size_t lim = 0;
6125
6126 ASSERT(a != 0);
6127 ASSERT(svar->dtsv_size != 0);
6128
6129 if (regs[rd] == 0) {
6130 *(uint8_t *)a = UINT8_MAX;
6131 break;
6132 } else {
6133 *(uint8_t *)a = 0;
6134 a += sizeof (uint64_t);
6135 }
6136 if (!dtrace_vcanload(
6137 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6138 &lim, mstate, vstate))
6139 break;
6140
6141 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6142 (void *)a, &v->dtdv_type, lim);
6143 break;
6144 }
6145
6146 svar->dtsv_data = regs[rd];
6147 break;
6148
6149 case DIF_OP_LDTA:
6150 /*
6151 * There are no DTrace built-in thread-local arrays at
6152 * present. This opcode is saved for future work.
6153 */
6154 *flags |= CPU_DTRACE_ILLOP;
6155 regs[rd] = 0;
6156 break;
6157
6158 case DIF_OP_LDLS:
6159 id = DIF_INSTR_VAR(instr);
6160
6161 if (id < DIF_VAR_OTHER_UBASE) {
6162 /*
6163 * For now, this has no meaning.
6164 */
6165 regs[rd] = 0;
6166 break;
6167 }
6168
6169 id -= DIF_VAR_OTHER_UBASE;
6170
6171 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6172 ASSERT(vstate->dtvs_locals != NULL);
6173 svar = vstate->dtvs_locals[id];
6174 ASSERT(svar != NULL);
6175 v = &svar->dtsv_var;
6176
6177 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6178 uintptr_t a = (uintptr_t)svar->dtsv_data;
6179 size_t sz = v->dtdv_type.dtdt_size;
6180
6181 sz += sizeof (uint64_t);
6182 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6183 a += CPU->cpu_id * sz;
6184
6185 if (*(uint8_t *)a == UINT8_MAX) {
6186 /*
6187 * If the 0th byte is set to UINT8_MAX
6188 * then this is to be treated as a
6189 * reference to a NULL variable.
6190 */
6191 regs[rd] = 0;
6192 } else {
6193 regs[rd] = a + sizeof (uint64_t);
6194 }
6195
6196 break;
6197 }
6198
6199 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6200 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6201 regs[rd] = tmp[CPU->cpu_id];
6202 break;
6203
6204 case DIF_OP_STLS:
6205 id = DIF_INSTR_VAR(instr);
6206
6207 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6208 id -= DIF_VAR_OTHER_UBASE;
6209 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6210 ASSERT(vstate->dtvs_locals != NULL);
6211 svar = vstate->dtvs_locals[id];
6212 ASSERT(svar != NULL);
6213 v = &svar->dtsv_var;
6214
6215 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6216 uintptr_t a = (uintptr_t)svar->dtsv_data;
6217 size_t sz = v->dtdv_type.dtdt_size;
6218 size_t lim = 0;
6219
6220 sz += sizeof (uint64_t);
6221 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6222 a += CPU->cpu_id * sz;
6223
6224 if (regs[rd] == 0) {
6225 *(uint8_t *)a = UINT8_MAX;
6226 break;
6227 } else {
6228 *(uint8_t *)a = 0;
6229 a += sizeof (uint64_t);
6230 }
6231
6232 if (!dtrace_vcanload(
6233 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6234 &lim, mstate, vstate))
6235 break;
6236
6237 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6238 (void *)a, &v->dtdv_type, lim);
6239 break;
6240 }
6241
6242 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6243 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6244 tmp[CPU->cpu_id] = regs[rd];
6245 break;
6246
6247 case DIF_OP_LDTS: {
6248 dtrace_dynvar_t *dvar;
6249 dtrace_key_t *key;
6250
6251 id = DIF_INSTR_VAR(instr);
6252 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6253 id -= DIF_VAR_OTHER_UBASE;
6254 v = &vstate->dtvs_tlocals[id];
6255
6256 key = &tupregs[DIF_DTR_NREGS];
6257 key[0].dttk_value = (uint64_t)id;
6258 key[0].dttk_size = 0;
6259 DTRACE_TLS_THRKEY(key[1].dttk_value);
6260 key[1].dttk_size = 0;
6261
6262 dvar = dtrace_dynvar(dstate, 2, key,
6263 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6264 mstate, vstate);
6265
6266 if (dvar == NULL) {
6267 regs[rd] = 0;
6268 break;
6269 }
6270
6271 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6272 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6273 } else {
6274 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6275 }
6276
6277 break;
6278 }
6279
6280 case DIF_OP_STTS: {
6281 dtrace_dynvar_t *dvar;
6282 dtrace_key_t *key;
6283
6284 id = DIF_INSTR_VAR(instr);
6285 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6286 id -= DIF_VAR_OTHER_UBASE;
6287 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6288
6289 key = &tupregs[DIF_DTR_NREGS];
6290 key[0].dttk_value = (uint64_t)id;
6291 key[0].dttk_size = 0;
6292 DTRACE_TLS_THRKEY(key[1].dttk_value);
6293 key[1].dttk_size = 0;
6294 v = &vstate->dtvs_tlocals[id];
6295
6296 dvar = dtrace_dynvar(dstate, 2, key,
6297 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6298 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6299 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6300 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6301
6302 /*
6303 * Given that we're storing to thread-local data,
6304 * we need to flush our predicate cache.
6305 */
6306 dtrace_set_thread_predcache(current_thread(), 0);
6307
6308 if (dvar == NULL)
6309 break;
6310
6311 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6312 size_t lim = 0;
6313
6314 if (!dtrace_vcanload(
6315 (void *)(uintptr_t)regs[rd],
6316 &v->dtdv_type, &lim, mstate, vstate))
6317 break;
6318
6319 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6320 dvar->dtdv_data, &v->dtdv_type, lim);
6321 } else {
6322 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6323 }
6324
6325 break;
6326 }
6327
6328 case DIF_OP_SRA:
6329 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6330 break;
6331
6332 case DIF_OP_CALL:
6333 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6334 regs, tupregs, ttop, mstate, state);
6335 break;
6336
6337 case DIF_OP_PUSHTR:
6338 if (ttop == DIF_DTR_NREGS) {
6339 *flags |= CPU_DTRACE_TUPOFLOW;
6340 break;
6341 }
6342
6343 if (r1 == DIF_TYPE_STRING) {
6344 /*
6345 * If this is a string type and the size is 0,
6346 * we'll use the system-wide default string
6347 * size. Note that we are _not_ looking at
6348 * the value of the DTRACEOPT_STRSIZE option;
6349 * had this been set, we would expect to have
6350 * a non-zero size value in the "pushtr".
6351 */
6352 tupregs[ttop].dttk_size =
6353 dtrace_strlen((char *)(uintptr_t)regs[rd],
6354 regs[r2] ? regs[r2] :
6355 dtrace_strsize_default) + 1;
6356 } else {
6357 if (regs[r2] > LONG_MAX) {
6358 *flags |= CPU_DTRACE_ILLOP;
6359 break;
6360 }
6361 tupregs[ttop].dttk_size = regs[r2];
6362 }
6363
6364 tupregs[ttop++].dttk_value = regs[rd];
6365 break;
6366
6367 case DIF_OP_PUSHTV:
6368 if (ttop == DIF_DTR_NREGS) {
6369 *flags |= CPU_DTRACE_TUPOFLOW;
6370 break;
6371 }
6372
6373 tupregs[ttop].dttk_value = regs[rd];
6374 tupregs[ttop++].dttk_size = 0;
6375 break;
6376
6377 case DIF_OP_POPTS:
6378 if (ttop != 0)
6379 ttop--;
6380 break;
6381
6382 case DIF_OP_FLUSHTS:
6383 ttop = 0;
6384 break;
6385
6386 case DIF_OP_LDGAA:
6387 case DIF_OP_LDTAA: {
6388 dtrace_dynvar_t *dvar;
6389 dtrace_key_t *key = tupregs;
6390 uint_t nkeys = ttop;
6391
6392 id = DIF_INSTR_VAR(instr);
6393 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6394 id -= DIF_VAR_OTHER_UBASE;
6395
6396 key[nkeys].dttk_value = (uint64_t)id;
6397 key[nkeys++].dttk_size = 0;
6398
6399 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6400 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6401 key[nkeys++].dttk_size = 0;
6402 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6403 v = &vstate->dtvs_tlocals[id];
6404 } else {
6405 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6406 v = &vstate->dtvs_globals[id]->dtsv_var;
6407 }
6408
6409 dvar = dtrace_dynvar(dstate, nkeys, key,
6410 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6411 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6412 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6413
6414 if (dvar == NULL) {
6415 regs[rd] = 0;
6416 break;
6417 }
6418
6419 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6420 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6421 } else {
6422 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6423 }
6424
6425 break;
6426 }
6427
6428 case DIF_OP_STGAA:
6429 case DIF_OP_STTAA: {
6430 dtrace_dynvar_t *dvar;
6431 dtrace_key_t *key = tupregs;
6432 uint_t nkeys = ttop;
6433
6434 id = DIF_INSTR_VAR(instr);
6435 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6436 id -= DIF_VAR_OTHER_UBASE;
6437
6438 key[nkeys].dttk_value = (uint64_t)id;
6439 key[nkeys++].dttk_size = 0;
6440
6441 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6442 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6443 key[nkeys++].dttk_size = 0;
6444 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6445 v = &vstate->dtvs_tlocals[id];
6446 } else {
6447 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6448 v = &vstate->dtvs_globals[id]->dtsv_var;
6449 }
6450
6451 dvar = dtrace_dynvar(dstate, nkeys, key,
6452 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6453 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6454 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6455 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6456
6457 if (dvar == NULL)
6458 break;
6459
6460 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6461 size_t lim = 0;
6462
6463 if (!dtrace_vcanload(
6464 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6465 &lim, mstate, vstate))
6466 break;
6467
6468 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6469 dvar->dtdv_data, &v->dtdv_type, lim);
6470 } else {
6471 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6472 }
6473
6474 break;
6475 }
6476
6477 case DIF_OP_ALLOCS: {
6478 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6479 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6480
6481 /*
6482 * Rounding up the user allocation size could have
6483 * overflowed large, bogus allocations (like -1ULL) to
6484 * 0.
6485 */
6486 if (size < regs[r1] ||
6487 !DTRACE_INSCRATCH(mstate, size)) {
6488 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6489 regs[rd] = 0;
6490 break;
6491 }
6492
6493 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6494 mstate->dtms_scratch_ptr += size;
6495 regs[rd] = ptr;
6496 break;
6497 }
6498
6499 case DIF_OP_COPYS:
6500 if (!dtrace_canstore(regs[rd], regs[r2],
6501 mstate, vstate)) {
6502 *flags |= CPU_DTRACE_BADADDR;
6503 *illval = regs[rd];
6504 break;
6505 }
6506
6507 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6508 break;
6509
6510 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6511 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6512 break;
6513
6514 case DIF_OP_STB:
6515 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6516 *flags |= CPU_DTRACE_BADADDR;
6517 *illval = regs[rd];
6518 break;
6519 }
6520 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6521 break;
6522
6523 case DIF_OP_STH:
6524 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6525 *flags |= CPU_DTRACE_BADADDR;
6526 *illval = regs[rd];
6527 break;
6528 }
6529 if (regs[rd] & 1) {
6530 *flags |= CPU_DTRACE_BADALIGN;
6531 *illval = regs[rd];
6532 break;
6533 }
6534 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6535 break;
6536
6537 case DIF_OP_STW:
6538 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6539 *flags |= CPU_DTRACE_BADADDR;
6540 *illval = regs[rd];
6541 break;
6542 }
6543 if (regs[rd] & 3) {
6544 *flags |= CPU_DTRACE_BADALIGN;
6545 *illval = regs[rd];
6546 break;
6547 }
6548 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6549 break;
6550
6551 case DIF_OP_STX:
6552 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6553 *flags |= CPU_DTRACE_BADADDR;
6554 *illval = regs[rd];
6555 break;
6556 }
6557
6558 /*
6559 * Darwin kmem_zalloc() called from
6560 * dtrace_difo_init() is 4-byte aligned.
6561 */
6562 if (regs[rd] & 3) {
6563 *flags |= CPU_DTRACE_BADALIGN;
6564 *illval = regs[rd];
6565 break;
6566 }
6567 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6568 break;
6569 case DIF_OP_STRIP:
6570 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6571 (void*)regs[r1], r2);
6572 break;
6573 }
6574 }
6575
6576 if (!(*flags & CPU_DTRACE_FAULT))
6577 return (rval);
6578
6579 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6580 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6581
6582 return (0);
6583 }
6584
6585 __attribute__((noinline))
6586 static void
dtrace_action_breakpoint(dtrace_ecb_t * ecb)6587 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6588 {
6589 dtrace_probe_t *probe = ecb->dte_probe;
6590 dtrace_provider_t *prov = probe->dtpr_provider;
6591 char c[DTRACE_FULLNAMELEN + 80], *str;
6592 const char *msg = "dtrace: breakpoint action at probe ";
6593 const char *ecbmsg = " (ecb ";
6594 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6595 uintptr_t val = (uintptr_t)ecb;
6596 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6597
6598 if (dtrace_destructive_disallow)
6599 return;
6600
6601 /*
6602 * It's impossible to be taking action on the NULL probe.
6603 */
6604 ASSERT(probe != NULL);
6605
6606 /*
6607 * This is a poor man's (destitute man's?) sprintf(): we want to
6608 * print the provider name, module name, function name and name of
6609 * the probe, along with the hex address of the ECB with the breakpoint
6610 * action -- all of which we must place in the character buffer by
6611 * hand.
6612 */
6613 while (*msg != '\0')
6614 c[i++] = *msg++;
6615
6616 for (str = prov->dtpv_name; *str != '\0'; str++)
6617 c[i++] = *str;
6618 c[i++] = ':';
6619
6620 for (str = probe->dtpr_mod; *str != '\0'; str++)
6621 c[i++] = *str;
6622 c[i++] = ':';
6623
6624 for (str = probe->dtpr_func; *str != '\0'; str++)
6625 c[i++] = *str;
6626 c[i++] = ':';
6627
6628 for (str = probe->dtpr_name; *str != '\0'; str++)
6629 c[i++] = *str;
6630
6631 while (*ecbmsg != '\0')
6632 c[i++] = *ecbmsg++;
6633
6634 while (shift >= 0) {
6635 mask = (uintptr_t)0xf << shift;
6636
6637 if (val >= ((uintptr_t)1 << shift))
6638 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6639 shift -= 4;
6640 }
6641
6642 c[i++] = ')';
6643 c[i] = '\0';
6644
6645 debug_enter(c);
6646 }
6647
6648 __attribute__((noinline))
6649 static void
dtrace_action_panic(dtrace_ecb_t * ecb)6650 dtrace_action_panic(dtrace_ecb_t *ecb)
6651 {
6652 dtrace_probe_t *probe = ecb->dte_probe;
6653
6654 /*
6655 * It's impossible to be taking action on the NULL probe.
6656 */
6657 ASSERT(probe != NULL);
6658
6659 if (dtrace_destructive_disallow)
6660 return;
6661
6662 if (dtrace_panicked != NULL)
6663 return;
6664
6665 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6666 return;
6667
6668 /*
6669 * We won the right to panic. (We want to be sure that only one
6670 * thread calls panic() from dtrace_probe(), and that panic() is
6671 * called exactly once.)
6672 */
6673 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6674 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6675 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6676
6677 /*
6678 * APPLE NOTE: this was for an old Mac OS X debug feature
6679 * allowing a return from panic(). Revisit someday.
6680 */
6681 dtrace_panicked = NULL;
6682 }
6683
6684 static void
dtrace_action_raise(uint64_t sig)6685 dtrace_action_raise(uint64_t sig)
6686 {
6687 if (dtrace_destructive_disallow)
6688 return;
6689
6690 if (sig >= NSIG) {
6691 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6692 return;
6693 }
6694
6695 /*
6696 * raise() has a queue depth of 1 -- we ignore all subsequent
6697 * invocations of the raise() action.
6698 */
6699
6700 uthread_t uthread = current_uthread();
6701
6702 if (uthread && uthread->t_dtrace_sig == 0) {
6703 uthread->t_dtrace_sig = sig;
6704 act_set_astbsd(current_thread());
6705 }
6706 }
6707
6708 static void
dtrace_action_stop(void)6709 dtrace_action_stop(void)
6710 {
6711 if (dtrace_destructive_disallow)
6712 return;
6713
6714 uthread_t uthread = current_uthread();
6715 if (uthread) {
6716 /*
6717 * The currently running process will be set to task_suspend
6718 * when it next leaves the kernel.
6719 */
6720 uthread->t_dtrace_stop = 1;
6721 act_set_astbsd(current_thread());
6722 }
6723 }
6724
6725
6726 /*
6727 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6728 * Both activate only when the currently running process next leaves the
6729 * kernel.
6730 */
6731 static void
dtrace_action_pidresume(uint64_t pid)6732 dtrace_action_pidresume(uint64_t pid)
6733 {
6734 if (dtrace_destructive_disallow)
6735 return;
6736
6737 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6738 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6739 return;
6740 }
6741 uthread_t uthread = current_uthread();
6742
6743 /*
6744 * When the currently running process leaves the kernel, it attempts to
6745 * task_resume the process (denoted by pid), if that pid appears to have
6746 * been stopped by dtrace_action_stop().
6747 * The currently running process has a pidresume() queue depth of 1 --
6748 * subsequent invocations of the pidresume() action are ignored.
6749 */
6750
6751 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6752 uthread->t_dtrace_resumepid = pid;
6753 act_set_astbsd(current_thread());
6754 }
6755 }
6756
6757 __attribute__((noinline))
6758 static void
dtrace_action_chill(dtrace_mstate_t * mstate,hrtime_t val)6759 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6760 {
6761 hrtime_t now;
6762 volatile uint16_t *flags;
6763 dtrace_cpu_t *cpu = CPU;
6764
6765 if (dtrace_destructive_disallow)
6766 return;
6767
6768 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6769
6770 now = dtrace_gethrtime();
6771
6772 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6773 /*
6774 * We need to advance the mark to the current time.
6775 */
6776 cpu->cpu_dtrace_chillmark = now;
6777 cpu->cpu_dtrace_chilled = 0;
6778 }
6779
6780 /*
6781 * Now check to see if the requested chill time would take us over
6782 * the maximum amount of time allowed in the chill interval. (Or
6783 * worse, if the calculation itself induces overflow.)
6784 */
6785 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6786 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6787 *flags |= CPU_DTRACE_ILLOP;
6788 return;
6789 }
6790
6791 while (dtrace_gethrtime() - now < val)
6792 continue;
6793
6794 /*
6795 * Normally, we assure that the value of the variable "timestamp" does
6796 * not change within an ECB. The presence of chill() represents an
6797 * exception to this rule, however.
6798 */
6799 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6800 cpu->cpu_dtrace_chilled += val;
6801 }
6802
6803 __attribute__((noinline))
6804 static void
dtrace_action_ustack(dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t * buf,uint64_t arg)6805 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6806 uint64_t *buf, uint64_t arg)
6807 {
6808 int nframes = DTRACE_USTACK_NFRAMES(arg);
6809 int strsize = DTRACE_USTACK_STRSIZE(arg);
6810 uint64_t *pcs = &buf[1], *fps;
6811 char *str = (char *)&pcs[nframes];
6812 int size, offs = 0, i, j;
6813 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6814 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6815 char *sym;
6816
6817 /*
6818 * Should be taking a faster path if string space has not been
6819 * allocated.
6820 */
6821 ASSERT(strsize != 0);
6822
6823 /*
6824 * We will first allocate some temporary space for the frame pointers.
6825 */
6826 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6827 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6828 (nframes * sizeof (uint64_t));
6829
6830 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6831 /*
6832 * Not enough room for our frame pointers -- need to indicate
6833 * that we ran out of scratch space.
6834 */
6835 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6836 return;
6837 }
6838
6839 mstate->dtms_scratch_ptr += size;
6840 saved = mstate->dtms_scratch_ptr;
6841
6842 /*
6843 * Now get a stack with both program counters and frame pointers.
6844 */
6845 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6846 dtrace_getufpstack(buf, fps, nframes + 1);
6847 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6848
6849 /*
6850 * If that faulted, we're cooked.
6851 */
6852 if (*flags & CPU_DTRACE_FAULT)
6853 goto out;
6854
6855 /*
6856 * Now we want to walk up the stack, calling the USTACK helper. For
6857 * each iteration, we restore the scratch pointer.
6858 */
6859 for (i = 0; i < nframes; i++) {
6860 mstate->dtms_scratch_ptr = saved;
6861
6862 if (offs >= strsize)
6863 break;
6864
6865 sym = (char *)(uintptr_t)dtrace_helper(
6866 DTRACE_HELPER_ACTION_USTACK,
6867 mstate, state, pcs[i], fps[i]);
6868
6869 /*
6870 * If we faulted while running the helper, we're going to
6871 * clear the fault and null out the corresponding string.
6872 */
6873 if (*flags & CPU_DTRACE_FAULT) {
6874 *flags &= ~CPU_DTRACE_FAULT;
6875 str[offs++] = '\0';
6876 continue;
6877 }
6878
6879 if (sym == NULL) {
6880 str[offs++] = '\0';
6881 continue;
6882 }
6883
6884 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6885
6886 /*
6887 * Now copy in the string that the helper returned to us.
6888 */
6889 for (j = 0; offs + j < strsize; j++) {
6890 if ((str[offs + j] = sym[j]) == '\0')
6891 break;
6892 }
6893
6894 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6895
6896 offs += j + 1;
6897 }
6898
6899 if (offs >= strsize) {
6900 /*
6901 * If we didn't have room for all of the strings, we don't
6902 * abort processing -- this needn't be a fatal error -- but we
6903 * still want to increment a counter (dts_stkstroverflows) to
6904 * allow this condition to be warned about. (If this is from
6905 * a jstack() action, it is easily tuned via jstackstrsize.)
6906 */
6907 dtrace_error(&state->dts_stkstroverflows);
6908 }
6909
6910 while (offs < strsize)
6911 str[offs++] = '\0';
6912
6913 out:
6914 mstate->dtms_scratch_ptr = old;
6915 }
6916
6917 __attribute__((noinline))
6918 static void
dtrace_store_by_ref(dtrace_difo_t * dp,caddr_t tomax,size_t size,size_t * valoffsp,uint64_t * valp,uint64_t end,int intuple,int dtkind)6919 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6920 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6921 {
6922 volatile uint16_t *flags;
6923 uint64_t val = *valp;
6924 size_t valoffs = *valoffsp;
6925
6926 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6927 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6928
6929 /*
6930 * If this is a string, we're going to only load until we find the zero
6931 * byte -- after which we'll store zero bytes.
6932 */
6933 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6934 char c = '\0' + 1;
6935 size_t s;
6936
6937 for (s = 0; s < size; s++) {
6938 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6939 c = dtrace_load8(val++);
6940 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6941 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6942 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6943 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6944 if (*flags & CPU_DTRACE_FAULT)
6945 break;
6946 }
6947
6948 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6949
6950 if (c == '\0' && intuple)
6951 break;
6952 }
6953 } else {
6954 uint8_t c;
6955 while (valoffs < end) {
6956 if (dtkind == DIF_TF_BYREF) {
6957 c = dtrace_load8(val++);
6958 } else if (dtkind == DIF_TF_BYUREF) {
6959 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6960 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6961 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6962 if (*flags & CPU_DTRACE_FAULT)
6963 break;
6964 }
6965
6966 DTRACE_STORE(uint8_t, tomax,
6967 valoffs++, c);
6968 }
6969 }
6970
6971 *valp = val;
6972 *valoffsp = valoffs;
6973 }
6974
6975 /*
6976 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6977 * defined, we also assert that we are not recursing unless the probe ID is an
6978 * error probe.
6979 */
6980 static dtrace_icookie_t
dtrace_probe_enter(dtrace_id_t id)6981 dtrace_probe_enter(dtrace_id_t id)
6982 {
6983 thread_t thread = current_thread();
6984 uint16_t inprobe;
6985
6986 dtrace_icookie_t cookie;
6987
6988 cookie = dtrace_interrupt_disable();
6989
6990 /*
6991 * Unless this is an ERROR probe, we are not allowed to recurse in
6992 * dtrace_probe(). Recursing into DTrace probe usually means that a
6993 * function is instrumented that should not have been instrumented or
6994 * that the ordering guarantee of the records will be violated,
6995 * resulting in unexpected output. If there is an exception to this
6996 * assertion, a new case should be added.
6997 */
6998 inprobe = dtrace_get_thread_inprobe(thread);
6999 VERIFY(inprobe == 0 ||
7000 id == dtrace_probeid_error);
7001 ASSERT(inprobe < UINT16_MAX);
7002 dtrace_set_thread_inprobe(thread, inprobe + 1);
7003
7004 return (cookie);
7005 }
7006
7007 /*
7008 * Clears the per-thread inprobe flag and enables interrupts.
7009 */
7010 static void
dtrace_probe_exit(dtrace_icookie_t cookie)7011 dtrace_probe_exit(dtrace_icookie_t cookie)
7012 {
7013 thread_t thread = current_thread();
7014 uint16_t inprobe = dtrace_get_thread_inprobe(thread);
7015
7016 ASSERT(inprobe > 0);
7017 dtrace_set_thread_inprobe(thread, inprobe - 1);
7018
7019 #if SCHED_HYGIENE_DEBUG
7020 /*
7021 * Probes can take a relatively long time depending on what the user has
7022 * requested be done in probe context.
7023 * Probes can fire from places where interrupts are already disabled
7024 * (like an interrupt handler) or where preemption has been disabled.
7025 * In order to not trip the interrupt or preemption thresholds, it is
7026 * important to reset timestamps when leaving probe context.
7027 */
7028
7029 /* Interrupts were disabled for the duration of this probe. */
7030 ml_spin_debug_reset(thread);
7031
7032 /* May have been called from an interrupt handler. */
7033 ml_irq_debug_abandon();
7034
7035 /* May have been called with preemption disabled. */
7036 abandon_preemption_disable_measurement();
7037
7038 #endif /* SCHED_HYGIENE_DEBUG */
7039
7040 dtrace_interrupt_enable(cookie);
7041 }
7042
7043 /*
7044 * If you're looking for the epicenter of DTrace, you just found it. This
7045 * is the function called by the provider to fire a probe -- from which all
7046 * subsequent probe-context DTrace activity emanates.
7047 */
7048 void
dtrace_probe(dtrace_id_t id,uint64_t arg0,uint64_t arg1,uint64_t arg2,uint64_t arg3,uint64_t arg4)7049 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7050 uint64_t arg2, uint64_t arg3, uint64_t arg4)
7051 {
7052 processorid_t cpuid;
7053 dtrace_icookie_t cookie;
7054 dtrace_probe_t *probe;
7055 dtrace_mstate_t mstate;
7056 dtrace_ecb_t *ecb;
7057 dtrace_action_t *act;
7058 intptr_t offs;
7059 size_t size;
7060 int vtime, onintr;
7061 volatile uint16_t *flags;
7062 hrtime_t now;
7063
7064 cookie = dtrace_probe_enter(id);
7065
7066 /* Ensure that probe id is valid. */
7067 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
7068 dtrace_probe_exit(cookie);
7069 return;
7070 }
7071
7072 probe = dtrace_probes[id - 1];
7073 if (probe == NULL) {
7074 dtrace_probe_exit(cookie);
7075 return;
7076 }
7077
7078 cpuid = CPU->cpu_id;
7079 onintr = CPU_ON_INTR(CPU);
7080
7081 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7082 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
7083 /*
7084 * We have hit in the predicate cache; we know that
7085 * this predicate would evaluate to be false.
7086 */
7087 dtrace_probe_exit(cookie);
7088 return;
7089 }
7090
7091 if (panic_quiesce) {
7092 /*
7093 * We don't trace anything if we're panicking.
7094 */
7095 dtrace_probe_exit(cookie);
7096 return;
7097 }
7098
7099 #if !defined(__APPLE__)
7100 now = dtrace_gethrtime();
7101 vtime = dtrace_vtime_references != 0;
7102
7103 if (vtime && curthread->t_dtrace_start)
7104 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7105 #else
7106 /*
7107 * APPLE NOTE: The time spent entering DTrace and arriving
7108 * to this point, is attributed to the current thread.
7109 * Instead it should accrue to DTrace. FIXME
7110 */
7111 vtime = dtrace_vtime_references != 0;
7112
7113 if (vtime)
7114 {
7115 int64_t dtrace_accum_time, recent_vtime;
7116 thread_t thread = current_thread();
7117
7118 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
7119
7120 if (dtrace_accum_time >= 0) {
7121 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
7122
7123 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
7124
7125 dtrace_set_thread_vtime(thread, recent_vtime);
7126 }
7127 }
7128
7129 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
7130 #endif /* __APPLE__ */
7131
7132 /*
7133 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
7134 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
7135 * However the provider has no access to ECB context, so passes
7136 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
7137 * Detect that here and cons up a viable state (from the probe_id).
7138 */
7139 if (dtrace_probeid_error == id && 0 == arg0) {
7140 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
7141 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
7142 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7143
7144 if (NULL != ftp_ecb) {
7145 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7146
7147 arg0 = (uint64_t)(uintptr_t)ftp_state;
7148 arg1 = ftp_ecb->dte_epid;
7149 /*
7150 * args[2-4] established by caller.
7151 */
7152 ftp_state->dts_arg_error_illval = -1; /* arg5 */
7153 }
7154 }
7155
7156 mstate.dtms_difo = NULL;
7157 mstate.dtms_probe = probe;
7158 mstate.dtms_strtok = 0;
7159 mstate.dtms_arg[0] = arg0;
7160 mstate.dtms_arg[1] = arg1;
7161 mstate.dtms_arg[2] = arg2;
7162 mstate.dtms_arg[3] = arg3;
7163 mstate.dtms_arg[4] = arg4;
7164
7165 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7166
7167 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7168 dtrace_predicate_t *pred = ecb->dte_predicate;
7169 dtrace_state_t *state = ecb->dte_state;
7170 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7171 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7172 dtrace_vstate_t *vstate = &state->dts_vstate;
7173 dtrace_provider_t *prov = probe->dtpr_provider;
7174 uint64_t tracememsize = 0;
7175 int committed = 0;
7176 caddr_t tomax;
7177
7178 /*
7179 * A little subtlety with the following (seemingly innocuous)
7180 * declaration of the automatic 'val': by looking at the
7181 * code, you might think that it could be declared in the
7182 * action processing loop, below. (That is, it's only used in
7183 * the action processing loop.) However, it must be declared
7184 * out of that scope because in the case of DIF expression
7185 * arguments to aggregating actions, one iteration of the
7186 * action loop will use the last iteration's value.
7187 */
7188 #ifdef lint
7189 uint64_t val = 0;
7190 #else
7191 uint64_t val = 0;
7192 #endif
7193
7194 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7195 *flags &= ~CPU_DTRACE_ERROR;
7196
7197 if (prov == dtrace_provider) {
7198 /*
7199 * If dtrace itself is the provider of this probe,
7200 * we're only going to continue processing the ECB if
7201 * arg0 (the dtrace_state_t) is equal to the ECB's
7202 * creating state. (This prevents disjoint consumers
7203 * from seeing one another's metaprobes.)
7204 */
7205 if (arg0 != (uint64_t)(uintptr_t)state)
7206 continue;
7207 }
7208
7209 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7210 /*
7211 * We're not currently active. If our provider isn't
7212 * the dtrace pseudo provider, we're not interested.
7213 */
7214 if (prov != dtrace_provider)
7215 continue;
7216
7217 /*
7218 * Now we must further check if we are in the BEGIN
7219 * probe. If we are, we will only continue processing
7220 * if we're still in WARMUP -- if one BEGIN enabling
7221 * has invoked the exit() action, we don't want to
7222 * evaluate subsequent BEGIN enablings.
7223 */
7224 if (probe->dtpr_id == dtrace_probeid_begin &&
7225 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7226 ASSERT(state->dts_activity ==
7227 DTRACE_ACTIVITY_DRAINING);
7228 continue;
7229 }
7230 }
7231
7232 if (ecb->dte_cond) {
7233 /*
7234 * If the dte_cond bits indicate that this
7235 * consumer is only allowed to see user-mode firings
7236 * of this probe, call the provider's dtps_usermode()
7237 * entry point to check that the probe was fired
7238 * while in a user context. Skip this ECB if that's
7239 * not the case.
7240 */
7241 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7242 prov->dtpv_pops.dtps_usermode &&
7243 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7244 probe->dtpr_id, probe->dtpr_arg) == 0)
7245 continue;
7246
7247 /*
7248 * This is more subtle than it looks. We have to be
7249 * absolutely certain that CRED() isn't going to
7250 * change out from under us so it's only legit to
7251 * examine that structure if we're in constrained
			 * situations.  Currently, the only time we'll do this
			 * check is if a non-super-user has enabled the
7254 * profile or syscall providers -- providers that
7255 * allow visibility of all processes. For the
7256 * profile case, the check above will ensure that
7257 * we're examining a user context.
7258 */
7259 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7260 cred_t *cr;
7261 cred_t *s_cr =
7262 ecb->dte_state->dts_cred.dcr_cred;
7263 proc_t *proc;
7264 #pragma unused(proc) /* __APPLE__ */
7265
7266 ASSERT(s_cr != NULL);
7267
7268 /*
7269 * XXX this is hackish, but so is setting a variable
7270 * XXX in a McCarthy OR...
7271 */
7272 if ((cr = dtrace_CRED()) == NULL ||
7273 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7274 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7275 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7276 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7277 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7278 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7279 #if !defined(__APPLE__)
7280 (proc = ttoproc(curthread)) == NULL ||
7281 (proc->p_flag & SNOCD))
7282 #else
7283 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7284 #endif /* __APPLE__ */
7285 continue;
7286 }
7287
7288 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7289 cred_t *cr;
7290 cred_t *s_cr =
7291 ecb->dte_state->dts_cred.dcr_cred;
7292 #pragma unused(cr, s_cr) /* __APPLE__ */
7293
7294 ASSERT(s_cr != NULL);
7295
7296 #if !defined(__APPLE__)
7297 if ((cr = CRED()) == NULL ||
7298 s_cr->cr_zone->zone_id !=
7299 cr->cr_zone->zone_id)
7300 continue;
7301 #else
7302 /* APPLE NOTE: Darwin doesn't do zones. */
7303 #endif /* __APPLE__ */
7304 }
7305 }
7306
7307 if (now - state->dts_alive > dtrace_deadman_timeout) {
7308 /*
7309 * We seem to be dead. Unless we (a) have kernel
			 * destructive permissions (b) have explicitly enabled
7311 * destructive actions and (c) destructive actions have
7312 * not been disabled, we're going to transition into
7313 * the KILLED state, from which no further processing
7314 * on this state will be performed.
7315 */
7316 if (!dtrace_priv_kernel_destructive(state) ||
7317 !state->dts_cred.dcr_destructive ||
7318 dtrace_destructive_disallow) {
7319 void *activity = &state->dts_activity;
7320 dtrace_activity_t current;
7321
7322 do {
7323 current = state->dts_activity;
7324 } while (dtrace_cas32(activity, current,
7325 DTRACE_ACTIVITY_KILLED) != current);
7326
7327 continue;
7328 }
7329 }
7330
7331 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7332 ecb->dte_alignment, state, &mstate)) < 0)
7333 continue;
7334
7335 tomax = buf->dtb_tomax;
7336 ASSERT(tomax != NULL);
7337
7338 /*
7339 * Build and store the record header corresponding to the ECB.
7340 */
7341 if (ecb->dte_size != 0) {
7342 dtrace_rechdr_t dtrh;
7343
7344 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7345 mstate.dtms_timestamp = dtrace_gethrtime();
7346 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7347 }
7348
7349 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7350
7351 dtrh.dtrh_epid = ecb->dte_epid;
7352 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7353 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7354 }
7355
7356 mstate.dtms_epid = ecb->dte_epid;
7357 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7358
7359 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7360 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7361 else
7362 mstate.dtms_access = 0;
7363
7364 if (pred != NULL) {
7365 dtrace_difo_t *dp = pred->dtp_difo;
7366 uint64_t rval;
7367
7368 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7369
7370 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7371 dtrace_cacheid_t cid = probe->dtpr_predcache;
7372
7373 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7374 /*
7375 * Update the predicate cache...
7376 */
7377 ASSERT(cid == pred->dtp_cacheid);
7378
7379 dtrace_set_thread_predcache(current_thread(), cid);
7380 }
7381
7382 continue;
7383 }
7384 }
7385
7386 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7387 act != NULL; act = act->dta_next) {
7388 size_t valoffs;
7389 dtrace_difo_t *dp;
7390 dtrace_recdesc_t *rec = &act->dta_rec;
7391
7392 size = rec->dtrd_size;
7393 valoffs = offs + rec->dtrd_offset;
7394
7395 if (DTRACEACT_ISAGG(act->dta_kind)) {
7396 uint64_t v = 0xbad;
7397 dtrace_aggregation_t *agg;
7398
7399 agg = (dtrace_aggregation_t *)act;
7400
7401 if ((dp = act->dta_difo) != NULL)
7402 v = dtrace_dif_emulate(dp,
7403 &mstate, vstate, state);
7404
7405 if (*flags & CPU_DTRACE_ERROR)
7406 continue;
7407
7408 /*
7409 * Note that we always pass the expression
7410 * value from the previous iteration of the
7411 * action loop. This value will only be used
7412 * if there is an expression argument to the
7413 * aggregating action, denoted by the
7414 * dtag_hasarg field.
7415 */
7416 dtrace_aggregate(agg, buf,
7417 offs, aggbuf, v, val);
7418 continue;
7419 }
7420
7421 switch (act->dta_kind) {
7422 case DTRACEACT_STOP:
7423 if (dtrace_priv_proc_destructive(state))
7424 dtrace_action_stop();
7425 continue;
7426
7427 case DTRACEACT_BREAKPOINT:
7428 if (dtrace_priv_kernel_destructive(state))
7429 dtrace_action_breakpoint(ecb);
7430 continue;
7431
7432 case DTRACEACT_PANIC:
7433 if (dtrace_priv_kernel_destructive(state))
7434 dtrace_action_panic(ecb);
7435 continue;
7436
7437 case DTRACEACT_STACK:
7438 if (!dtrace_priv_kernel(state))
7439 continue;
7440
7441 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7442 size / sizeof (pc_t), probe->dtpr_aframes,
7443 DTRACE_ANCHORED(probe) ? NULL :
7444 (uint32_t *)(uintptr_t)arg0);
7445 continue;
7446
7447 case DTRACEACT_JSTACK:
7448 case DTRACEACT_USTACK:
7449 if (!dtrace_priv_proc(state))
7450 continue;
7451
7452 /*
7453 * See comment in DIF_VAR_PID.
7454 */
7455 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7456 CPU_ON_INTR(CPU)) {
7457 int depth = DTRACE_USTACK_NFRAMES(
7458 rec->dtrd_arg) + 1;
7459
7460 dtrace_bzero((void *)(tomax + valoffs),
7461 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7462 + depth * sizeof (uint64_t));
7463
7464 continue;
7465 }
7466
7467 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7468 curproc->p_dtrace_helpers != NULL) {
7469 /*
7470 * This is the slow path -- we have
7471 * allocated string space, and we're
7472 * getting the stack of a process that
7473 * has helpers. Call into a separate
7474 * routine to perform this processing.
7475 */
7476 dtrace_action_ustack(&mstate, state,
7477 (uint64_t *)(tomax + valoffs),
7478 rec->dtrd_arg);
7479 continue;
7480 }
7481
7482 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7483 dtrace_getupcstack((uint64_t *)
7484 (tomax + valoffs),
7485 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7486 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7487 continue;
7488
7489 default:
7490 break;
7491 }
7492
7493 dp = act->dta_difo;
7494 ASSERT(dp != NULL);
7495
7496 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7497
7498 if (*flags & CPU_DTRACE_ERROR)
7499 continue;
7500
7501 switch (act->dta_kind) {
7502 case DTRACEACT_SPECULATE: {
7503 dtrace_rechdr_t *dtrh = NULL;
7504
7505 ASSERT(buf == &state->dts_buffer[cpuid]);
7506 buf = dtrace_speculation_buffer(state,
7507 cpuid, val);
7508
7509 if (buf == NULL) {
7510 *flags |= CPU_DTRACE_DROP;
7511 continue;
7512 }
7513
7514 offs = dtrace_buffer_reserve(buf,
7515 ecb->dte_needed, ecb->dte_alignment,
7516 state, NULL);
7517
7518 if (offs < 0) {
7519 *flags |= CPU_DTRACE_DROP;
7520 continue;
7521 }
7522
7523 tomax = buf->dtb_tomax;
7524 ASSERT(tomax != NULL);
7525
7526 if (ecb->dte_size == 0)
7527 continue;
7528
7529 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7530 dtrh = ((void *)(tomax + offs));
7531 dtrh->dtrh_epid = ecb->dte_epid;
7532
7533 /*
7534 * When the speculation is committed, all of
7535 * the records in the speculative buffer will
7536 * have their timestamps set to the commit
7537 * time. Until then, it is set to a sentinel
7538 * value, for debugability.
7539 */
7540 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7541
7542 continue;
7543 }
7544
7545 case DTRACEACT_CHILL:
7546 if (dtrace_priv_kernel_destructive(state))
7547 dtrace_action_chill(&mstate, val);
7548 continue;
7549
7550 case DTRACEACT_RAISE:
7551 if (dtrace_priv_proc_destructive(state))
7552 dtrace_action_raise(val);
7553 continue;
7554
7555 case DTRACEACT_PIDRESUME: /* __APPLE__ */
7556 if (dtrace_priv_proc_destructive(state))
7557 dtrace_action_pidresume(val);
7558 continue;
7559
7560 case DTRACEACT_COMMIT:
7561 ASSERT(!committed);
7562
7563 /*
7564 * We need to commit our buffer state.
7565 */
7566 if (ecb->dte_size)
7567 buf->dtb_offset = offs + ecb->dte_size;
7568 buf = &state->dts_buffer[cpuid];
7569 dtrace_speculation_commit(state, cpuid, val);
7570 committed = 1;
7571 continue;
7572
7573 case DTRACEACT_DISCARD:
7574 dtrace_speculation_discard(state, cpuid, val);
7575 continue;
7576
7577 case DTRACEACT_DIFEXPR:
7578 case DTRACEACT_LIBACT:
7579 case DTRACEACT_PRINTF:
7580 case DTRACEACT_PRINTA:
7581 case DTRACEACT_SYSTEM:
7582 case DTRACEACT_FREOPEN:
7583 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
7584 case DTRACEACT_TRACEMEM:
7585 break;
7586
7587 case DTRACEACT_TRACEMEM_DYNSIZE:
7588 tracememsize = val;
7589 break;
7590
7591 case DTRACEACT_SYM:
7592 case DTRACEACT_MOD:
7593 if (!dtrace_priv_kernel(state))
7594 continue;
7595 break;
7596
7597 case DTRACEACT_USYM:
7598 case DTRACEACT_UMOD:
7599 case DTRACEACT_UADDR: {
7600 if (!dtrace_priv_proc(state))
7601 continue;
7602
7603 DTRACE_STORE(uint64_t, tomax,
7604 valoffs, (uint64_t)dtrace_proc_selfpid());
7605 DTRACE_STORE(uint64_t, tomax,
7606 valoffs + sizeof (uint64_t), val);
7607
7608 continue;
7609 }
7610
7611 case DTRACEACT_EXIT: {
7612 /*
7613 * For the exit action, we are going to attempt
7614 * to atomically set our activity to be
7615 * draining. If this fails (either because
7616 * another CPU has beat us to the exit action,
7617 * or because our current activity is something
7618 * other than ACTIVE or WARMUP), we will
7619 * continue. This assures that the exit action
7620 * can be successfully recorded at most once
7621 * when we're in the ACTIVE state. If we're
7622 * encountering the exit() action while in
7623 * COOLDOWN, however, we want to honor the new
7624 * status code. (We know that we're the only
7625 * thread in COOLDOWN, so there is no race.)
7626 */
7627 void *activity = &state->dts_activity;
7628 dtrace_activity_t current = state->dts_activity;
7629
7630 if (current == DTRACE_ACTIVITY_COOLDOWN)
7631 break;
7632
7633 if (current != DTRACE_ACTIVITY_WARMUP)
7634 current = DTRACE_ACTIVITY_ACTIVE;
7635
7636 if (dtrace_cas32(activity, current,
7637 DTRACE_ACTIVITY_DRAINING) != current) {
7638 *flags |= CPU_DTRACE_DROP;
7639 continue;
7640 }
7641
7642 break;
7643 }
7644
7645 default:
7646 ASSERT(0);
7647 }
7648
7649 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7650 uintptr_t end = valoffs + size;
7651
7652 if (tracememsize != 0 &&
7653 valoffs + tracememsize < end)
7654 {
7655 end = valoffs + tracememsize;
7656 tracememsize = 0;
7657 }
7658
7659 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7660 !dtrace_vcanload((void *)(uintptr_t)val,
7661 &dp->dtdo_rtype, NULL, &mstate, vstate))
7662 {
7663 continue;
7664 }
7665
7666 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7667 &val, end, act->dta_intuple,
7668 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7669 DIF_TF_BYREF: DIF_TF_BYUREF);
7670
7671 continue;
7672 }
7673
7674 switch (size) {
7675 case 0:
7676 break;
7677
7678 case sizeof (uint8_t):
7679 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7680 break;
7681 case sizeof (uint16_t):
7682 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7683 break;
7684 case sizeof (uint32_t):
7685 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7686 break;
7687 case sizeof (uint64_t):
7688 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7689 break;
7690 default:
7691 /*
7692 * Any other size should have been returned by
7693 * reference, not by value.
7694 */
7695 ASSERT(0);
7696 break;
7697 }
7698 }
7699
7700 if (*flags & CPU_DTRACE_DROP)
7701 continue;
7702
7703 if (*flags & CPU_DTRACE_FAULT) {
7704 int ndx;
7705 dtrace_action_t *err;
7706
7707 buf->dtb_errors++;
7708
7709 if (probe->dtpr_id == dtrace_probeid_error) {
7710 /*
7711 * There's nothing we can do -- we had an
7712 * error on the error probe. We bump an
7713 * error counter to at least indicate that
7714 * this condition happened.
7715 */
7716 dtrace_error(&state->dts_dblerrors);
7717 continue;
7718 }
7719
7720 if (vtime) {
7721 /*
7722 * Before recursing on dtrace_probe(), we
7723 * need to explicitly clear out our start
7724 * time to prevent it from being accumulated
7725 * into t_dtrace_vtime.
7726 */
7727
7728 /*
7729 * Darwin sets the sign bit on t_dtrace_tracing
7730 * to suspend accumulation to it.
7731 */
7732 dtrace_set_thread_tracing(current_thread(),
7733 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7734 }
7735
7736 /*
7737 * Iterate over the actions to figure out which action
7738 * we were processing when we experienced the error.
7739 * Note that act points _past_ the faulting action; if
7740 * act is ecb->dte_action, the fault was in the
7741 * predicate, if it's ecb->dte_action->dta_next it's
7742 * in action #1, and so on.
7743 */
7744 for (err = ecb->dte_action, ndx = 0;
7745 err != act; err = err->dta_next, ndx++)
7746 continue;
7747
7748 dtrace_probe_error(state, ecb->dte_epid, ndx,
7749 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7750 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7751 cpu_core[cpuid].cpuc_dtrace_illval);
7752
7753 continue;
7754 }
7755
7756 if (!committed)
7757 buf->dtb_offset = offs + ecb->dte_size;
7758 }
7759
7760 /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed
7761 to the current thread. Instead it should accrue to DTrace. */
7762 if (vtime) {
7763 thread_t thread = current_thread();
7764 int64_t t = dtrace_get_thread_tracing(thread);
7765
7766 if (t >= 0) {
7767 /* Usual case, accumulate time spent here into t_dtrace_tracing */
7768 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7769 } else {
7770 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7771 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7772 }
7773 }
7774
7775 dtrace_probe_exit(cookie);
7776 }
7777
7778 /*
7779 * DTrace Probe Hashing Functions
7780 *
7781 * The functions in this section (and indeed, the functions in remaining
7782 * sections) are not _called_ from probe context. (Any exceptions to this are
7783 * marked with a "Note:".) Rather, they are called from elsewhere in the
7784 * DTrace framework to look-up probes in, add probes to and remove probes from
7785 * the DTrace probe hashes. (Each probe is hashed by each element of the
7786 * probe tuple -- allowing for fast lookups, regardless of what was
7787 * specified.)
7788 */
7789 static uint_t
dtrace_hash_str(const char * p)7790 dtrace_hash_str(const char *p)
7791 {
7792 unsigned int g;
7793 uint_t hval = 0;
7794
7795 while (*p) {
7796 hval = (hval << 4) + *p++;
7797 if ((g = (hval & 0xf0000000)) != 0)
7798 hval ^= g >> 24;
7799 hval &= ~g;
7800 }
7801 return (hval);
7802 }
7803
7804 static const char*
dtrace_strkey_probe_provider(void * elm,uintptr_t offs)7805 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7806 {
7807 #pragma unused(offs)
7808 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7809 return probe->dtpr_provider->dtpv_name;
7810 }
7811
/*
 * String-key accessor for elements that embed their key string: returns the
 * address offs bytes past the start of elm, interpreted as a string.
 */
static const char *
dtrace_strkey_offset(void *elm, uintptr_t offs)
{
	uintptr_t addr = (uintptr_t)elm + offs;

	return ((const char *)addr);
}
7817
/*
 * String-key accessor for elements that hold a pointer to their key string:
 * loads the char pointer stored offs bytes past the start of elm and returns
 * it.
 */
static const char *
dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
{
	char **slot = (char **)((uintptr_t)elm + offs);

	return (*slot);
}
7823
7824 static dtrace_hash_t *
dtrace_hash_create(dtrace_strkey_f func,uintptr_t arg,uintptr_t nextoffs,uintptr_t prevoffs)7825 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7826 {
7827 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7828
7829 hash->dth_getstr = func;
7830 hash->dth_stroffs = arg;
7831 hash->dth_nextoffs = nextoffs;
7832 hash->dth_prevoffs = prevoffs;
7833
7834 hash->dth_size = 1;
7835 hash->dth_mask = hash->dth_size - 1;
7836
7837 hash->dth_tab = kmem_zalloc(hash->dth_size *
7838 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7839
7840 return (hash);
7841 }
7842
7843 /*
7844 * APPLE NOTE: dtrace_hash_destroy is not used.
7845 * It is called by dtrace_detach which is not
7846 * currently implemented. Revisit someday.
7847 */
7848 #if !defined(__APPLE__)
7849 static void
dtrace_hash_destroy(dtrace_hash_t * hash)7850 dtrace_hash_destroy(dtrace_hash_t *hash)
7851 {
7852 #if DEBUG
7853 int i;
7854
7855 for (i = 0; i < hash->dth_size; i++)
7856 ASSERT(hash->dth_tab[i] == NULL);
7857 #endif
7858
7859 kmem_free(hash->dth_tab,
7860 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7861 kmem_free(hash, sizeof (dtrace_hash_t));
7862 }
7863 #endif /* __APPLE__ */
7864
7865 static void
dtrace_hash_resize(dtrace_hash_t * hash)7866 dtrace_hash_resize(dtrace_hash_t *hash)
7867 {
7868 int size = hash->dth_size, i, ndx;
7869 int new_size = hash->dth_size << 1;
7870 int new_mask = new_size - 1;
7871 dtrace_hashbucket_t **new_tab, *bucket, *next;
7872
7873 ASSERT((new_size & new_mask) == 0);
7874
7875 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7876
7877 for (i = 0; i < size; i++) {
7878 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7879 void *elm = bucket->dthb_chain;
7880
7881 ASSERT(elm != NULL);
7882 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7883
7884 next = bucket->dthb_next;
7885 bucket->dthb_next = new_tab[ndx];
7886 new_tab[ndx] = bucket;
7887 }
7888 }
7889
7890 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7891 hash->dth_tab = new_tab;
7892 hash->dth_size = new_size;
7893 hash->dth_mask = new_mask;
7894 }
7895
7896 static void
dtrace_hash_add(dtrace_hash_t * hash,void * new)7897 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7898 {
7899 int hashval = DTRACE_HASHSTR(hash, new);
7900 int ndx = hashval & hash->dth_mask;
7901 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7902 void **nextp, **prevp;
7903
7904 for (; bucket != NULL; bucket = bucket->dthb_next) {
7905 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7906 goto add;
7907 }
7908
7909 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7910 dtrace_hash_resize(hash);
7911 dtrace_hash_add(hash, new);
7912 return;
7913 }
7914
7915 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7916 bucket->dthb_next = hash->dth_tab[ndx];
7917 hash->dth_tab[ndx] = bucket;
7918 hash->dth_nbuckets++;
7919
7920 add:
7921 nextp = DTRACE_HASHNEXT(hash, new);
7922 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7923 *nextp = bucket->dthb_chain;
7924
7925 if (bucket->dthb_chain != NULL) {
7926 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7927 ASSERT(*prevp == NULL);
7928 *prevp = new;
7929 }
7930
7931 bucket->dthb_chain = new;
7932 bucket->dthb_len++;
7933 }
7934
7935 static void *
dtrace_hash_lookup_string(dtrace_hash_t * hash,const char * str)7936 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7937 {
7938 int hashval = dtrace_hash_str(str);
7939 int ndx = hashval & hash->dth_mask;
7940 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7941
7942 for (; bucket != NULL; bucket = bucket->dthb_next) {
7943 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7944 return (bucket->dthb_chain);
7945 }
7946
7947 return (NULL);
7948 }
7949
7950 static dtrace_probe_t *
dtrace_hash_lookup(dtrace_hash_t * hash,void * template)7951 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7952 {
7953 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7954 }
7955
7956 static int
dtrace_hash_collisions(dtrace_hash_t * hash,void * template)7957 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7958 {
7959 int hashval = DTRACE_HASHSTR(hash, template);
7960 int ndx = hashval & hash->dth_mask;
7961 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7962
7963 for (; bucket != NULL; bucket = bucket->dthb_next) {
7964 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7965 return (bucket->dthb_len);
7966 }
7967
7968 return (0);
7969 }
7970
7971 static void
dtrace_hash_remove(dtrace_hash_t * hash,void * elm)7972 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7973 {
7974 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7975 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7976
7977 void **prevp = DTRACE_HASHPREV(hash, elm);
7978 void **nextp = DTRACE_HASHNEXT(hash, elm);
7979
7980 /*
7981 * Find the bucket that we're removing this elm from.
7982 */
7983 for (; bucket != NULL; bucket = bucket->dthb_next) {
7984 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7985 break;
7986 }
7987
7988 ASSERT(bucket != NULL);
7989
7990 if (*prevp == NULL) {
7991 if (*nextp == NULL) {
7992 /*
7993 * The removed element was the only element on this
7994 * bucket; we need to remove the bucket.
7995 */
7996 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7997
7998 ASSERT(bucket->dthb_chain == elm);
7999 ASSERT(b != NULL);
8000
8001 if (b == bucket) {
8002 hash->dth_tab[ndx] = bucket->dthb_next;
8003 } else {
8004 while (b->dthb_next != bucket)
8005 b = b->dthb_next;
8006 b->dthb_next = bucket->dthb_next;
8007 }
8008
8009 ASSERT(hash->dth_nbuckets > 0);
8010 hash->dth_nbuckets--;
8011 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
8012 return;
8013 }
8014
8015 bucket->dthb_chain = *nextp;
8016 } else {
8017 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
8018 }
8019
8020 if (*nextp != NULL)
8021 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
8022 }
8023
8024 /*
8025 * DTrace Utility Functions
8026 *
8027 * These are random utility functions that are _not_ called from probe context.
8028 */
8029 static int
dtrace_badattr(const dtrace_attribute_t * a)8030 dtrace_badattr(const dtrace_attribute_t *a)
8031 {
8032 return (a->dtat_name > DTRACE_STABILITY_MAX ||
8033 a->dtat_data > DTRACE_STABILITY_MAX ||
8034 a->dtat_class > DTRACE_CLASS_MAX);
8035 }
8036
8037 /*
8038 * Returns a dtrace-managed copy of a string, and will
8039 * deduplicate copies of the same string.
8040 * If the specified string is NULL, returns an empty string
8041 */
8042 static char *
dtrace_strref(const char * str)8043 dtrace_strref(const char *str)
8044 {
8045 dtrace_string_t *s = NULL;
8046 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
8047
8048 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8049
8050 if (str == NULL)
8051 str = "";
8052
8053 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8054 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8055 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8056 continue;
8057 }
8058 ASSERT(s->dtst_refcount != UINT32_MAX);
8059 s->dtst_refcount++;
8060 return s->dtst_str;
8061 }
8062
8063 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
8064 s->dtst_refcount = 1;
8065 (void) strlcpy(s->dtst_str, str, bufsize);
8066
8067 dtrace_hash_add(dtrace_strings, s);
8068
8069 return s->dtst_str;
8070 }
8071
8072 static void
dtrace_strunref(const char * str)8073 dtrace_strunref(const char *str)
8074 {
8075 ASSERT(str != NULL);
8076 dtrace_string_t *s = NULL;
8077 size_t bufsize = strlen(str) + 1;
8078
8079 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8080
8081 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8082 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8083 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8084 continue;
8085 }
8086 ASSERT(s->dtst_refcount != 0);
8087 s->dtst_refcount--;
8088 if (s->dtst_refcount == 0) {
8089 dtrace_hash_remove(dtrace_strings, s);
8090 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
8091 }
8092 return;
8093 }
8094 panic("attempt to unref non-existent string %s", str);
8095 }
8096
#define DTRACE_ISALPHA(c)	\
	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Check a name for illegal characters.  Returns 1 if the name is bad and 0
 * if it is acceptable.  A NULL or empty name is accepted.  The first
 * character must be an ASCII letter, '-', '_' or '.'; every subsequent
 * character must be an ASCII letter, a decimal digit, '-', '_', '.' or '`'.
 */
static int
dtrace_badname(const char *s)
{
	const char *cp;

	if (s == NULL || s[0] == '\0')
		return (0);

	/* The leading character may not be a digit or a backquote. */
	if (!(DTRACE_ISALPHA(s[0]) || s[0] == '-' || s[0] == '_' ||
	    s[0] == '.'))
		return (1);

	for (cp = s + 1; *cp != '\0'; cp++) {
		char c = *cp;

		if (DTRACE_ISALPHA(c) || (c >= '0' && c <= '9'))
			continue;

		switch (c) {
		case '-':
		case '_':
		case '.':
		case '`':
			continue;
		default:
			return (1);
		}
	}

	return (0);
}
8119
8120 static void
dtrace_cred2priv(cred_t * cr,uint32_t * privp,uid_t * uidp,zoneid_t * zoneidp)8121 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8122 {
8123 uint32_t priv;
8124
8125 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8126 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
8127 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
8128 }
8129 else {
8130 priv = DTRACE_PRIV_ALL;
8131 }
8132 *uidp = 0;
8133 *zoneidp = 0;
8134 } else {
8135 *uidp = crgetuid(cr);
8136 *zoneidp = crgetzoneid(cr);
8137
8138 priv = 0;
8139 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8140 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8141 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8142 priv |= DTRACE_PRIV_USER;
8143 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8144 priv |= DTRACE_PRIV_PROC;
8145 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8146 priv |= DTRACE_PRIV_OWNER;
8147 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8148 priv |= DTRACE_PRIV_ZONEOWNER;
8149 }
8150
8151 *privp = priv;
8152 }
8153
8154 #ifdef DTRACE_ERRDEBUG
8155 static void
dtrace_errdebug(const char * str)8156 dtrace_errdebug(const char *str)
8157 {
8158 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8159 int occupied = 0;
8160
8161 lck_mtx_lock(&dtrace_errlock);
8162 dtrace_errlast = str;
8163 dtrace_errthread = (kthread_t *)current_thread();
8164
8165 while (occupied++ < DTRACE_ERRHASHSZ) {
8166 if (dtrace_errhash[hval].dter_msg == str) {
8167 dtrace_errhash[hval].dter_count++;
8168 goto out;
8169 }
8170
8171 if (dtrace_errhash[hval].dter_msg != NULL) {
8172 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8173 continue;
8174 }
8175
8176 dtrace_errhash[hval].dter_msg = str;
8177 dtrace_errhash[hval].dter_count = 1;
8178 goto out;
8179 }
8180
8181 panic("dtrace: undersized error hash");
8182 out:
8183 lck_mtx_unlock(&dtrace_errlock);
8184 }
8185 #endif
8186
8187 /*
8188 * DTrace Matching Functions
8189 *
8190 * These functions are used to match groups of probes, given some elements of
8191 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8192 */
8193 static int
dtrace_match_priv(const dtrace_probe_t * prp,uint32_t priv,uid_t uid,zoneid_t zoneid)8194 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8195 zoneid_t zoneid)
8196 {
8197 if (priv != DTRACE_PRIV_ALL) {
8198 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8199 uint32_t match = priv & ppriv;
8200
8201 /*
8202 * No PRIV_DTRACE_* privileges...
8203 */
8204 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8205 DTRACE_PRIV_KERNEL)) == 0)
8206 return (0);
8207
8208 /*
8209 * No matching bits, but there were bits to match...
8210 */
8211 if (match == 0 && ppriv != 0)
8212 return (0);
8213
8214 /*
8215 * Need to have permissions to the process, but don't...
8216 */
8217 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8218 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8219 return (0);
8220 }
8221
8222 /*
8223 * Need to be in the same zone unless we possess the
8224 * privilege to examine all zones.
8225 */
8226 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8227 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8228 return (0);
8229 }
8230 }
8231
8232 return (1);
8233 }
8234
8235 /*
8236 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8237 * consists of input pattern strings and an ops-vector to evaluate them.
8238 * This function returns >0 for match, 0 for no match, and <0 for error.
8239 */
8240 static int
dtrace_match_probe(const dtrace_probe_t * prp,const dtrace_probekey_t * pkp,uint32_t priv,uid_t uid,zoneid_t zoneid)8241 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8242 uint32_t priv, uid_t uid, zoneid_t zoneid)
8243 {
8244 dtrace_provider_t *pvp = prp->dtpr_provider;
8245 int rv;
8246
8247 if (pvp->dtpv_defunct)
8248 return (0);
8249
8250 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8251 return (rv);
8252
8253 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8254 return (rv);
8255
8256 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8257 return (rv);
8258
8259 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8260 return (rv);
8261
8262 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8263 return (0);
8264
8265 return (rv);
8266 }
8267
/*
 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
 * libc's version, the kernel version only applies to 8-bit ASCII strings.
 * In addition, all of the recursion cases except for '*' matching have been
 * unwound.  For '*', we still implement recursive evaluation, but a depth
 * counter is maintained and matching is aborted if we recurse too deep.
 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
 *
 * Arguments:
 *   s     - input string; NULL is treated as the empty string.
 *   p     - glob pattern; a NULL pattern never matches.
 *   depth - current recursion depth; callers outside this function pass 0.
 *
 * Pattern syntax handled below: '?' matches any one character, '*' matches
 * any run of characters, '[...]' matches one character against a set (with
 * '!' as the first set character negating it and 'a-z' style ranges), and
 * '\' makes the following pattern character literal.
 */
static int
dtrace_match_glob(const char *s, const char *p, int depth)
{
	const char *olds;
	char s1, c;
	int gs;

	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
		return (-1);

	if (s == NULL)
		s = ""; /* treat NULL as empty string */

	/*
	 * Each pass through 'top' consumes one input character (s1) and one
	 * pattern element.  olds remembers where this pass started in the
	 * input so that the '*' case can retry from that position.
	 */
top:
	olds = s;
	s1 = *s++;

	if (p == NULL)
		return (0);

	/* End of pattern: match only if the input is exhausted as well. */
	if ((c = *p++) == '\0')
		return (s1 == '\0');

	switch (c) {
	case '[': {
		/*
		 * ok counts the set members that accept s1; lc is the
		 * previous set character, used as the low end of a range.
		 */
		int ok = 0, notflag = 0;
		char lc = '\0';

		/* A bracket expression must consume one input character. */
		if (s1 == '\0')
			return (0);

		if (*p == '!') {
			notflag = 1;
			p++;
		}

		/* An unterminated bracket expression never matches. */
		if ((c = *p++) == '\0')
			return (0);

		do {
			/*
			 * A '-' that follows a set character and is not the
			 * last character before ']' denotes a range lc-c.
			 */
			if (c == '-' && lc != '\0' && *p != ']') {
				if ((c = *p++) == '\0')
					return (0);
				if (c == '\\' && (c = *p++) == '\0')
					return (0);

				if (notflag) {
					if (s1 < lc || s1 > c)
						ok++;
					else
						return (0);
				} else if (lc <= s1 && s1 <= c)
					ok++;

			} else if (c == '\\' && (c = *p++) == '\0')
				return (0);

			lc = c; /* save left-hand 'c' for next iteration */

			/*
			 * Test s1 against the single character c.  In a
			 * negated set any hit is an immediate failure.
			 */
			if (notflag) {
				if (s1 != c)
					ok++;
				else
					return (0);
			} else if (s1 == c)
				ok++;

			if ((c = *p++) == '\0')
				return (0);

		} while (c != ']');

		if (ok)
			goto top;

		return (0);
	}

	case '\\':
		/* Escaped character: compare the next pattern byte literally. */
		if ((c = *p++) == '\0')
			return (0);
		OS_FALLTHROUGH;

	default:
		/* Ordinary character: must equal the input character. */
		if (c != s1)
			return (0);
		OS_FALLTHROUGH;

	case '?':
		/* Any one character matches, but there must be one. */
		if (s1 != '\0')
			goto top;
		return (0);

	case '*':
		while (*p == '*')
			p++; /* consecutive *'s are identical to a single one */

		/* A trailing '*' matches whatever input remains. */
		if (*p == '\0')
			return (1);

		/*
		 * Try the rest of the pattern against each successive suffix
		 * of the input, starting where this pass began.  A non-zero
		 * result (match or depth error) is returned as-is.
		 */
		for (s = olds; *s != '\0'; s++) {
			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
				return (gs);
		}

		return (0);
	}
}
8385
/*
 * Literal-string matcher.  Reports a match only when s is non-NULL and is
 * the very same pointer as p; the contents of the strings are not examined.
 * (Pattern strings produced by dtrace_probekey() come from dtrace_strref(),
 * which hands out one shared copy per distinct string.)  depth is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (s == p);
}
8393
/*
 * Module-name matcher.  Reports a match when the pattern p is a prefix of
 * the module name s and that prefix ends either at the end of s or at a
 * '.' in s -- so "com.apple" matches both "com.apple" and "com.apple.foo"
 * but not "com.applesauce".  NULL arguments never match.  depth is unused.
 */
/*ARGSUSED*/
static int
dtrace_match_module(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	size_t plen;

	if (s == NULL || p == NULL)
		return (0);

	plen = strlen(p);

	if (strncmp(p, s, plen) != 0)
		return (0);

	/* The character following the prefix decides the outcome. */
	return (s[plen] == '\0' || s[plen] == '.');
}
8413
/*
 * Matcher selected for NULL or empty patterns: every input matches, so all
 * three arguments are ignored and 1 is always returned.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
	const int matched = 1;

	return (matched);
}
8421
/*
 * Matcher that accepts any non-empty input: returns 1 when s is non-NULL
 * and has at least one character, 0 otherwise.  The pattern p and depth are
 * ignored.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (s[0] != '\0');
}
8429
8430 static int
dtrace_match(const dtrace_probekey_t * pkp,uint32_t priv,uid_t uid,zoneid_t zoneid,int (* matched)(dtrace_probe_t *,void *,void *),void * arg1,void * arg2)8431 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8432 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8433 {
8434 dtrace_probe_t *probe;
8435 dtrace_provider_t prov_template = {
8436 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8437 };
8438
8439 dtrace_probe_t template = {
8440 .dtpr_provider = &prov_template,
8441 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8442 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8443 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8444 };
8445
8446 dtrace_hash_t *hash = NULL;
8447 int len, rc, best = INT_MAX, nmatched = 0;
8448 dtrace_id_t i;
8449
8450 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8451
8452 /*
8453 * If the probe ID is specified in the key, just lookup by ID and
8454 * invoke the match callback once if a matching probe is found.
8455 */
8456 if (pkp->dtpk_id != DTRACE_IDNONE) {
8457 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8458 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8459 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8460 return (DTRACE_MATCH_FAIL);
8461 nmatched++;
8462 }
8463 return (nmatched);
8464 }
8465
8466 /*
8467 * We want to find the most distinct of the provider name, module name,
8468 * function name, and name. So for each one that is not a glob
8469 * pattern or empty string, we perform a lookup in the corresponding
8470 * hash and use the hash table with the fewest collisions to do our
8471 * search.
8472 */
8473 if (pkp->dtpk_pmatch == &dtrace_match_string &&
8474 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8475 best = len;
8476 hash = dtrace_byprov;
8477 }
8478
8479 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8480 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8481 best = len;
8482 hash = dtrace_bymod;
8483 }
8484
8485 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8486 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8487 best = len;
8488 hash = dtrace_byfunc;
8489 }
8490
8491 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8492 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8493 best = len;
8494 hash = dtrace_byname;
8495 }
8496
8497 /*
8498 * If we did not select a hash table, iterate over every probe and
8499 * invoke our callback for each one that matches our input probe key.
8500 */
8501 if (hash == NULL) {
8502 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8503 if ((probe = dtrace_probes[i]) == NULL ||
8504 dtrace_match_probe(probe, pkp, priv, uid,
8505 zoneid) <= 0)
8506 continue;
8507
8508 nmatched++;
8509
8510 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8511 if (rc == DTRACE_MATCH_FAIL)
8512 return (DTRACE_MATCH_FAIL);
8513 break;
8514 }
8515 }
8516
8517 return (nmatched);
8518 }
8519
8520 /*
8521 * If we selected a hash table, iterate over each probe of the same key
8522 * name and invoke the callback for every probe that matches the other
8523 * attributes of our input probe key.
8524 */
8525 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8526 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8527
8528 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8529 continue;
8530
8531 nmatched++;
8532
8533 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8534 if (rc == DTRACE_MATCH_FAIL)
8535 return (DTRACE_MATCH_FAIL);
8536 break;
8537 }
8538 }
8539
8540 return (nmatched);
8541 }
8542
8543 /*
8544 * Return the function pointer dtrace_probecmp() should use to compare the
8545 * specified pattern with a string. For NULL or empty patterns, we select
8546 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8547 * For non-empty non-glob strings, we use dtrace_match_string().
8548 */
8549 static dtrace_probekey_f *
dtrace_probekey_func(const char * p)8550 dtrace_probekey_func(const char *p)
8551 {
8552 char c;
8553
8554 if (p == NULL || *p == '\0')
8555 return (&dtrace_match_nul);
8556
8557 while ((c = *p++) != '\0') {
8558 if (c == '[' || c == '?' || c == '*' || c == '\\')
8559 return (&dtrace_match_glob);
8560 }
8561
8562 return (&dtrace_match_string);
8563 }
8564
8565 static dtrace_probekey_f *
dtrace_probekey_module_func(const char * p)8566 dtrace_probekey_module_func(const char *p)
8567 {
8568 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8569
8570 dtrace_probekey_f *f = dtrace_probekey_func(p);
8571 if (f == &dtrace_match_string) {
8572 dtrace_probe_t template = {
8573 .dtpr_mod = (char *)(uintptr_t)p,
8574 };
8575 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8576 return (&dtrace_match_module);
8577 }
8578 return (&dtrace_match_string);
8579 }
8580 return f;
8581 }
8582
8583 /*
8584 * Build a probe comparison key for use with dtrace_match_probe() from the
8585 * given probe description. By convention, a null key only matches anchored
8586 * probes: if each field is the empty string, reset dtpk_fmatch to
8587 * dtrace_match_nonzero().
8588 */
8589 static void
dtrace_probekey(const dtrace_probedesc_t * pdp,dtrace_probekey_t * pkp)8590 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8591 {
8592
8593 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8594 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8595
8596 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8597 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8598
8599 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8600 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8601
8602 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8603 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8604
8605 pkp->dtpk_id = pdp->dtpd_id;
8606
8607 if (pkp->dtpk_id == DTRACE_IDNONE &&
8608 pkp->dtpk_pmatch == &dtrace_match_nul &&
8609 pkp->dtpk_mmatch == &dtrace_match_nul &&
8610 pkp->dtpk_fmatch == &dtrace_match_nul &&
8611 pkp->dtpk_nmatch == &dtrace_match_nul)
8612 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8613 }
8614
8615 static void
dtrace_probekey_release(dtrace_probekey_t * pkp)8616 dtrace_probekey_release(dtrace_probekey_t *pkp)
8617 {
8618 dtrace_strunref(pkp->dtpk_prov);
8619 dtrace_strunref(pkp->dtpk_mod);
8620 dtrace_strunref(pkp->dtpk_func);
8621 dtrace_strunref(pkp->dtpk_name);
8622 }
8623
8624 static int
dtrace_cond_provider_match(dtrace_probedesc_t * desc,void * data)8625 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8626 {
8627 if (desc == NULL)
8628 return 1;
8629
8630 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8631
8632 return func((char*)data, desc->dtpd_provider, 0);
8633 }
8634
8635 /*
8636 * DTrace Provider-to-Framework API Functions
8637 *
8638 * These functions implement much of the Provider-to-Framework API, as
8639 * described in <sys/dtrace.h>. The parts of the API not in this section are
8640 * the functions in the API for probe management (found below), and
8641 * dtrace_probe() itself (found above).
8642 */
8643
8644 /*
8645 * Register the calling provider with the DTrace framework. This should
8646 * generally be called by DTrace providers in their attach(9E) entry point.
8647 */
8648 int
dtrace_register(const char * name,const dtrace_pattr_t * pap,uint32_t priv,cred_t * cr,const dtrace_pops_t * pops,void * arg,dtrace_provider_id_t * idp)8649 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8650 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8651 {
8652 dtrace_provider_t *provider;
8653
8654 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8655 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8656 "arguments", name ? name : "<NULL>");
8657 return (EINVAL);
8658 }
8659
8660 if (name[0] == '\0' || dtrace_badname(name)) {
8661 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8662 "provider name", name);
8663 return (EINVAL);
8664 }
8665
8666 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8667 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8668 pops->dtps_destroy == NULL ||
8669 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8670 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8671 "provider ops", name);
8672 return (EINVAL);
8673 }
8674
8675 if (dtrace_badattr(&pap->dtpa_provider) ||
8676 dtrace_badattr(&pap->dtpa_mod) ||
8677 dtrace_badattr(&pap->dtpa_func) ||
8678 dtrace_badattr(&pap->dtpa_name) ||
8679 dtrace_badattr(&pap->dtpa_args)) {
8680 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8681 "provider attributes", name);
8682 return (EINVAL);
8683 }
8684
8685 if (priv & ~DTRACE_PRIV_ALL) {
8686 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8687 "privilege attributes", name);
8688 return (EINVAL);
8689 }
8690
8691 if ((priv & DTRACE_PRIV_KERNEL) &&
8692 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8693 pops->dtps_usermode == NULL) {
8694 cmn_err(CE_WARN, "failed to register provider '%s': need "
8695 "dtps_usermode() op for given privilege attributes", name);
8696 return (EINVAL);
8697 }
8698
8699 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8700
8701 provider->dtpv_attr = *pap;
8702 provider->dtpv_priv.dtpp_flags = priv;
8703 if (cr != NULL) {
8704 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8705 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8706 }
8707 provider->dtpv_pops = *pops;
8708
8709 if (pops->dtps_provide == NULL) {
8710 ASSERT(pops->dtps_provide_module != NULL);
8711 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8712 }
8713
8714 if (pops->dtps_provide_module == NULL) {
8715 ASSERT(pops->dtps_provide != NULL);
8716 provider->dtpv_pops.dtps_provide_module =
8717 dtrace_provide_module_nullop;
8718 }
8719
8720 if (pops->dtps_suspend == NULL) {
8721 ASSERT(pops->dtps_resume == NULL);
8722 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8723 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8724 }
8725
8726 provider->dtpv_arg = arg;
8727 *idp = (dtrace_provider_id_t)provider;
8728
8729 if (pops == &dtrace_provider_ops) {
8730 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8731 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8732
8733 provider->dtpv_name = dtrace_strref(name);
8734
8735 ASSERT(dtrace_anon.dta_enabling == NULL);
8736
8737 /*
8738 * We make sure that the DTrace provider is at the head of
8739 * the provider chain.
8740 */
8741 provider->dtpv_next = dtrace_provider;
8742 dtrace_provider = provider;
8743 return (0);
8744 }
8745
8746 lck_mtx_lock(&dtrace_provider_lock);
8747 lck_mtx_lock(&dtrace_lock);
8748
8749 provider->dtpv_name = dtrace_strref(name);
8750
8751 /*
8752 * If there is at least one provider registered, we'll add this
8753 * provider after the first provider.
8754 */
8755 if (dtrace_provider != NULL) {
8756 provider->dtpv_next = dtrace_provider->dtpv_next;
8757 dtrace_provider->dtpv_next = provider;
8758 } else {
8759 dtrace_provider = provider;
8760 }
8761
8762 if (dtrace_retained != NULL) {
8763 dtrace_enabling_provide(provider);
8764
8765 /*
8766 * Now we need to call dtrace_enabling_matchall_with_cond() --
8767 * with a condition matching the provider name we just added,
8768 * which will acquire cpu_lock and dtrace_lock. We therefore need
8769 * to drop all of our locks before calling into it...
8770 */
8771 lck_mtx_unlock(&dtrace_lock);
8772 lck_mtx_unlock(&dtrace_provider_lock);
8773
8774 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8775 dtrace_enabling_matchall_with_cond(&cond);
8776
8777 return (0);
8778 }
8779
8780 lck_mtx_unlock(&dtrace_lock);
8781 lck_mtx_unlock(&dtrace_provider_lock);
8782
8783 return (0);
8784 }
8785
8786 /*
8787 * Unregister the specified provider from the DTrace framework. This should
8788 * generally be called by DTrace providers in their detach(9E) entry point.
8789 */
8790 int
dtrace_unregister(dtrace_provider_id_t id)8791 dtrace_unregister(dtrace_provider_id_t id)
8792 {
8793 dtrace_provider_t *old = (dtrace_provider_t *)id;
8794 dtrace_provider_t *prev = NULL;
8795 int self = 0;
8796 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8797 dtrace_probe_t template = {
8798 .dtpr_provider = old
8799 };
8800
8801 if (old->dtpv_pops.dtps_enable ==
8802 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8803 /*
8804 * If DTrace itself is the provider, we're called with locks
8805 * already held.
8806 */
8807 ASSERT(old == dtrace_provider);
8808 ASSERT(dtrace_devi != NULL);
8809 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8810 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8811 self = 1;
8812
8813 if (dtrace_provider->dtpv_next != NULL) {
8814 /*
8815 * There's another provider here; return failure.
8816 */
8817 return (EBUSY);
8818 }
8819 } else {
8820 lck_mtx_lock(&dtrace_provider_lock);
8821 lck_mtx_lock(&mod_lock);
8822 lck_mtx_lock(&dtrace_lock);
8823 }
8824
8825 /*
8826 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8827 * probes, we refuse to let providers slither away, unless this
8828 * provider has already been explicitly invalidated.
8829 */
8830 if (!old->dtpv_defunct &&
8831 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8832 dtrace_anon.dta_state->dts_necbs > 0))) {
8833 if (!self) {
8834 lck_mtx_unlock(&dtrace_lock);
8835 lck_mtx_unlock(&mod_lock);
8836 lck_mtx_unlock(&dtrace_provider_lock);
8837 }
8838 return (EBUSY);
8839 }
8840
8841 /*
8842 * Attempt to destroy the probes associated with this provider.
8843 */
8844 if (old->dtpv_ecb_count!=0) {
8845 /*
8846 * We have at least one ECB; we can't remove this provider.
8847 */
8848 if (!self) {
8849 lck_mtx_unlock(&dtrace_lock);
8850 lck_mtx_unlock(&mod_lock);
8851 lck_mtx_unlock(&dtrace_provider_lock);
8852 }
8853 return (EBUSY);
8854 }
8855
8856 /*
8857 * All of the probes for this provider are disabled; we can safely
8858 * remove all of them from their hash chains and from the probe array.
8859 */
8860 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8861 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8862 if (probe->dtpr_provider != old)
8863 continue;
8864
8865 dtrace_probes[probe->dtpr_id - 1] = NULL;
8866 old->dtpv_probe_count--;
8867
8868 dtrace_hash_remove(dtrace_bymod, probe);
8869 dtrace_hash_remove(dtrace_byfunc, probe);
8870 dtrace_hash_remove(dtrace_byname, probe);
8871
8872 if (first == NULL) {
8873 first = probe;
8874 probe->dtpr_nextmod = NULL;
8875 } else {
8876 /*
8877 * Use nextmod as the chain of probes to remove
8878 */
8879 probe->dtpr_nextmod = first;
8880 first = probe;
8881 }
8882 }
8883
8884 for (probe = first; probe != NULL; probe = next) {
8885 next = probe->dtpr_nextmod;
8886 dtrace_hash_remove(dtrace_byprov, probe);
8887 }
8888
8889 /*
8890 * The provider's probes have been removed from the hash chains and
8891 * from the probe array. Now issue a dtrace_sync() to be sure that
8892 * everyone has cleared out from any probe array processing.
8893 */
8894 dtrace_sync();
8895
8896 for (probe = first; probe != NULL; probe = next) {
8897 next = probe->dtpr_nextmod;
8898
8899 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8900 probe->dtpr_arg);
8901 dtrace_strunref(probe->dtpr_mod);
8902 dtrace_strunref(probe->dtpr_func);
8903 dtrace_strunref(probe->dtpr_name);
8904 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8905 zfree(dtrace_probe_t_zone, probe);
8906 }
8907
8908 if ((prev = dtrace_provider) == old) {
8909 ASSERT(self || dtrace_devi == NULL);
8910 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8911 dtrace_provider = old->dtpv_next;
8912 } else {
8913 while (prev != NULL && prev->dtpv_next != old)
8914 prev = prev->dtpv_next;
8915
8916 if (prev == NULL) {
8917 panic("attempt to unregister non-existent "
8918 "dtrace provider %p\n", (void *)id);
8919 }
8920
8921 prev->dtpv_next = old->dtpv_next;
8922 }
8923
8924 dtrace_strunref(old->dtpv_name);
8925
8926 if (!self) {
8927 lck_mtx_unlock(&dtrace_lock);
8928 lck_mtx_unlock(&mod_lock);
8929 lck_mtx_unlock(&dtrace_provider_lock);
8930 }
8931
8932 kmem_free(old, sizeof (dtrace_provider_t));
8933
8934 return (0);
8935 }
8936
8937 /*
8938 * Invalidate the specified provider. All subsequent probe lookups for the
8939 * specified provider will fail, but its probes will not be removed.
8940 */
8941 void
dtrace_invalidate(dtrace_provider_id_t id)8942 dtrace_invalidate(dtrace_provider_id_t id)
8943 {
8944 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8945
8946 ASSERT(pvp->dtpv_pops.dtps_enable !=
8947 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8948
8949 lck_mtx_lock(&dtrace_provider_lock);
8950 lck_mtx_lock(&dtrace_lock);
8951
8952 pvp->dtpv_defunct = 1;
8953
8954 lck_mtx_unlock(&dtrace_lock);
8955 lck_mtx_unlock(&dtrace_provider_lock);
8956 }
8957
8958 /*
8959 * Indicate whether or not DTrace has attached.
8960 */
8961 int
dtrace_attached(void)8962 dtrace_attached(void)
8963 {
8964 /*
8965 * dtrace_provider will be non-NULL iff the DTrace driver has
8966 * attached. (It's non-NULL because DTrace is always itself a
8967 * provider.)
8968 */
8969 return (dtrace_provider != NULL);
8970 }
8971
8972 /*
8973 * Remove all the unenabled probes for the given provider. This function is
8974 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8975 * -- just as many of its associated probes as it can.
8976 */
8977 int
dtrace_condense(dtrace_provider_id_t id)8978 dtrace_condense(dtrace_provider_id_t id)
8979 {
8980 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8981 dtrace_probe_t *probe, *first = NULL;
8982 dtrace_probe_t template = {
8983 .dtpr_provider = prov
8984 };
8985
8986 /*
8987 * Make sure this isn't the dtrace provider itself.
8988 */
8989 ASSERT(prov->dtpv_pops.dtps_enable !=
8990 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8991
8992 lck_mtx_lock(&dtrace_provider_lock);
8993 lck_mtx_lock(&dtrace_lock);
8994
8995 /*
8996 * Attempt to destroy the probes associated with this provider.
8997 */
8998 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8999 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
9000
9001 if (probe->dtpr_provider != prov)
9002 continue;
9003
9004 if (probe->dtpr_ecb != NULL)
9005 continue;
9006
9007 dtrace_probes[probe->dtpr_id - 1] = NULL;
9008 prov->dtpv_probe_count--;
9009
9010 dtrace_hash_remove(dtrace_bymod, probe);
9011 dtrace_hash_remove(dtrace_byfunc, probe);
9012 dtrace_hash_remove(dtrace_byname, probe);
9013
9014 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
9015 probe->dtpr_arg);
9016 dtrace_strunref(probe->dtpr_mod);
9017 dtrace_strunref(probe->dtpr_func);
9018 dtrace_strunref(probe->dtpr_name);
9019 if (first == NULL) {
9020 first = probe;
9021 probe->dtpr_nextmod = NULL;
9022 } else {
9023 /*
9024 * Use nextmod as the chain of probes to remove
9025 */
9026 probe->dtpr_nextmod = first;
9027 first = probe;
9028 }
9029 }
9030
9031 for (probe = first; probe != NULL; probe = first) {
9032 first = probe->dtpr_nextmod;
9033 dtrace_hash_remove(dtrace_byprov, probe);
9034 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
9035 zfree(dtrace_probe_t_zone, probe);
9036 }
9037
9038 lck_mtx_unlock(&dtrace_lock);
9039 lck_mtx_unlock(&dtrace_provider_lock);
9040
9041 return (0);
9042 }
9043
9044 /*
9045 * DTrace Probe Management Functions
9046 *
9047 * The functions in this section perform the DTrace probe management,
9048 * including functions to create probes, look-up probes, and call into the
9049 * providers to request that probes be provided. Some of these functions are
9050 * in the Provider-to-Framework API; these functions can be identified by the
9051 * fact that they are not declared "static".
9052 */
9053
9054 /*
9055 * Create a probe with the specified module name, function name, and name.
9056 */
9057 dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov,const char * mod,const char * func,const char * name,int aframes,void * arg)9058 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
9059 const char *func, const char *name, int aframes, void *arg)
9060 {
9061 dtrace_probe_t *probe, **probes;
9062 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
9063 dtrace_id_t id;
9064
9065 if (provider == dtrace_provider) {
9066 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9067 } else {
9068 lck_mtx_lock(&dtrace_lock);
9069 }
9070
9071 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
9072 VM_BESTFIT | VM_SLEEP);
9073
9074 probe = zalloc_flags(dtrace_probe_t_zone, Z_WAITOK | Z_ZERO);
9075
9076 probe->dtpr_id = id;
9077 probe->dtpr_gen = dtrace_probegen++;
9078 probe->dtpr_mod = dtrace_strref(mod);
9079 probe->dtpr_func = dtrace_strref(func);
9080 probe->dtpr_name = dtrace_strref(name);
9081 probe->dtpr_arg = arg;
9082 probe->dtpr_aframes = aframes;
9083 probe->dtpr_provider = provider;
9084
9085 dtrace_hash_add(dtrace_byprov, probe);
9086 dtrace_hash_add(dtrace_bymod, probe);
9087 dtrace_hash_add(dtrace_byfunc, probe);
9088 dtrace_hash_add(dtrace_byname, probe);
9089
9090 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
9091 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
9092 size_t nsize = osize * 2;
9093
9094 probes = kmem_zalloc(nsize, KM_SLEEP);
9095
9096 dtrace_probe_t **oprobes = dtrace_probes;
9097
9098 bcopy(oprobes, probes, osize);
9099 dtrace_membar_producer();
9100 dtrace_probes = probes;
9101
9102 dtrace_sync();
9103
9104 /*
9105 * All CPUs are now seeing the new probes array; we can
9106 * safely free the old array.
9107 */
9108 kmem_free(oprobes, osize);
9109 dtrace_nprobes *= 2;
9110
9111 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
9112 }
9113
9114 ASSERT(dtrace_probes[id - 1] == NULL);
9115 dtrace_probes[id - 1] = probe;
9116 provider->dtpv_probe_count++;
9117
9118 if (provider != dtrace_provider)
9119 lck_mtx_unlock(&dtrace_lock);
9120
9121 return (id);
9122 }
9123
9124 static dtrace_probe_t *
dtrace_probe_lookup_id(dtrace_id_t id)9125 dtrace_probe_lookup_id(dtrace_id_t id)
9126 {
9127 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9128
9129 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
9130 return (NULL);
9131
9132 return (dtrace_probes[id - 1]);
9133 }
9134
9135 static int
dtrace_probe_lookup_match(dtrace_probe_t * probe,void * arg1,void * arg2)9136 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
9137 {
9138 #pragma unused(arg2)
9139 *((dtrace_id_t *)arg1) = probe->dtpr_id;
9140
9141 return (DTRACE_MATCH_DONE);
9142 }
9143
9144 /*
9145 * Look up a probe based on provider and one or more of module name, function
9146 * name and probe name.
9147 */
9148 dtrace_id_t
dtrace_probe_lookup(dtrace_provider_id_t prid,const char * mod,const char * func,const char * name)9149 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9150 const char *func, const char *name)
9151 {
9152 dtrace_probekey_t pkey;
9153 dtrace_id_t id;
9154 int match;
9155
9156 lck_mtx_lock(&dtrace_lock);
9157
9158 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9159 pkey.dtpk_pmatch = &dtrace_match_string;
9160 pkey.dtpk_mod = dtrace_strref(mod);
9161 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9162 pkey.dtpk_func = dtrace_strref(func);
9163 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9164 pkey.dtpk_name = dtrace_strref(name);
9165 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9166 pkey.dtpk_id = DTRACE_IDNONE;
9167
9168 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9169 dtrace_probe_lookup_match, &id, NULL);
9170
9171 dtrace_probekey_release(&pkey);
9172
9173 lck_mtx_unlock(&dtrace_lock);
9174
9175 ASSERT(match == 1 || match == 0);
9176 return (match ? id : 0);
9177 }
9178
9179 /*
9180 * Returns the probe argument associated with the specified probe.
9181 */
9182 void *
dtrace_probe_arg(dtrace_provider_id_t id,dtrace_id_t pid)9183 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9184 {
9185 dtrace_probe_t *probe;
9186 void *rval = NULL;
9187
9188 lck_mtx_lock(&dtrace_lock);
9189
9190 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9191 probe->dtpr_provider == (dtrace_provider_t *)id)
9192 rval = probe->dtpr_arg;
9193
9194 lck_mtx_unlock(&dtrace_lock);
9195
9196 return (rval);
9197 }
9198
9199 /*
9200 * Copy a probe into a probe description.
9201 */
9202 static void
dtrace_probe_description(const dtrace_probe_t * prp,dtrace_probedesc_t * pdp)9203 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9204 {
9205 bzero(pdp, sizeof (dtrace_probedesc_t));
9206 pdp->dtpd_id = prp->dtpr_id;
9207
9208 /* APPLE NOTE: Darwin employs size bounded string operation. */
9209 (void) strlcpy(pdp->dtpd_provider,
9210 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9211
9212 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9213 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9214 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9215 }
9216
9217 /*
9218 * Called to indicate that a probe -- or probes -- should be provided by a
9219 * specfied provider. If the specified description is NULL, the provider will
9220 * be told to provide all of its probes. (This is done whenever a new
9221 * consumer comes along, or whenever a retained enabling is to be matched.) If
9222 * the specified description is non-NULL, the provider is given the
9223 * opportunity to dynamically provide the specified probe, allowing providers
9224 * to support the creation of probes on-the-fly. (So-called _autocreated_
9225 * probes.) If the provider is NULL, the operations will be applied to all
9226 * providers; if the provider is non-NULL the operations will only be applied
9227 * to the specified provider. The dtrace_provider_lock must be held, and the
9228 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9229 * will need to grab the dtrace_lock when it reenters the framework through
9230 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9231 */
9232 static void
dtrace_probe_provide(dtrace_probedesc_t * desc,dtrace_provider_t * prv)9233 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9234 {
9235 struct modctl *ctl;
9236 int all = 0;
9237
9238 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9239
9240 if (prv == NULL) {
9241 all = 1;
9242 prv = dtrace_provider;
9243 }
9244
9245 do {
9246 /*
9247 * First, call the blanket provide operation.
9248 */
9249 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9250
9251 /*
9252 * Now call the per-module provide operation. We will grab
9253 * mod_lock to prevent the list from being modified. Note
9254 * that this also prevents the mod_busy bits from changing.
9255 * (mod_busy can only be changed with mod_lock held.)
9256 */
9257 lck_mtx_lock(&mod_lock);
9258
9259 ctl = dtrace_modctl_list;
9260 while (ctl) {
9261 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9262 ctl = ctl->mod_next;
9263 }
9264
9265 lck_mtx_unlock(&mod_lock);
9266 } while (all && (prv = prv->dtpv_next) != NULL);
9267 }
9268
9269 /*
9270 * Iterate over each probe, and call the Framework-to-Provider API function
9271 * denoted by offs.
9272 */
9273 static void
dtrace_probe_foreach(uintptr_t offs)9274 dtrace_probe_foreach(uintptr_t offs)
9275 {
9276 dtrace_provider_t *prov;
9277 void (*func)(void *, dtrace_id_t, void *);
9278 dtrace_probe_t *probe;
9279 dtrace_icookie_t cookie;
9280 int i;
9281
9282 /*
9283 * We disable interrupts to walk through the probe array. This is
9284 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9285 * won't see stale data.
9286 */
9287 cookie = dtrace_interrupt_disable();
9288
9289 for (i = 0; i < dtrace_nprobes; i++) {
9290 if ((probe = dtrace_probes[i]) == NULL)
9291 continue;
9292
9293 if (probe->dtpr_ecb == NULL) {
9294 /*
9295 * This probe isn't enabled -- don't call the function.
9296 */
9297 continue;
9298 }
9299
9300 prov = probe->dtpr_provider;
9301 func = *((void(**)(void *, dtrace_id_t, void *))
9302 ((uintptr_t)&prov->dtpv_pops + offs));
9303
9304 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9305 }
9306
9307 dtrace_interrupt_enable(cookie);
9308 }
9309
9310 static int
dtrace_probe_enable(const dtrace_probedesc_t * desc,dtrace_enabling_t * enab,dtrace_ecbdesc_t * ep)9311 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9312 {
9313 dtrace_probekey_t pkey;
9314 uint32_t priv;
9315 uid_t uid;
9316 zoneid_t zoneid;
9317 int err;
9318
9319 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9320
9321 dtrace_ecb_create_cache = NULL;
9322
9323 if (desc == NULL) {
9324 /*
9325 * If we're passed a NULL description, we're being asked to
9326 * create an ECB with a NULL probe.
9327 */
9328 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9329 return (0);
9330 }
9331
9332 dtrace_probekey(desc, &pkey);
9333 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9334 &priv, &uid, &zoneid);
9335
9336 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9337
9338 dtrace_probekey_release(&pkey);
9339
9340 return err;
9341 }
9342
9343 /*
9344 * DTrace Helper Provider Functions
9345 */
9346 static void
dtrace_dofattr2attr(dtrace_attribute_t * attr,const dof_attr_t dofattr)9347 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9348 {
9349 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9350 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9351 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9352 }
9353
9354 static void
dtrace_dofprov2hprov(dtrace_helper_provdesc_t * hprov,const dof_provider_t * dofprov,char * strtab)9355 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9356 const dof_provider_t *dofprov, char *strtab)
9357 {
9358 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9359 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9360 dofprov->dofpv_provattr);
9361 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9362 dofprov->dofpv_modattr);
9363 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9364 dofprov->dofpv_funcattr);
9365 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9366 dofprov->dofpv_nameattr);
9367 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9368 dofprov->dofpv_argsattr);
9369 }
9370
9371 static void
dtrace_helper_provide_one(dof_helper_t * dhp,dof_sec_t * sec,proc_t * p)9372 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9373 {
9374 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9375 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9376 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9377 dof_provider_t *provider;
9378 dof_probe_t *probe;
9379 uint32_t *off, *enoff;
9380 uint8_t *arg;
9381 char *strtab;
9382 uint_t i, nprobes;
9383 dtrace_helper_provdesc_t dhpv;
9384 dtrace_helper_probedesc_t dhpb;
9385 dtrace_meta_t *meta = dtrace_meta_pid;
9386 dtrace_mops_t *mops = &meta->dtm_mops;
9387 void *parg;
9388
9389 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9390 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9391 provider->dofpv_strtab * dof->dofh_secsize);
9392 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9393 provider->dofpv_probes * dof->dofh_secsize);
9394 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9395 provider->dofpv_prargs * dof->dofh_secsize);
9396 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9397 provider->dofpv_proffs * dof->dofh_secsize);
9398
9399 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9400 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9401 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9402 enoff = NULL;
9403
9404 /*
9405 * See dtrace_helper_provider_validate().
9406 */
9407 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9408 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9409 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9410 provider->dofpv_prenoffs * dof->dofh_secsize);
9411 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9412 }
9413
9414 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9415
9416 /*
9417 * Create the provider.
9418 */
9419 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9420
9421 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9422 return;
9423
9424 meta->dtm_count++;
9425
9426 /*
9427 * Create the probes.
9428 */
9429 for (i = 0; i < nprobes; i++) {
9430 probe = (dof_probe_t *)(uintptr_t)(daddr +
9431 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9432
9433 dhpb.dthpb_mod = dhp->dofhp_mod;
9434 dhpb.dthpb_func = strtab + probe->dofpr_func;
9435 dhpb.dthpb_name = strtab + probe->dofpr_name;
9436 #if !defined(__APPLE__)
9437 dhpb.dthpb_base = probe->dofpr_addr;
9438 #else
9439 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9440 #endif
9441 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9442 dhpb.dthpb_noffs = probe->dofpr_noffs;
9443 if (enoff != NULL) {
9444 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9445 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9446 } else {
9447 dhpb.dthpb_enoffs = NULL;
9448 dhpb.dthpb_nenoffs = 0;
9449 }
9450 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9451 dhpb.dthpb_nargc = probe->dofpr_nargc;
9452 dhpb.dthpb_xargc = probe->dofpr_xargc;
9453 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9454 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9455
9456 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9457 }
9458
9459 /*
9460 * Since we just created probes, we need to match our enablings
9461 * against those, with a precondition knowing that we have only
9462 * added probes from this provider
9463 */
9464 char *prov_name = mops->dtms_provider_name(parg);
9465 ASSERT(prov_name != NULL);
9466 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9467
9468 dtrace_enabling_matchall_with_cond(&cond);
9469 }
9470
9471 static void
dtrace_helper_provide(dof_helper_t * dhp,proc_t * p)9472 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9473 {
9474 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9475 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9476 uint32_t i;
9477
9478 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9479
9480 for (i = 0; i < dof->dofh_secnum; i++) {
9481 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9482 dof->dofh_secoff + i * dof->dofh_secsize);
9483
9484 if (sec->dofs_type != DOF_SECT_PROVIDER)
9485 continue;
9486
9487 dtrace_helper_provide_one(dhp, sec, p);
9488 }
9489 }
9490
9491 static void
dtrace_helper_provider_remove_one(dof_helper_t * dhp,dof_sec_t * sec,proc_t * p)9492 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9493 {
9494 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9495 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9496 dof_sec_t *str_sec;
9497 dof_provider_t *provider;
9498 char *strtab;
9499 dtrace_helper_provdesc_t dhpv;
9500 dtrace_meta_t *meta = dtrace_meta_pid;
9501 dtrace_mops_t *mops = &meta->dtm_mops;
9502
9503 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9504 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9505 provider->dofpv_strtab * dof->dofh_secsize);
9506
9507 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9508
9509 /*
9510 * Create the provider.
9511 */
9512 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9513
9514 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9515
9516 meta->dtm_count--;
9517 }
9518
9519 static void
dtrace_helper_provider_remove(dof_helper_t * dhp,proc_t * p)9520 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9521 {
9522 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9523 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9524 uint32_t i;
9525
9526 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9527
9528 for (i = 0; i < dof->dofh_secnum; i++) {
9529 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9530 dof->dofh_secoff + i * dof->dofh_secsize);
9531
9532 if (sec->dofs_type != DOF_SECT_PROVIDER)
9533 continue;
9534
9535 dtrace_helper_provider_remove_one(dhp, sec, p);
9536 }
9537 }
9538
9539 /*
9540 * DTrace Meta Provider-to-Framework API Functions
9541 *
9542 * These functions implement the Meta Provider-to-Framework API, as described
9543 * in <sys/dtrace.h>.
9544 */
9545 int
dtrace_meta_register(const char * name,const dtrace_mops_t * mops,void * arg,dtrace_meta_provider_id_t * idp)9546 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9547 dtrace_meta_provider_id_t *idp)
9548 {
9549 dtrace_meta_t *meta;
9550 dtrace_helpers_t *help, *next;
9551 uint_t i;
9552
9553 *idp = DTRACE_METAPROVNONE;
9554
9555 /*
9556 * We strictly don't need the name, but we hold onto it for
9557 * debuggability. All hail error queues!
9558 */
9559 if (name == NULL) {
9560 cmn_err(CE_WARN, "failed to register meta-provider: "
9561 "invalid name");
9562 return (EINVAL);
9563 }
9564
9565 if (mops == NULL ||
9566 mops->dtms_create_probe == NULL ||
9567 mops->dtms_provide_proc == NULL ||
9568 mops->dtms_remove_proc == NULL) {
9569 cmn_err(CE_WARN, "failed to register meta-register %s: "
9570 "invalid ops", name);
9571 return (EINVAL);
9572 }
9573
9574 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9575 meta->dtm_mops = *mops;
9576 meta->dtm_arg = arg;
9577
9578 lck_mtx_lock(&dtrace_meta_lock);
9579 lck_mtx_lock(&dtrace_lock);
9580
9581 if (dtrace_meta_pid != NULL) {
9582 lck_mtx_unlock(&dtrace_lock);
9583 lck_mtx_unlock(&dtrace_meta_lock);
9584 cmn_err(CE_WARN, "failed to register meta-register %s: "
9585 "user-land meta-provider exists", name);
9586 kmem_free(meta, sizeof (dtrace_meta_t));
9587 return (EINVAL);
9588 }
9589
9590 meta->dtm_name = dtrace_strref(name);
9591
9592 dtrace_meta_pid = meta;
9593 *idp = (dtrace_meta_provider_id_t)meta;
9594
9595 /*
9596 * If there are providers and probes ready to go, pass them
9597 * off to the new meta provider now.
9598 */
9599
9600 help = dtrace_deferred_pid;
9601 dtrace_deferred_pid = NULL;
9602
9603 lck_mtx_unlock(&dtrace_lock);
9604
9605 while (help != NULL) {
9606 for (i = 0; i < help->dthps_nprovs; i++) {
9607 proc_t *p = proc_find(help->dthps_pid);
9608 if (p == PROC_NULL)
9609 continue;
9610 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9611 p);
9612 proc_rele(p);
9613 }
9614
9615 next = help->dthps_next;
9616 help->dthps_next = NULL;
9617 help->dthps_prev = NULL;
9618 help->dthps_deferred = 0;
9619 help = next;
9620 }
9621
9622 lck_mtx_unlock(&dtrace_meta_lock);
9623
9624 return (0);
9625 }
9626
9627 int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)9628 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9629 {
9630 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9631
9632 lck_mtx_lock(&dtrace_meta_lock);
9633 lck_mtx_lock(&dtrace_lock);
9634
9635 if (old == dtrace_meta_pid) {
9636 pp = &dtrace_meta_pid;
9637 } else {
9638 panic("attempt to unregister non-existent "
9639 "dtrace meta-provider %p\n", (void *)old);
9640 }
9641
9642 if (old->dtm_count != 0) {
9643 lck_mtx_unlock(&dtrace_lock);
9644 lck_mtx_unlock(&dtrace_meta_lock);
9645 return (EBUSY);
9646 }
9647
9648 *pp = NULL;
9649
9650 dtrace_strunref(old->dtm_name);
9651
9652 lck_mtx_unlock(&dtrace_lock);
9653 lck_mtx_unlock(&dtrace_meta_lock);
9654
9655 kmem_free(old, sizeof (dtrace_meta_t));
9656
9657 return (0);
9658 }
9659
9660
9661 /*
9662 * DTrace DIF Object Functions
9663 */
9664 static int
dtrace_difo_err(uint_t pc,const char * format,...)9665 dtrace_difo_err(uint_t pc, const char *format, ...)
9666 {
9667 if (dtrace_err_verbose) {
9668 va_list alist;
9669
9670 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9671 va_start(alist, format);
9672 (void) vuprintf(format, alist);
9673 va_end(alist);
9674 }
9675
9676 #ifdef DTRACE_ERRDEBUG
9677 dtrace_errdebug(format);
9678 #endif
9679 return (1);
9680 }
9681
9682 /*
9683 * Validate a DTrace DIF object by checking the IR instructions. The following
9684 * rules are currently enforced by dtrace_difo_validate():
9685 *
9686 * 1. Each instruction must have a valid opcode
9687 * 2. Each register, string, variable, or subroutine reference must be valid
9688 * 3. No instruction can modify register %r0 (must be zero)
9689 * 4. All instruction reserved bits must be set to zero
9690 * 5. The last instruction must be a "ret" instruction
9691 * 6. All branch targets must reference a valid instruction _after_ the branch
9692 */
9693 static int
dtrace_difo_validate(dtrace_difo_t * dp,dtrace_vstate_t * vstate,uint_t nregs,cred_t * cr)9694 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9695 cred_t *cr)
9696 {
9697 int err = 0;
9698 uint_t i;
9699
9700 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9701 int kcheckload;
9702 uint_t pc;
9703 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9704
9705 kcheckload = cr == NULL ||
9706 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9707
9708 dp->dtdo_destructive = 0;
9709
9710 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9711 dif_instr_t instr = dp->dtdo_buf[pc];
9712
9713 uint_t r1 = DIF_INSTR_R1(instr);
9714 uint_t r2 = DIF_INSTR_R2(instr);
9715 uint_t rd = DIF_INSTR_RD(instr);
9716 uint_t rs = DIF_INSTR_RS(instr);
9717 uint_t label = DIF_INSTR_LABEL(instr);
9718 uint_t v = DIF_INSTR_VAR(instr);
9719 uint_t subr = DIF_INSTR_SUBR(instr);
9720 uint_t type = DIF_INSTR_TYPE(instr);
9721 uint_t op = DIF_INSTR_OP(instr);
9722
9723 switch (op) {
9724 case DIF_OP_OR:
9725 case DIF_OP_XOR:
9726 case DIF_OP_AND:
9727 case DIF_OP_SLL:
9728 case DIF_OP_SRL:
9729 case DIF_OP_SRA:
9730 case DIF_OP_SUB:
9731 case DIF_OP_ADD:
9732 case DIF_OP_MUL:
9733 case DIF_OP_SDIV:
9734 case DIF_OP_UDIV:
9735 case DIF_OP_SREM:
9736 case DIF_OP_UREM:
9737 case DIF_OP_COPYS:
9738 if (r1 >= nregs)
9739 err += efunc(pc, "invalid register %u\n", r1);
9740 if (r2 >= nregs)
9741 err += efunc(pc, "invalid register %u\n", r2);
9742 if (rd >= nregs)
9743 err += efunc(pc, "invalid register %u\n", rd);
9744 if (rd == 0)
9745 err += efunc(pc, "cannot write to %%r0\n");
9746 break;
9747 case DIF_OP_NOT:
9748 case DIF_OP_MOV:
9749 case DIF_OP_ALLOCS:
9750 if (r1 >= nregs)
9751 err += efunc(pc, "invalid register %u\n", r1);
9752 if (r2 != 0)
9753 err += efunc(pc, "non-zero reserved bits\n");
9754 if (rd >= nregs)
9755 err += efunc(pc, "invalid register %u\n", rd);
9756 if (rd == 0)
9757 err += efunc(pc, "cannot write to %%r0\n");
9758 break;
9759 case DIF_OP_LDSB:
9760 case DIF_OP_LDSH:
9761 case DIF_OP_LDSW:
9762 case DIF_OP_LDUB:
9763 case DIF_OP_LDUH:
9764 case DIF_OP_LDUW:
9765 case DIF_OP_LDX:
9766 if (r1 >= nregs)
9767 err += efunc(pc, "invalid register %u\n", r1);
9768 if (r2 != 0)
9769 err += efunc(pc, "non-zero reserved bits\n");
9770 if (rd >= nregs)
9771 err += efunc(pc, "invalid register %u\n", rd);
9772 if (rd == 0)
9773 err += efunc(pc, "cannot write to %%r0\n");
9774 if (kcheckload)
9775 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9776 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9777 break;
9778 case DIF_OP_RLDSB:
9779 case DIF_OP_RLDSH:
9780 case DIF_OP_RLDSW:
9781 case DIF_OP_RLDUB:
9782 case DIF_OP_RLDUH:
9783 case DIF_OP_RLDUW:
9784 case DIF_OP_RLDX:
9785 if (r1 >= nregs)
9786 err += efunc(pc, "invalid register %u\n", r1);
9787 if (r2 != 0)
9788 err += efunc(pc, "non-zero reserved bits\n");
9789 if (rd >= nregs)
9790 err += efunc(pc, "invalid register %u\n", rd);
9791 if (rd == 0)
9792 err += efunc(pc, "cannot write to %%r0\n");
9793 break;
9794 case DIF_OP_ULDSB:
9795 case DIF_OP_ULDSH:
9796 case DIF_OP_ULDSW:
9797 case DIF_OP_ULDUB:
9798 case DIF_OP_ULDUH:
9799 case DIF_OP_ULDUW:
9800 case DIF_OP_ULDX:
9801 if (r1 >= nregs)
9802 err += efunc(pc, "invalid register %u\n", r1);
9803 if (r2 != 0)
9804 err += efunc(pc, "non-zero reserved bits\n");
9805 if (rd >= nregs)
9806 err += efunc(pc, "invalid register %u\n", rd);
9807 if (rd == 0)
9808 err += efunc(pc, "cannot write to %%r0\n");
9809 break;
9810 case DIF_OP_STB:
9811 case DIF_OP_STH:
9812 case DIF_OP_STW:
9813 case DIF_OP_STX:
9814 if (r1 >= nregs)
9815 err += efunc(pc, "invalid register %u\n", r1);
9816 if (r2 != 0)
9817 err += efunc(pc, "non-zero reserved bits\n");
9818 if (rd >= nregs)
9819 err += efunc(pc, "invalid register %u\n", rd);
9820 if (rd == 0)
9821 err += efunc(pc, "cannot write to 0 address\n");
9822 break;
9823 case DIF_OP_CMP:
9824 case DIF_OP_SCMP:
9825 if (r1 >= nregs)
9826 err += efunc(pc, "invalid register %u\n", r1);
9827 if (r2 >= nregs)
9828 err += efunc(pc, "invalid register %u\n", r2);
9829 if (rd != 0)
9830 err += efunc(pc, "non-zero reserved bits\n");
9831 break;
9832 case DIF_OP_TST:
9833 if (r1 >= nregs)
9834 err += efunc(pc, "invalid register %u\n", r1);
9835 if (r2 != 0 || rd != 0)
9836 err += efunc(pc, "non-zero reserved bits\n");
9837 break;
9838 case DIF_OP_BA:
9839 case DIF_OP_BE:
9840 case DIF_OP_BNE:
9841 case DIF_OP_BG:
9842 case DIF_OP_BGU:
9843 case DIF_OP_BGE:
9844 case DIF_OP_BGEU:
9845 case DIF_OP_BL:
9846 case DIF_OP_BLU:
9847 case DIF_OP_BLE:
9848 case DIF_OP_BLEU:
9849 if (label >= dp->dtdo_len) {
9850 err += efunc(pc, "invalid branch target %u\n",
9851 label);
9852 }
9853 if (label <= pc) {
9854 err += efunc(pc, "backward branch to %u\n",
9855 label);
9856 }
9857 break;
9858 case DIF_OP_RET:
9859 if (r1 != 0 || r2 != 0)
9860 err += efunc(pc, "non-zero reserved bits\n");
9861 if (rd >= nregs)
9862 err += efunc(pc, "invalid register %u\n", rd);
9863 break;
9864 case DIF_OP_NOP:
9865 case DIF_OP_POPTS:
9866 case DIF_OP_FLUSHTS:
9867 if (r1 != 0 || r2 != 0 || rd != 0)
9868 err += efunc(pc, "non-zero reserved bits\n");
9869 break;
9870 case DIF_OP_SETX:
9871 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9872 err += efunc(pc, "invalid integer ref %u\n",
9873 DIF_INSTR_INTEGER(instr));
9874 }
9875 if (rd >= nregs)
9876 err += efunc(pc, "invalid register %u\n", rd);
9877 if (rd == 0)
9878 err += efunc(pc, "cannot write to %%r0\n");
9879 break;
9880 case DIF_OP_SETS:
9881 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9882 err += efunc(pc, "invalid string ref %u\n",
9883 DIF_INSTR_STRING(instr));
9884 }
9885 if (rd >= nregs)
9886 err += efunc(pc, "invalid register %u\n", rd);
9887 if (rd == 0)
9888 err += efunc(pc, "cannot write to %%r0\n");
9889 break;
9890 case DIF_OP_LDGA:
9891 case DIF_OP_LDTA:
9892 if (r1 > DIF_VAR_ARRAY_MAX)
9893 err += efunc(pc, "invalid array %u\n", r1);
9894 if (r2 >= nregs)
9895 err += efunc(pc, "invalid register %u\n", r2);
9896 if (rd >= nregs)
9897 err += efunc(pc, "invalid register %u\n", rd);
9898 if (rd == 0)
9899 err += efunc(pc, "cannot write to %%r0\n");
9900 break;
9901 case DIF_OP_LDGS:
9902 case DIF_OP_LDTS:
9903 case DIF_OP_LDLS:
9904 case DIF_OP_LDGAA:
9905 case DIF_OP_LDTAA:
9906 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9907 err += efunc(pc, "invalid variable %u\n", v);
9908 if (rd >= nregs)
9909 err += efunc(pc, "invalid register %u\n", rd);
9910 if (rd == 0)
9911 err += efunc(pc, "cannot write to %%r0\n");
9912 break;
9913 case DIF_OP_STGS:
9914 case DIF_OP_STTS:
9915 case DIF_OP_STLS:
9916 case DIF_OP_STGAA:
9917 case DIF_OP_STTAA:
9918 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9919 err += efunc(pc, "invalid variable %u\n", v);
9920 if (rs >= nregs)
9921 err += efunc(pc, "invalid register %u\n", rd);
9922 break;
9923 case DIF_OP_CALL:
9924 if (subr > DIF_SUBR_MAX &&
9925 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9926 err += efunc(pc, "invalid subr %u\n", subr);
9927 if (rd >= nregs)
9928 err += efunc(pc, "invalid register %u\n", rd);
9929 if (rd == 0)
9930 err += efunc(pc, "cannot write to %%r0\n");
9931
9932 switch (subr) {
9933 case DIF_SUBR_COPYOUT:
9934 case DIF_SUBR_COPYOUTSTR:
9935 case DIF_SUBR_KDEBUG_TRACE:
9936 case DIF_SUBR_KDEBUG_TRACE_STRING:
9937 case DIF_SUBR_PHYSMEM_READ:
9938 case DIF_SUBR_PHYSMEM_WRITE:
9939 case DIF_SUBR_LIVEDUMP:
9940 dp->dtdo_destructive = 1;
9941 break;
9942 default:
9943 break;
9944 }
9945 break;
9946 case DIF_OP_PUSHTR:
9947 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9948 err += efunc(pc, "invalid ref type %u\n", type);
9949 if (r2 >= nregs)
9950 err += efunc(pc, "invalid register %u\n", r2);
9951 if (rs >= nregs)
9952 err += efunc(pc, "invalid register %u\n", rs);
9953 break;
9954 case DIF_OP_PUSHTV:
9955 if (type != DIF_TYPE_CTF)
9956 err += efunc(pc, "invalid val type %u\n", type);
9957 if (r2 >= nregs)
9958 err += efunc(pc, "invalid register %u\n", r2);
9959 if (rs >= nregs)
9960 err += efunc(pc, "invalid register %u\n", rs);
9961 break;
9962 case DIF_OP_STRIP:
9963 if (r1 >= nregs)
9964 err += efunc(pc, "invalid register %u\n", r1);
9965 if (!dtrace_is_valid_ptrauth_key(r2))
9966 err += efunc(pc, "invalid key\n");
9967 if (rd >= nregs)
9968 err += efunc(pc, "invalid register %u\n", rd);
9969 if (rd == 0)
9970 err += efunc(pc, "cannot write to %%r0\n");
9971 break;
9972 default:
9973 err += efunc(pc, "invalid opcode %u\n",
9974 DIF_INSTR_OP(instr));
9975 }
9976 }
9977
9978 if (dp->dtdo_len != 0 &&
9979 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9980 err += efunc(dp->dtdo_len - 1,
9981 "expected 'ret' as last DIF instruction\n");
9982 }
9983
9984 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9985 /*
9986 * If we're not returning by reference, the size must be either
9987 * 0 or the size of one of the base types.
9988 */
9989 switch (dp->dtdo_rtype.dtdt_size) {
9990 case 0:
9991 case sizeof (uint8_t):
9992 case sizeof (uint16_t):
9993 case sizeof (uint32_t):
9994 case sizeof (uint64_t):
9995 break;
9996
9997 default:
9998 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9999 }
10000 }
10001
10002 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
10003 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
10004 dtrace_diftype_t *vt, *et;
10005 uint_t id;
10006 int ndx;
10007
10008 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
10009 v->dtdv_scope != DIFV_SCOPE_THREAD &&
10010 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
10011 err += efunc(i, "unrecognized variable scope %d\n",
10012 v->dtdv_scope);
10013 break;
10014 }
10015
10016 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
10017 v->dtdv_kind != DIFV_KIND_SCALAR) {
10018 err += efunc(i, "unrecognized variable type %d\n",
10019 v->dtdv_kind);
10020 break;
10021 }
10022
10023 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
10024 err += efunc(i, "%d exceeds variable id limit\n", id);
10025 break;
10026 }
10027
10028 if (id < DIF_VAR_OTHER_UBASE)
10029 continue;
10030
10031 /*
10032 * For user-defined variables, we need to check that this
10033 * definition is identical to any previous definition that we
10034 * encountered.
10035 */
10036 ndx = id - DIF_VAR_OTHER_UBASE;
10037
10038 switch (v->dtdv_scope) {
10039 case DIFV_SCOPE_GLOBAL:
10040 if (maxglobal == -1 || ndx > maxglobal)
10041 maxglobal = ndx;
10042
10043 if (ndx < vstate->dtvs_nglobals) {
10044 dtrace_statvar_t *svar;
10045
10046 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10047 existing = &svar->dtsv_var;
10048 }
10049
10050 break;
10051
10052 case DIFV_SCOPE_THREAD:
10053 if (maxtlocal == -1 || ndx > maxtlocal)
10054 maxtlocal = ndx;
10055
10056 if (ndx < vstate->dtvs_ntlocals)
10057 existing = &vstate->dtvs_tlocals[ndx];
10058 break;
10059
10060 case DIFV_SCOPE_LOCAL:
10061 if (maxlocal == -1 || ndx > maxlocal)
10062 maxlocal = ndx;
10063 if (ndx < vstate->dtvs_nlocals) {
10064 dtrace_statvar_t *svar;
10065
10066 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10067 existing = &svar->dtsv_var;
10068 }
10069
10070 break;
10071 }
10072
10073 vt = &v->dtdv_type;
10074
10075 if (vt->dtdt_flags & DIF_TF_BYREF) {
10076 if (vt->dtdt_size == 0) {
10077 err += efunc(i, "zero-sized variable\n");
10078 break;
10079 }
10080
10081 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
10082 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10083 vt->dtdt_size > dtrace_statvar_maxsize) {
10084 err += efunc(i, "oversized by-ref static\n");
10085 break;
10086 }
10087 }
10088
10089 if (existing == NULL || existing->dtdv_id == 0)
10090 continue;
10091
10092 ASSERT(existing->dtdv_id == v->dtdv_id);
10093 ASSERT(existing->dtdv_scope == v->dtdv_scope);
10094
10095 if (existing->dtdv_kind != v->dtdv_kind)
10096 err += efunc(i, "%d changed variable kind\n", id);
10097
10098 et = &existing->dtdv_type;
10099
10100 if (vt->dtdt_flags != et->dtdt_flags) {
10101 err += efunc(i, "%d changed variable type flags\n", id);
10102 break;
10103 }
10104
10105 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
10106 err += efunc(i, "%d changed variable type size\n", id);
10107 break;
10108 }
10109 }
10110
10111 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
10112 dif_instr_t instr = dp->dtdo_buf[pc];
10113
10114 uint_t v = DIF_INSTR_VAR(instr);
10115 uint_t op = DIF_INSTR_OP(instr);
10116
10117 switch (op) {
10118 case DIF_OP_LDGS:
10119 case DIF_OP_LDGAA:
10120 case DIF_OP_STGS:
10121 case DIF_OP_STGAA:
10122 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
10123 err += efunc(pc, "invalid variable %u\n", v);
10124 break;
10125 case DIF_OP_LDTS:
10126 case DIF_OP_LDTAA:
10127 case DIF_OP_STTS:
10128 case DIF_OP_STTAA:
10129 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
10130 err += efunc(pc, "invalid variable %u\n", v);
10131 break;
10132 case DIF_OP_LDLS:
10133 case DIF_OP_STLS:
10134 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
10135 err += efunc(pc, "invalid variable %u\n", v);
10136 break;
10137 default:
10138 break;
10139 }
10140 }
10141
10142 return (err);
10143 }
10144
10145 /*
10146 * Validate a DTrace DIF object that it is to be used as a helper. Helpers
10147 * are much more constrained than normal DIFOs. Specifically, they may
10148 * not:
10149 *
10150 * 1. Make calls to subroutines other than copyin(), copyinstr() or
10151 * miscellaneous string routines
10152 * 2. Access DTrace variables other than the args[] array, and the
10153 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10154 * 3. Have thread-local variables.
10155 * 4. Have dynamic variables.
10156 */
10157 static int
dtrace_difo_validate_helper(dtrace_difo_t * dp)10158 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10159 {
10160 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10161 int err = 0;
10162 uint_t pc;
10163
10164 for (pc = 0; pc < dp->dtdo_len; pc++) {
10165 dif_instr_t instr = dp->dtdo_buf[pc];
10166
10167 uint_t v = DIF_INSTR_VAR(instr);
10168 uint_t subr = DIF_INSTR_SUBR(instr);
10169 uint_t op = DIF_INSTR_OP(instr);
10170
10171 switch (op) {
10172 case DIF_OP_OR:
10173 case DIF_OP_XOR:
10174 case DIF_OP_AND:
10175 case DIF_OP_SLL:
10176 case DIF_OP_SRL:
10177 case DIF_OP_SRA:
10178 case DIF_OP_SUB:
10179 case DIF_OP_ADD:
10180 case DIF_OP_MUL:
10181 case DIF_OP_SDIV:
10182 case DIF_OP_UDIV:
10183 case DIF_OP_SREM:
10184 case DIF_OP_UREM:
10185 case DIF_OP_COPYS:
10186 case DIF_OP_NOT:
10187 case DIF_OP_MOV:
10188 case DIF_OP_RLDSB:
10189 case DIF_OP_RLDSH:
10190 case DIF_OP_RLDSW:
10191 case DIF_OP_RLDUB:
10192 case DIF_OP_RLDUH:
10193 case DIF_OP_RLDUW:
10194 case DIF_OP_RLDX:
10195 case DIF_OP_ULDSB:
10196 case DIF_OP_ULDSH:
10197 case DIF_OP_ULDSW:
10198 case DIF_OP_ULDUB:
10199 case DIF_OP_ULDUH:
10200 case DIF_OP_ULDUW:
10201 case DIF_OP_ULDX:
10202 case DIF_OP_STB:
10203 case DIF_OP_STH:
10204 case DIF_OP_STW:
10205 case DIF_OP_STX:
10206 case DIF_OP_ALLOCS:
10207 case DIF_OP_CMP:
10208 case DIF_OP_SCMP:
10209 case DIF_OP_TST:
10210 case DIF_OP_BA:
10211 case DIF_OP_BE:
10212 case DIF_OP_BNE:
10213 case DIF_OP_BG:
10214 case DIF_OP_BGU:
10215 case DIF_OP_BGE:
10216 case DIF_OP_BGEU:
10217 case DIF_OP_BL:
10218 case DIF_OP_BLU:
10219 case DIF_OP_BLE:
10220 case DIF_OP_BLEU:
10221 case DIF_OP_RET:
10222 case DIF_OP_NOP:
10223 case DIF_OP_POPTS:
10224 case DIF_OP_FLUSHTS:
10225 case DIF_OP_SETX:
10226 case DIF_OP_SETS:
10227 case DIF_OP_LDGA:
10228 case DIF_OP_LDLS:
10229 case DIF_OP_STGS:
10230 case DIF_OP_STLS:
10231 case DIF_OP_PUSHTR:
10232 case DIF_OP_PUSHTV:
10233 break;
10234
10235 case DIF_OP_LDGS:
10236 if (v >= DIF_VAR_OTHER_UBASE)
10237 break;
10238
10239 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10240 break;
10241
10242 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10243 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10244 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10245 v == DIF_VAR_UID || v == DIF_VAR_GID)
10246 break;
10247
10248 err += efunc(pc, "illegal variable %u\n", v);
10249 break;
10250
10251 case DIF_OP_LDTA:
10252 case DIF_OP_LDTS:
10253 case DIF_OP_LDGAA:
10254 case DIF_OP_LDTAA:
10255 err += efunc(pc, "illegal dynamic variable load\n");
10256 break;
10257
10258 case DIF_OP_STTS:
10259 case DIF_OP_STGAA:
10260 case DIF_OP_STTAA:
10261 err += efunc(pc, "illegal dynamic variable store\n");
10262 break;
10263
10264 case DIF_OP_CALL:
10265 switch (subr) {
10266 case DIF_SUBR_ALLOCA:
10267 case DIF_SUBR_BCOPY:
10268 case DIF_SUBR_COPYIN:
10269 case DIF_SUBR_COPYINTO:
10270 case DIF_SUBR_COPYINSTR:
10271 case DIF_SUBR_HTONS:
10272 case DIF_SUBR_HTONL:
10273 case DIF_SUBR_HTONLL:
10274 case DIF_SUBR_INDEX:
10275 case DIF_SUBR_INET_NTOA:
10276 case DIF_SUBR_INET_NTOA6:
10277 case DIF_SUBR_INET_NTOP:
10278 case DIF_SUBR_JSON:
10279 case DIF_SUBR_LLTOSTR:
10280 case DIF_SUBR_NTOHS:
10281 case DIF_SUBR_NTOHL:
10282 case DIF_SUBR_NTOHLL:
10283 case DIF_SUBR_RINDEX:
10284 case DIF_SUBR_STRCHR:
10285 case DIF_SUBR_STRTOLL:
10286 case DIF_SUBR_STRJOIN:
10287 case DIF_SUBR_STRRCHR:
10288 case DIF_SUBR_STRSTR:
10289 break;
10290 default:
10291 err += efunc(pc, "invalid subr %u\n", subr);
10292 }
10293 break;
10294
10295 default:
10296 err += efunc(pc, "invalid opcode %u\n",
10297 DIF_INSTR_OP(instr));
10298 }
10299 }
10300
10301 return (err);
10302 }
10303
10304 /*
10305 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10306 * basis; 0 if not.
10307 */
10308 static int
dtrace_difo_cacheable(dtrace_difo_t * dp)10309 dtrace_difo_cacheable(dtrace_difo_t *dp)
10310 {
10311 uint_t i;
10312
10313 if (dp == NULL)
10314 return (0);
10315
10316 for (i = 0; i < dp->dtdo_varlen; i++) {
10317 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10318
10319 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10320 continue;
10321
10322 switch (v->dtdv_id) {
10323 case DIF_VAR_CURTHREAD:
10324 case DIF_VAR_PID:
10325 case DIF_VAR_TID:
10326 case DIF_VAR_EXECNAME:
10327 case DIF_VAR_ZONENAME:
10328 break;
10329
10330 default:
10331 return (0);
10332 }
10333 }
10334
10335 /*
10336 * This DIF object may be cacheable. Now we need to look for any
10337 * array loading instructions, any memory loading instructions, or
10338 * any stores to thread-local variables.
10339 */
10340 for (i = 0; i < dp->dtdo_len; i++) {
10341 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10342
10343 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10344 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10345 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10346 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10347 return (0);
10348 }
10349
10350 return (1);
10351 }
10352
10353 static void
dtrace_difo_hold(dtrace_difo_t * dp)10354 dtrace_difo_hold(dtrace_difo_t *dp)
10355 {
10356 uint_t i;
10357
10358 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10359
10360 dp->dtdo_refcnt++;
10361 ASSERT(dp->dtdo_refcnt != 0);
10362
10363 /*
10364 * We need to check this DIF object for references to the variable
10365 * DIF_VAR_VTIMESTAMP.
10366 */
10367 for (i = 0; i < dp->dtdo_varlen; i++) {
10368 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10369
10370 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10371 continue;
10372
10373 if (dtrace_vtime_references++ == 0)
10374 dtrace_vtime_enable();
10375 }
10376 }
10377
10378 /*
10379 * This routine calculates the dynamic variable chunksize for a given DIF
10380 * object. The calculation is not fool-proof, and can probably be tricked by
10381 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10382 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10383 * if a dynamic variable size exceeds the chunksize.
10384 */
10385 static void
dtrace_difo_chunksize(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10386 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10387 {
10388 uint64_t sval = 0;
10389 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10390 const dif_instr_t *text = dp->dtdo_buf;
10391 uint_t pc, srd = 0;
10392 uint_t ttop = 0;
10393 size_t size, ksize;
10394 uint_t id, i;
10395
10396 for (pc = 0; pc < dp->dtdo_len; pc++) {
10397 dif_instr_t instr = text[pc];
10398 uint_t op = DIF_INSTR_OP(instr);
10399 uint_t rd = DIF_INSTR_RD(instr);
10400 uint_t r1 = DIF_INSTR_R1(instr);
10401 uint_t nkeys = 0;
10402 uchar_t scope;
10403
10404 dtrace_key_t *key = tupregs;
10405
10406 switch (op) {
10407 case DIF_OP_SETX:
10408 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10409 srd = rd;
10410 continue;
10411
10412 case DIF_OP_STTS:
10413 key = &tupregs[DIF_DTR_NREGS];
10414 key[0].dttk_size = 0;
10415 key[1].dttk_size = 0;
10416 nkeys = 2;
10417 scope = DIFV_SCOPE_THREAD;
10418 break;
10419
10420 case DIF_OP_STGAA:
10421 case DIF_OP_STTAA:
10422 nkeys = ttop;
10423
10424 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10425 key[nkeys++].dttk_size = 0;
10426
10427 key[nkeys++].dttk_size = 0;
10428
10429 if (op == DIF_OP_STTAA) {
10430 scope = DIFV_SCOPE_THREAD;
10431 } else {
10432 scope = DIFV_SCOPE_GLOBAL;
10433 }
10434
10435 break;
10436
10437 case DIF_OP_PUSHTR:
10438 if (ttop == DIF_DTR_NREGS)
10439 return;
10440
10441 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10442 /*
10443 * If the register for the size of the "pushtr"
10444 * is %r0 (or the value is 0) and the type is
10445 * a string, we'll use the system-wide default
10446 * string size.
10447 */
10448 tupregs[ttop++].dttk_size =
10449 dtrace_strsize_default;
10450 } else {
10451 if (srd == 0)
10452 return;
10453
10454 if (sval > LONG_MAX)
10455 return;
10456
10457 tupregs[ttop++].dttk_size = sval;
10458 }
10459
10460 break;
10461
10462 case DIF_OP_PUSHTV:
10463 if (ttop == DIF_DTR_NREGS)
10464 return;
10465
10466 tupregs[ttop++].dttk_size = 0;
10467 break;
10468
10469 case DIF_OP_FLUSHTS:
10470 ttop = 0;
10471 break;
10472
10473 case DIF_OP_POPTS:
10474 if (ttop != 0)
10475 ttop--;
10476 break;
10477 }
10478
10479 sval = 0;
10480 srd = 0;
10481
10482 if (nkeys == 0)
10483 continue;
10484
10485 /*
10486 * We have a dynamic variable allocation; calculate its size.
10487 */
10488 for (ksize = 0, i = 0; i < nkeys; i++)
10489 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10490
10491 size = sizeof (dtrace_dynvar_t);
10492 size += sizeof (dtrace_key_t) * (nkeys - 1);
10493 size += ksize;
10494
10495 /*
10496 * Now we need to determine the size of the stored data.
10497 */
10498 id = DIF_INSTR_VAR(instr);
10499
10500 for (i = 0; i < dp->dtdo_varlen; i++) {
10501 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10502
10503 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10504 size += v->dtdv_type.dtdt_size;
10505 break;
10506 }
10507 }
10508
10509 if (i == dp->dtdo_varlen)
10510 return;
10511
10512 /*
10513 * We have the size. If this is larger than the chunk size
10514 * for our dynamic variable state, reset the chunk size.
10515 */
10516 size = P2ROUNDUP(size, sizeof (uint64_t));
10517
10518 /*
10519 * Before setting the chunk size, check that we're not going
10520 * to set it to a negative value...
10521 */
10522 if (size > LONG_MAX)
10523 return;
10524
10525 /*
10526 * ...and make certain that we didn't badly overflow.
10527 */
10528 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10529 return;
10530
10531 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10532 vstate->dtvs_dynvars.dtds_chunksize = size;
10533 }
10534 }
10535
10536 static void
dtrace_difo_init(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10537 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10538 {
10539 int oldsvars, osz, nsz, otlocals, ntlocals;
10540 uint_t i, id;
10541
10542 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10543 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10544
10545 for (i = 0; i < dp->dtdo_varlen; i++) {
10546 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10547 dtrace_statvar_t *svar;
10548 dtrace_statvar_t ***svarp = NULL;
10549 size_t dsize = 0;
10550 uint8_t scope = v->dtdv_scope;
10551 int *np = (int *)NULL;
10552
10553 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10554 continue;
10555
10556 id -= DIF_VAR_OTHER_UBASE;
10557
10558 switch (scope) {
10559 case DIFV_SCOPE_THREAD:
10560 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10561 dtrace_difv_t *tlocals;
10562
10563 if ((ntlocals = (otlocals << 1)) == 0)
10564 ntlocals = 1;
10565
10566 osz = otlocals * sizeof (dtrace_difv_t);
10567 nsz = ntlocals * sizeof (dtrace_difv_t);
10568
10569 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10570
10571 if (osz != 0) {
10572 bcopy(vstate->dtvs_tlocals,
10573 tlocals, osz);
10574 kmem_free(vstate->dtvs_tlocals, osz);
10575 }
10576
10577 vstate->dtvs_tlocals = tlocals;
10578 vstate->dtvs_ntlocals = ntlocals;
10579 }
10580
10581 vstate->dtvs_tlocals[id] = *v;
10582 continue;
10583
10584 case DIFV_SCOPE_LOCAL:
10585 np = &vstate->dtvs_nlocals;
10586 svarp = &vstate->dtvs_locals;
10587
10588 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10589 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10590 sizeof (uint64_t));
10591 else
10592 dsize = (int)NCPU * sizeof (uint64_t);
10593
10594 break;
10595
10596 case DIFV_SCOPE_GLOBAL:
10597 np = &vstate->dtvs_nglobals;
10598 svarp = &vstate->dtvs_globals;
10599
10600 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10601 dsize = v->dtdv_type.dtdt_size +
10602 sizeof (uint64_t);
10603
10604 break;
10605
10606 default:
10607 ASSERT(0);
10608 }
10609
10610 while (id >= (uint_t)(oldsvars = *np)) {
10611 dtrace_statvar_t **statics;
10612 int newsvars, oldsize, newsize;
10613
10614 if ((newsvars = (oldsvars << 1)) == 0)
10615 newsvars = 1;
10616
10617 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10618 newsize = newsvars * sizeof (dtrace_statvar_t *);
10619
10620 statics = kmem_zalloc(newsize, KM_SLEEP);
10621
10622 if (oldsize != 0) {
10623 bcopy(*svarp, statics, oldsize);
10624 kmem_free(*svarp, oldsize);
10625 }
10626
10627 *svarp = statics;
10628 *np = newsvars;
10629 }
10630
10631 if ((svar = (*svarp)[id]) == NULL) {
10632 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10633 svar->dtsv_var = *v;
10634
10635 if ((svar->dtsv_size = dsize) != 0) {
10636 svar->dtsv_data = (uint64_t)(uintptr_t)
10637 kmem_zalloc(dsize, KM_SLEEP);
10638 }
10639
10640 (*svarp)[id] = svar;
10641 }
10642
10643 svar->dtsv_refcnt++;
10644 }
10645
10646 dtrace_difo_chunksize(dp, vstate);
10647 dtrace_difo_hold(dp);
10648 }
10649
10650 static dtrace_difo_t *
dtrace_difo_duplicate(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10651 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10652 {
10653 dtrace_difo_t *new;
10654 size_t sz;
10655
10656 ASSERT(dp->dtdo_buf != NULL);
10657 ASSERT(dp->dtdo_refcnt != 0);
10658
10659 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10660
10661 ASSERT(dp->dtdo_buf != NULL);
10662 sz = dp->dtdo_len * sizeof (dif_instr_t);
10663 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10664 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10665 new->dtdo_len = dp->dtdo_len;
10666
10667 if (dp->dtdo_strtab != NULL) {
10668 ASSERT(dp->dtdo_strlen != 0);
10669 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10670 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10671 new->dtdo_strlen = dp->dtdo_strlen;
10672 }
10673
10674 if (dp->dtdo_inttab != NULL) {
10675 ASSERT(dp->dtdo_intlen != 0);
10676 sz = dp->dtdo_intlen * sizeof (uint64_t);
10677 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10678 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10679 new->dtdo_intlen = dp->dtdo_intlen;
10680 }
10681
10682 if (dp->dtdo_vartab != NULL) {
10683 ASSERT(dp->dtdo_varlen != 0);
10684 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10685 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10686 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10687 new->dtdo_varlen = dp->dtdo_varlen;
10688 }
10689
10690 dtrace_difo_init(new, vstate);
10691 return (new);
10692 }
10693
10694 static void
dtrace_difo_destroy(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10695 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10696 {
10697 uint_t i;
10698
10699 ASSERT(dp->dtdo_refcnt == 0);
10700
10701 for (i = 0; i < dp->dtdo_varlen; i++) {
10702 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10703 dtrace_statvar_t *svar;
10704 dtrace_statvar_t **svarp = NULL;
10705 uint_t id;
10706 uint8_t scope = v->dtdv_scope;
10707 int *np = NULL;
10708
10709 switch (scope) {
10710 case DIFV_SCOPE_THREAD:
10711 continue;
10712
10713 case DIFV_SCOPE_LOCAL:
10714 np = &vstate->dtvs_nlocals;
10715 svarp = vstate->dtvs_locals;
10716 break;
10717
10718 case DIFV_SCOPE_GLOBAL:
10719 np = &vstate->dtvs_nglobals;
10720 svarp = vstate->dtvs_globals;
10721 break;
10722
10723 default:
10724 ASSERT(0);
10725 }
10726
10727 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10728 continue;
10729
10730 id -= DIF_VAR_OTHER_UBASE;
10731
10732 ASSERT(id < (uint_t)*np);
10733
10734 svar = svarp[id];
10735 ASSERT(svar != NULL);
10736 ASSERT(svar->dtsv_refcnt > 0);
10737
10738 if (--svar->dtsv_refcnt > 0)
10739 continue;
10740
10741 if (svar->dtsv_size != 0) {
10742 ASSERT(svar->dtsv_data != 0);
10743 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10744 svar->dtsv_size);
10745 }
10746
10747 kmem_free(svar, sizeof (dtrace_statvar_t));
10748 svarp[id] = NULL;
10749 }
10750
10751 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10752 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10753 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10754 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10755
10756 kmem_free(dp, sizeof (dtrace_difo_t));
10757 }
10758
10759 static void
dtrace_difo_release(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10760 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10761 {
10762 uint_t i;
10763
10764 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10765 ASSERT(dp->dtdo_refcnt != 0);
10766
10767 for (i = 0; i < dp->dtdo_varlen; i++) {
10768 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10769
10770 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10771 continue;
10772
10773 ASSERT(dtrace_vtime_references > 0);
10774 if (--dtrace_vtime_references == 0)
10775 dtrace_vtime_disable();
10776 }
10777
10778 if (--dp->dtdo_refcnt == 0)
10779 dtrace_difo_destroy(dp, vstate);
10780 }
10781
10782 /*
10783 * DTrace Format Functions
10784 */
10785
10786 static dtrace_format_t*
dtrace_format_new(char * str)10787 dtrace_format_new(char *str)
10788 {
10789 dtrace_format_t *fmt = NULL;
10790 size_t bufsize = strlen(str) + 1;
10791
10792 fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10793
10794 fmt->dtf_refcount = 1;
10795 (void) strlcpy(fmt->dtf_str, str, bufsize);
10796
10797 return fmt;
10798 }
10799
10800 static uint16_t
dtrace_format_add(dtrace_state_t * state,char * str)10801 dtrace_format_add(dtrace_state_t *state, char *str)
10802 {
10803 dtrace_format_t **new;
10804 uint16_t ndx;
10805
10806 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10807 if (state->dts_formats[ndx] == NULL) {
10808 state->dts_formats[ndx] = dtrace_format_new(str);
10809 return (ndx + 1);
10810 }
10811 else if (strcmp(state->dts_formats[ndx]->dtf_str, str) == 0) {
10812 VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10813 state->dts_formats[ndx]->dtf_refcount++;
10814 return (ndx + 1);
10815 }
10816 }
10817
10818 if (state->dts_nformats == USHRT_MAX) {
10819 /*
10820 * This is only likely if a denial-of-service attack is being
10821 * attempted. As such, it's okay to fail silently here.
10822 */
10823 return (0);
10824 }
10825
10826 /*
10827 * For simplicity, we always resize the formats array to be exactly the
10828 * number of formats.
10829 */
10830 ndx = state->dts_nformats++;
10831 new = kmem_alloc((ndx + 1) * sizeof (*state->dts_formats), KM_SLEEP);
10832
10833 if (state->dts_formats != NULL) {
10834 ASSERT(ndx != 0);
10835 bcopy(state->dts_formats, new, ndx * sizeof (*state->dts_formats));
10836 kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10837 }
10838
10839 state->dts_formats = new;
10840 state->dts_formats[ndx] = dtrace_format_new(str);
10841
10842 return (ndx + 1);
10843 }
10844
10845 static void
dtrace_format_remove(dtrace_state_t * state,uint16_t format)10846 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10847 {
10848 dtrace_format_t *fmt;
10849
10850 ASSERT(state->dts_formats != NULL);
10851 ASSERT(format <= state->dts_nformats);
10852
10853 fmt = state->dts_formats[format - 1];
10854
10855 ASSERT(fmt != NULL);
10856 VERIFY(fmt->dtf_refcount > 0);
10857
10858 fmt->dtf_refcount--;
10859
10860 if (fmt->dtf_refcount == 0) {
10861 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10862 state->dts_formats[format - 1] = NULL;
10863 }
10864 }
10865
10866 static void
dtrace_format_destroy(dtrace_state_t * state)10867 dtrace_format_destroy(dtrace_state_t *state)
10868 {
10869 int i;
10870
10871 if (state->dts_nformats == 0) {
10872 ASSERT(state->dts_formats == NULL);
10873 return;
10874 }
10875
10876 ASSERT(state->dts_formats != NULL);
10877
10878 for (i = 0; i < state->dts_nformats; i++) {
10879 dtrace_format_t *fmt = state->dts_formats[i];
10880
10881 if (fmt == NULL)
10882 continue;
10883
10884 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10885 }
10886
10887 kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10888 state->dts_nformats = 0;
10889 state->dts_formats = NULL;
10890 }
10891
10892 /*
10893 * DTrace Predicate Functions
10894 */
10895 static dtrace_predicate_t *
dtrace_predicate_create(dtrace_difo_t * dp)10896 dtrace_predicate_create(dtrace_difo_t *dp)
10897 {
10898 dtrace_predicate_t *pred;
10899
10900 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10901 ASSERT(dp->dtdo_refcnt != 0);
10902
10903 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10904 pred->dtp_difo = dp;
10905 pred->dtp_refcnt = 1;
10906
10907 if (!dtrace_difo_cacheable(dp))
10908 return (pred);
10909
10910 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10911 /*
10912 * This is only theoretically possible -- we have had 2^32
10913 * cacheable predicates on this machine. We cannot allow any
10914 * more predicates to become cacheable: as unlikely as it is,
10915 * there may be a thread caching a (now stale) predicate cache
10916 * ID. (N.B.: the temptation is being successfully resisted to
10917 * have this cmn_err() "Holy shit -- we executed this code!")
10918 */
10919 return (pred);
10920 }
10921
10922 pred->dtp_cacheid = dtrace_predcache_id++;
10923
10924 return (pred);
10925 }
10926
10927 static void
dtrace_predicate_hold(dtrace_predicate_t * pred)10928 dtrace_predicate_hold(dtrace_predicate_t *pred)
10929 {
10930 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10931 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10932 ASSERT(pred->dtp_refcnt > 0);
10933
10934 pred->dtp_refcnt++;
10935 }
10936
10937 static void
dtrace_predicate_release(dtrace_predicate_t * pred,dtrace_vstate_t * vstate)10938 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10939 {
10940 dtrace_difo_t *dp = pred->dtp_difo;
10941 #pragma unused(dp) /* __APPLE__ */
10942
10943 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10944 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10945 ASSERT(pred->dtp_refcnt > 0);
10946
10947 if (--pred->dtp_refcnt == 0) {
10948 dtrace_difo_release(pred->dtp_difo, vstate);
10949 kmem_free(pred, sizeof (dtrace_predicate_t));
10950 }
10951 }
10952
10953 /*
10954 * DTrace Action Description Functions
10955 */
10956 static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind,uint32_t ntuple,uint64_t uarg,uint64_t arg)10957 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10958 uint64_t uarg, uint64_t arg)
10959 {
10960 dtrace_actdesc_t *act;
10961
10962 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10963 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10964
10965 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10966 act->dtad_kind = kind;
10967 act->dtad_ntuple = ntuple;
10968 act->dtad_uarg = uarg;
10969 act->dtad_arg = arg;
10970 act->dtad_refcnt = 1;
10971
10972 return (act);
10973 }
10974
10975 static void
dtrace_actdesc_hold(dtrace_actdesc_t * act)10976 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10977 {
10978 ASSERT(act->dtad_refcnt >= 1);
10979 act->dtad_refcnt++;
10980 }
10981
10982 static void
dtrace_actdesc_release(dtrace_actdesc_t * act,dtrace_vstate_t * vstate)10983 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10984 {
10985 dtrace_actkind_t kind = act->dtad_kind;
10986 dtrace_difo_t *dp;
10987
10988 ASSERT(act->dtad_refcnt >= 1);
10989
10990 if (--act->dtad_refcnt != 0)
10991 return;
10992
10993 if ((dp = act->dtad_difo) != NULL)
10994 dtrace_difo_release(dp, vstate);
10995
10996 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10997 char *str = (char *)(uintptr_t)act->dtad_arg;
10998
10999 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
11000 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
11001
11002 if (str != NULL)
11003 kmem_free(str, strlen(str) + 1);
11004 }
11005
11006 kmem_free(act, sizeof (dtrace_actdesc_t));
11007 }
11008
11009 /*
11010 * DTrace ECB Functions
11011 */
/*
 * Allocate a new ECB for 'probe' (which may be NULL) in 'state', assign it
 * the next enabled probe ID (EPID) from state->dts_epid, and publish it in
 * state->dts_ecbs[epid - 1].  The dts_ecbs array is doubled (starting at one
 * entry) whenever the new EPID does not fit.
 *
 * Returns the new ECB.  The caller must hold dtrace_lock.
 */
static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
{
	dtrace_ecb_t *ecb;
	dtrace_epid_t epid;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);

	ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
	ecb->dte_predicate = NULL;
	ecb->dte_probe = probe;

	/*
	 * The default size is the size of the default action: recording
	 * the header.
	 */
	ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
	ecb->dte_alignment = sizeof (dtrace_epid_t);

	epid = state->dts_epid++;

	/* EPIDs are 1-based; grow the array if slot (epid - 1) doesn't exist. */
	if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
		dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
		int necbs = state->dts_necbs << 1;

		ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);

		if (necbs == 0) {
			ASSERT(oecbs == NULL);
			necbs = 1;
		}

		ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);

		if (oecbs != NULL)
			bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));

		/*
		 * Make the copied contents visible before the new array
		 * pointer is published.
		 */
		dtrace_membar_producer();
		state->dts_ecbs = ecbs;

		if (oecbs != NULL) {
			/*
			 * If this state is active, we must dtrace_sync()
			 * before we can free the old dts_ecbs array: we're
			 * coming in hot, and there may be active ring
			 * buffer processing (which indexes into the dts_ecbs
			 * array) on another CPU.
			 */
			if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
				dtrace_sync();

			/* dts_necbs still holds the old element count here. */
			kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
		}

		/* Publish the larger count only after the larger array. */
		dtrace_membar_producer();
		state->dts_necbs = necbs;
	}

	ecb->dte_state = state;

	ASSERT(state->dts_ecbs[epid - 1] == NULL);
	/* Ensure the ECB is fully initialized before it becomes reachable. */
	dtrace_membar_producer();
	state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;

	return (ecb);
}
11078
11079 static int
dtrace_ecb_enable(dtrace_ecb_t * ecb)11080 dtrace_ecb_enable(dtrace_ecb_t *ecb)
11081 {
11082 dtrace_probe_t *probe = ecb->dte_probe;
11083
11084 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11085 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11086 ASSERT(ecb->dte_next == NULL);
11087
11088 if (probe == NULL) {
11089 /*
11090 * This is the NULL probe -- there's nothing to do.
11091 */
11092 return(0);
11093 }
11094
11095 probe->dtpr_provider->dtpv_ecb_count++;
11096 if (probe->dtpr_ecb == NULL) {
11097 dtrace_provider_t *prov = probe->dtpr_provider;
11098
11099 /*
11100 * We're the first ECB on this probe.
11101 */
11102 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11103
11104 if (ecb->dte_predicate != NULL)
11105 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11106
11107 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11108 probe->dtpr_id, probe->dtpr_arg));
11109 } else {
11110 /*
11111 * This probe is already active. Swing the last pointer to
11112 * point to the new ECB, and issue a dtrace_sync() to assure
11113 * that all CPUs have seen the change.
11114 */
11115 ASSERT(probe->dtpr_ecb_last != NULL);
11116 probe->dtpr_ecb_last->dte_next = ecb;
11117 probe->dtpr_ecb_last = ecb;
11118 probe->dtpr_predcache = 0;
11119
11120 dtrace_sync();
11121 return(0);
11122 }
11123 }
11124
11125 static int
dtrace_ecb_resize(dtrace_ecb_t * ecb)11126 dtrace_ecb_resize(dtrace_ecb_t *ecb)
11127 {
11128 dtrace_action_t *act;
11129 uint32_t curneeded = UINT32_MAX;
11130 uint32_t aggbase = UINT32_MAX;
11131
11132 /*
11133 * If we record anything, we always record the dtrace_rechdr_t. (And
11134 * we always record it first.)
11135 */
11136 ecb->dte_size = sizeof (dtrace_rechdr_t);
11137 ecb->dte_alignment = sizeof (dtrace_epid_t);
11138
11139 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11140 dtrace_recdesc_t *rec = &act->dta_rec;
11141 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
11142
11143 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
11144
11145 if (DTRACEACT_ISAGG(act->dta_kind)) {
11146 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11147
11148 ASSERT(rec->dtrd_size != 0);
11149 ASSERT(agg->dtag_first != NULL);
11150 ASSERT(act->dta_prev->dta_intuple);
11151 ASSERT(aggbase != UINT32_MAX);
11152 ASSERT(curneeded != UINT32_MAX);
11153
11154 agg->dtag_base = aggbase;
11155 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11156 rec->dtrd_offset = curneeded;
11157 if (curneeded + rec->dtrd_size < curneeded)
11158 return (EINVAL);
11159 curneeded += rec->dtrd_size;
11160 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11161
11162 aggbase = UINT32_MAX;
11163 curneeded = UINT32_MAX;
11164 } else if (act->dta_intuple) {
11165 if (curneeded == UINT32_MAX) {
11166 /*
11167 * This is the first record in a tuple. Align
11168 * curneeded to be at offset 4 in an 8-byte
11169 * aligned block.
11170 */
11171 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11172 ASSERT(aggbase == UINT32_MAX);
11173
11174 curneeded = P2PHASEUP(ecb->dte_size,
11175 sizeof (uint64_t), sizeof (dtrace_aggid_t));
11176
11177 aggbase = curneeded - sizeof (dtrace_aggid_t);
11178 ASSERT(IS_P2ALIGNED(aggbase,
11179 sizeof (uint64_t)));
11180 }
11181
11182 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11183 rec->dtrd_offset = curneeded;
11184 curneeded += rec->dtrd_size;
11185 if (curneeded + rec->dtrd_size < curneeded)
11186 return (EINVAL);
11187 } else {
11188 /* tuples must be followed by an aggregation */
11189 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11190 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11191 rec->dtrd_offset = ecb->dte_size;
11192 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11193 return (EINVAL);
11194 ecb->dte_size += rec->dtrd_size;
11195 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11196 }
11197 }
11198
11199 if ((act = ecb->dte_action) != NULL &&
11200 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11201 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11202 /*
11203 * If the size is still sizeof (dtrace_rechdr_t), then all
11204 * actions store no data; set the size to 0.
11205 */
11206 ecb->dte_size = 0;
11207 }
11208
11209 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11210 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11211 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11212 return (0);
11213 }
11214
/*
 * Build the aggregation action described by 'desc' for 'ecb'.
 *
 * The aggregating function, its initial value and the size of its data
 * record are selected from desc->dtad_kind.  The desc->dtad_ntuple actions
 * immediately preceding it on the ECB's action list (none of which may
 * itself be an aggregation) become the aggregation's key tuple and are
 * marked dta_intuple.  An aggregation ID is allocated from the state's
 * dts_aggid_arena and the aggregation is recorded in
 * state->dts_aggregations[aggid - 1], doubling that array when needed.
 *
 * Returns a pointer to the action embedded in the new aggregation, or NULL
 * if the kind is unknown, the lquantize()/llquantize() parameters are
 * invalid, dtad_ntuple is zero, or there are too few preceding actions to
 * form the tuple.  Nothing is left allocated on failure.
 */
static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
	dtrace_aggregation_t *agg;
	size_t size = sizeof (uint64_t);
	int ntuple = desc->dtad_ntuple;
	dtrace_action_t *act;
	dtrace_recdesc_t *frec;
	dtrace_aggid_t aggid;
	dtrace_state_t *state = ecb->dte_state;

	agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
	agg->dtag_ecb = ecb;

	ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));

	/*
	 * Pick the aggregating function and the size of its data; kinds that
	 * do not set 'size' use the single uint64_t default.
	 */
	switch (desc->dtad_kind) {
	case DTRACEAGG_MIN:
		agg->dtag_initial = INT64_MAX;
		agg->dtag_aggregate = dtrace_aggregate_min;
		break;

	case DTRACEAGG_MAX:
		agg->dtag_initial = INT64_MIN;
		agg->dtag_aggregate = dtrace_aggregate_max;
		break;

	case DTRACEAGG_COUNT:
		agg->dtag_aggregate = dtrace_aggregate_count;
		break;

	case DTRACEAGG_QUANTIZE:
		agg->dtag_aggregate = dtrace_aggregate_quantize;
		size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
		    sizeof (uint64_t);
		break;

	case DTRACEAGG_LQUANTIZE: {
		uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
		uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);

		agg->dtag_initial = desc->dtad_arg;
		agg->dtag_aggregate = dtrace_aggregate_lquantize;

		if (step == 0 || levels == 0)
			goto err;

		size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
		break;
	}

	case DTRACEAGG_LLQUANTIZE: {
		uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
		uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
		uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
		uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
		int64_t v;

		agg->dtag_initial = desc->dtad_arg;
		agg->dtag_aggregate = dtrace_aggregate_llquantize;

		if (factor < 2 || low >= high || nsteps < factor)
			goto err;

		/*
		 * Now check that the number of steps evenly divides a power
		 * of the factor.  (This assures both integer bucket size and
		 * linearity within each magnitude.)
		 */
		for (v = factor; v < nsteps; v *= factor)
			continue;

		if ((v % nsteps) || (nsteps % factor))
			goto err;

		/*
		 * The bucket index computed for INT64_MAX, plus two, gives
		 * the number of uint64_t slots allocated for this
		 * aggregation.
		 */
		size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
		break;
	}

	case DTRACEAGG_AVG:
		agg->dtag_aggregate = dtrace_aggregate_avg;
		size = sizeof (uint64_t) * 2;
		break;

	case DTRACEAGG_STDDEV:
		agg->dtag_aggregate = dtrace_aggregate_stddev;
		size = sizeof (uint64_t) * 4;
		break;

	case DTRACEAGG_SUM:
		agg->dtag_aggregate = dtrace_aggregate_sum;
		break;

	default:
		goto err;
	}

	agg->dtag_action.dta_rec.dtrd_size = size;

	if (ntuple == 0)
		goto err;

	/*
	 * We must make sure that we have enough actions for the n-tuple.
	 * Walk backwards from the tail of the action list, stopping early if
	 * a previous aggregation is encountered.
	 */
	for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
		if (DTRACEACT_ISAGG(act->dta_kind))
			break;

		if (--ntuple == 0) {
			/*
			 * This is the action with which our n-tuple begins.
			 */
			agg->dtag_first = act;
			goto success;
		}
	}

	/*
	 * This n-tuple is short by ntuple elements.  Return failure.
	 */
	ASSERT(ntuple != 0);
err:
	kmem_free(agg, sizeof (dtrace_aggregation_t));
	return (NULL);

success:
	/*
	 * If the last action in the tuple has a size of zero, it's actually
	 * an expression argument for the aggregating action.
	 */
	ASSERT(ecb->dte_action_last != NULL);
	act = ecb->dte_action_last;

	if (act->dta_kind == DTRACEACT_DIFEXPR) {
		ASSERT(act->dta_difo != NULL);

		if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
			agg->dtag_hasarg = 1;
	}

	/*
	 * We need to allocate an id for this aggregation.
	 */
	aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
	    VM_BESTFIT | VM_SLEEP);

	/* Aggregation IDs are 1-based; grow the lookup array if needed. */
	if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
		dtrace_aggregation_t **oaggs = state->dts_aggregations;
		dtrace_aggregation_t **aggs;
		int naggs = state->dts_naggregations << 1;
		int onaggs = state->dts_naggregations;

		ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);

		if (naggs == 0) {
			ASSERT(oaggs == NULL);
			naggs = 1;
		}

		aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);

		if (oaggs != NULL) {
			bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
			kmem_free(oaggs, onaggs * sizeof (*aggs));
		}

		state->dts_aggregations = aggs;
		state->dts_naggregations = naggs;
	}

	ASSERT(state->dts_aggregations[aggid - 1] == NULL);
	state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;

	/*
	 * The first record of the tuple must be aligned at least as strictly
	 * as an aggregation ID.
	 */
	frec = &agg->dtag_first->dta_rec;
	if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
		frec->dtrd_alignment = sizeof (dtrace_aggid_t);

	/* Mark every action from the tuple's start to the tail as in-tuple. */
	for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
		ASSERT(!act->dta_intuple);
		act->dta_intuple = 1;
	}

	return (&agg->dtag_action);
}
11400
11401 static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t * ecb,dtrace_action_t * act)11402 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11403 {
11404 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11405 dtrace_state_t *state = ecb->dte_state;
11406 dtrace_aggid_t aggid = agg->dtag_id;
11407
11408 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11409 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11410
11411 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11412 state->dts_aggregations[aggid - 1] = NULL;
11413
11414 kmem_free(agg, sizeof (dtrace_aggregation_t));
11415 }
11416
/*
 * Append the action described by 'desc' to the ECB's action list.
 *
 * Aggregating actions are built by dtrace_ecb_aggregation_create(); all
 * other kinds are validated here and have their record size derived from
 * the kind, the DIFO's return type, the action argument and the state's
 * options.  The record alignment is then derived from the record size, a
 * hold is taken on the DIFO (if any), and the action is linked onto the
 * tail of the list.
 *
 * Returns 0 on success or EINVAL if the description is invalid (unknown
 * kind, missing or wrongly-typed DIFO, failed aggregation creation, or a
 * disallowed combination with commit()/speculate()).
 *
 * The caller must hold dtrace_lock.
 */
static int
dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
{
	dtrace_action_t *action, *last;
	dtrace_difo_t *dp = desc->dtad_difo;
	uint32_t size = 0, align = sizeof (uint8_t), mask;
	uint16_t format = 0;
	dtrace_recdesc_t *rec;
	dtrace_state_t *state = ecb->dte_state;
	dtrace_optval_t *opt = state->dts_options;
	dtrace_optval_t nframes=0, strsize;
	uint64_t arg = desc->dtad_arg;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);

	if (DTRACEACT_ISAGG(desc->dtad_kind)) {
		/*
		 * If this is an aggregating action, there must be neither
		 * a speculate nor a commit on the action chain.
		 */
		dtrace_action_t *act;

		for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
			if (act->dta_kind == DTRACEACT_COMMIT)
				return (EINVAL);

			if (act->dta_kind == DTRACEACT_SPECULATE)
				return (EINVAL);
		}

		action = dtrace_ecb_aggregation_create(ecb, desc);

		if (action == NULL)
			return (EINVAL);
	} else {
		/* Remember on the state that a destructive action was seen. */
		if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
		    (desc->dtad_kind == DTRACEACT_DIFEXPR &&
		    dp != NULL && dp->dtdo_destructive)) {
			state->dts_destructive = 1;
		}

		switch (desc->dtad_kind) {
		case DTRACEACT_PRINTF:
		case DTRACEACT_PRINTA:
		case DTRACEACT_SYSTEM:
		case DTRACEACT_FREOPEN:
		case DTRACEACT_DIFEXPR:
			/*
			 * We know that our arg is a string -- turn it into a
			 * format.
			 */
			if (arg == 0) {
				ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
				       desc->dtad_kind == DTRACEACT_DIFEXPR);
				format = 0;
			} else {
				ASSERT(arg != 0);
				ASSERT(arg > KERNELBASE);
				/*
				 * NOTE(review): dtrace_format_add() returns 0
				 * when the format table is full; that result
				 * is used as-is here, leaving the action with
				 * no format.
				 */
				format = dtrace_format_add(state,
				    (char *)(uintptr_t)arg);
			}

			OS_FALLTHROUGH;
		case DTRACEACT_LIBACT:
		case DTRACEACT_TRACEMEM:
		case DTRACEACT_TRACEMEM_DYNSIZE:
		case DTRACEACT_APPLEBINARY:	/* __APPLE__ */
			if (dp == NULL)
				return (EINVAL);

			/* A non-zero return size is used directly. */
			if ((size = dp->dtdo_rtype.dtdt_size) != 0)
				break;

			/*
			 * A zero-sized string result must be by-reference; it
			 * is recorded using the strsize option.
			 */
			if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
				if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
					return (EINVAL);

				size = opt[DTRACEOPT_STRSIZE];
			}

			break;

		case DTRACEACT_STACK:
			/* A zero argument selects the stackframes option. */
			if ((nframes = arg) == 0) {
				nframes = opt[DTRACEOPT_STACKFRAMES];
				ASSERT(nframes > 0);
				arg = nframes;
			}

			size = nframes * sizeof (pc_t);
			break;

		case DTRACEACT_JSTACK:
			/* Zero fields fall back to the jstack options. */
			if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
				strsize = opt[DTRACEOPT_JSTACKSTRSIZE];

			if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
				nframes = opt[DTRACEOPT_JSTACKFRAMES];

			arg = DTRACE_USTACK_ARG(nframes, strsize);

			OS_FALLTHROUGH;
		case DTRACEACT_USTACK:
			/*
			 * For a plain ustack() (not reached via jstack()), a
			 * zero frame count selects the ustackframes option.
			 */
			if (desc->dtad_kind != DTRACEACT_JSTACK &&
			    (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
				strsize = DTRACE_USTACK_STRSIZE(arg);
				nframes = opt[DTRACEOPT_USTACKFRAMES];
				ASSERT(nframes > 0);
				arg = DTRACE_USTACK_ARG(nframes, strsize);
			}

			/*
			 * Save a slot for the pid.
			 */
			size = (nframes + 1) * sizeof (uint64_t);
			size += DTRACE_USTACK_STRSIZE(arg);
			size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));

			break;

		case DTRACEACT_SYM:
		case DTRACEACT_MOD:
			/* Requires a by-value 64-bit DIFO result. */
			if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
			    sizeof (uint64_t)) ||
			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
				return (EINVAL);
			break;

		case DTRACEACT_USYM:
		case DTRACEACT_UMOD:
		case DTRACEACT_UADDR:
			if (dp == NULL ||
			    (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
				return (EINVAL);

			/*
			 * We have a slot for the pid, plus a slot for the
			 * argument.  To keep things simple (aligned with
			 * bitness-neutral sizing), we store each as a 64-bit
			 * quantity.
			 */
			size = 2 * sizeof (uint64_t);
			break;

		case DTRACEACT_STOP:
		case DTRACEACT_BREAKPOINT:
		case DTRACEACT_PANIC:
			/* No DIFO required and no data recorded. */
			break;

		case DTRACEACT_CHILL:
		case DTRACEACT_DISCARD:
		case DTRACEACT_RAISE:
		case DTRACEACT_PIDRESUME:	/* __APPLE__ */
			/* A DIFO is required, but no data is recorded. */
			if (dp == NULL)
				return (EINVAL);
			break;

		case DTRACEACT_EXIT:
			/* Requires a by-value, int-sized DIFO result. */
			if (dp == NULL ||
			    (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
			    (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
				return (EINVAL);
			break;

		case DTRACEACT_SPECULATE:
			/*
			 * speculate() is rejected once the ECB's size exceeds
			 * the record header.
			 */
			if (ecb->dte_size > sizeof (dtrace_rechdr_t))
				return (EINVAL);

			if (dp == NULL)
				return (EINVAL);

			state->dts_speculates = 1;
			break;

		case DTRACEACT_COMMIT: {
			dtrace_action_t *act = ecb->dte_action;

			/* Only one commit() is permitted per action chain. */
			for (; act != NULL; act = act->dta_next) {
				if (act->dta_kind == DTRACEACT_COMMIT)
					return (EINVAL);
			}

			if (dp == NULL)
				return (EINVAL);
			break;
		}

		default:
			return (EINVAL);
		}

		if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
			/*
			 * If this is a data-storing action or a speculate,
			 * we must be sure that there isn't a commit on the
			 * action chain.
			 */
			dtrace_action_t *act = ecb->dte_action;

			for (; act != NULL; act = act->dta_next) {
				if (act->dta_kind == DTRACEACT_COMMIT)
					return (EINVAL);
			}
		}

		action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
		action->dta_rec.dtrd_size = size;
	}

	action->dta_refcnt = 1;
	rec = &action->dta_rec;
	size = rec->dtrd_size;

	/*
	 * Choose the alignment: the largest power of two, up to
	 * sizeof (uint64_t), that evenly divides the record size.  A size of
	 * zero keeps the default single-byte alignment.
	 */
	for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
		if (!(size & mask)) {
			align = mask + 1;
			break;
		}
	}

	action->dta_kind = desc->dtad_kind;

	if ((action->dta_difo = dp) != NULL)
		dtrace_difo_hold(dp);

	rec->dtrd_action = action->dta_kind;
	rec->dtrd_arg = arg;
	rec->dtrd_uarg = desc->dtad_uarg;
	rec->dtrd_alignment = (uint16_t)align;
	rec->dtrd_format = format;

	/* Link the new action onto the tail of the ECB's action list. */
	if ((last = ecb->dte_action_last) != NULL) {
		ASSERT(ecb->dte_action != NULL);
		action->dta_prev = last;
		last->dta_next = action;
	} else {
		ASSERT(ecb->dte_action == NULL);
		ecb->dte_action = action;
	}

	ecb->dte_action_last = action;

	return (0);
}
11663
11664 static void
dtrace_ecb_action_remove(dtrace_ecb_t * ecb)11665 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11666 {
11667 dtrace_action_t *act = ecb->dte_action, *next;
11668 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11669 dtrace_difo_t *dp;
11670 uint16_t format;
11671
11672 if (act != NULL && act->dta_refcnt > 1) {
11673 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11674 act->dta_refcnt--;
11675 } else {
11676 for (; act != NULL; act = next) {
11677 next = act->dta_next;
11678 ASSERT(next != NULL || act == ecb->dte_action_last);
11679 ASSERT(act->dta_refcnt == 1);
11680
11681 if ((format = act->dta_rec.dtrd_format) != 0)
11682 dtrace_format_remove(ecb->dte_state, format);
11683
11684 if ((dp = act->dta_difo) != NULL)
11685 dtrace_difo_release(dp, vstate);
11686
11687 if (DTRACEACT_ISAGG(act->dta_kind)) {
11688 dtrace_ecb_aggregation_destroy(ecb, act);
11689 } else {
11690 kmem_free(act, sizeof (dtrace_action_t));
11691 }
11692 }
11693 }
11694
11695 ecb->dte_action = NULL;
11696 ecb->dte_action_last = NULL;
11697 ecb->dte_size = 0;
11698 }
11699
/*
 * Disable an ECB by unlinking it from its probe's ECB chain.
 *
 * After unlinking, the provider's ECB count is decremented and
 * dtrace_sync() is issued.  If the probe is left with no ECBs, its
 * predicate cache ID is reset, the provider's dtps_disable entry point is
 * called, and dtrace_sync() is issued again.  If exactly one ECB remains,
 * the probe adopts that ECB's predicate cache ID (when it has a predicate).
 *
 * An ECB with no probe (the NULL probe) needs no work.
 * The caller must hold dtrace_lock.
 */
static void
dtrace_ecb_disable(dtrace_ecb_t *ecb)
{
	/*
	 * We disable the ECB by removing it from its probe.
	 */
	dtrace_ecb_t *pecb, *prev = NULL;
	dtrace_probe_t *probe = ecb->dte_probe;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);

	if (probe == NULL) {
		/*
		 * This is the NULL probe; there is nothing to disable.
		 */
		return;
	}

	/* Locate the ECB on the probe's chain, remembering its predecessor. */
	for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
		if (pecb == ecb)
			break;
		prev = pecb;
	}

	ASSERT(pecb != NULL);

	/* Unlink: either from the head or from after 'prev'. */
	if (prev == NULL) {
		probe->dtpr_ecb = ecb->dte_next;
	} else {
		prev->dte_next = ecb->dte_next;
	}

	if (ecb == probe->dtpr_ecb_last) {
		ASSERT(ecb->dte_next == NULL);
		probe->dtpr_ecb_last = prev;
	}

	probe->dtpr_provider->dtpv_ecb_count--;
	/*
	 * The ECB has been disconnected from the probe; now sync to assure
	 * that all CPUs have seen the change before returning.
	 */
	dtrace_sync();

	if (probe->dtpr_ecb == NULL) {
		/*
		 * That was the last ECB on the probe; clear the predicate
		 * cache ID for the probe, disable it and sync one more time
		 * to assure that we'll never hit it again.
		 */
		dtrace_provider_t *prov = probe->dtpr_provider;

		ASSERT(ecb->dte_next == NULL);
		ASSERT(probe->dtpr_ecb_last == NULL);
		probe->dtpr_predcache = DTRACE_CACHEIDNONE;
		prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
		    probe->dtpr_id, probe->dtpr_arg);
		dtrace_sync();
	} else {
		/*
		 * There is at least one ECB remaining on the probe.  If there
		 * is _exactly_ one, set the probe's predicate cache ID to be
		 * the predicate cache ID of the remaining ECB.
		 */
		ASSERT(probe->dtpr_ecb_last != NULL);
		ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);

		if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
			dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;

			ASSERT(probe->dtpr_ecb->dte_next == NULL);

			if (p != NULL)
				probe->dtpr_predcache = p->dtp_cacheid;
		}

		/*
		 * The ECB's own link is cleared only here, after the
		 * dtrace_sync() above.
		 */
		ecb->dte_next = NULL;
	}
}
11779
11780 static void
dtrace_ecb_destroy(dtrace_ecb_t * ecb)11781 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11782 {
11783 dtrace_state_t *state = ecb->dte_state;
11784 dtrace_vstate_t *vstate = &state->dts_vstate;
11785 dtrace_predicate_t *pred;
11786 dtrace_epid_t epid = ecb->dte_epid;
11787
11788 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11789 ASSERT(ecb->dte_next == NULL);
11790 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11791
11792 if ((pred = ecb->dte_predicate) != NULL)
11793 dtrace_predicate_release(pred, vstate);
11794
11795 dtrace_ecb_action_remove(ecb);
11796
11797 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11798 state->dts_ecbs[epid - 1] = NULL;
11799
11800 kmem_free(ecb, sizeof (dtrace_ecb_t));
11801 }
11802
11803 static dtrace_ecb_t *
dtrace_ecb_create(dtrace_state_t * state,dtrace_probe_t * probe,dtrace_enabling_t * enab)11804 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11805 dtrace_enabling_t *enab)
11806 {
11807 dtrace_ecb_t *ecb;
11808 dtrace_predicate_t *pred;
11809 dtrace_actdesc_t *act;
11810 dtrace_provider_t *prov;
11811 dtrace_ecbdesc_t *desc = enab->dten_current;
11812
11813 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11814 ASSERT(state != NULL);
11815
11816 ecb = dtrace_ecb_add(state, probe);
11817 ecb->dte_uarg = desc->dted_uarg;
11818
11819 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11820 dtrace_predicate_hold(pred);
11821 ecb->dte_predicate = pred;
11822 }
11823
11824 if (probe != NULL) {
11825 /*
11826 * If the provider shows more leg than the consumer is old
11827 * enough to see, we need to enable the appropriate implicit
11828 * predicate bits to prevent the ecb from activating at
11829 * revealing times.
11830 *
11831 * Providers specifying DTRACE_PRIV_USER at register time
11832 * are stating that they need the /proc-style privilege
11833 * model to be enforced, and this is what DTRACE_COND_OWNER
11834 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11835 */
11836 prov = probe->dtpr_provider;
11837 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11838 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11839 ecb->dte_cond |= DTRACE_COND_OWNER;
11840
11841 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11842 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11843 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11844
11845 /*
11846 * If the provider shows us kernel innards and the user
11847 * is lacking sufficient privilege, enable the
11848 * DTRACE_COND_USERMODE implicit predicate.
11849 */
11850 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11851 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11852 ecb->dte_cond |= DTRACE_COND_USERMODE;
11853 }
11854
11855 if (dtrace_ecb_create_cache != NULL) {
11856 /*
11857 * If we have a cached ecb, we'll use its action list instead
11858 * of creating our own (saving both time and space).
11859 */
11860 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11861 dtrace_action_t *act_if = cached->dte_action;
11862
11863 if (act_if != NULL) {
11864 ASSERT(act_if->dta_refcnt > 0);
11865 act_if->dta_refcnt++;
11866 ecb->dte_action = act_if;
11867 ecb->dte_action_last = cached->dte_action_last;
11868 ecb->dte_needed = cached->dte_needed;
11869 ecb->dte_size = cached->dte_size;
11870 ecb->dte_alignment = cached->dte_alignment;
11871 }
11872
11873 return (ecb);
11874 }
11875
11876 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11877 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11878 dtrace_ecb_destroy(ecb);
11879 return (NULL);
11880 }
11881 }
11882
11883 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11884 dtrace_ecb_destroy(ecb);
11885 return (NULL);
11886 }
11887
11888 return (dtrace_ecb_create_cache = ecb);
11889 }
11890
11891 static int
dtrace_ecb_create_enable(dtrace_probe_t * probe,void * arg1,void * arg2)11892 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11893 {
11894 dtrace_ecb_t *ecb;
11895 dtrace_enabling_t *enab = arg1;
11896 dtrace_ecbdesc_t *ep = arg2;
11897 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11898
11899 ASSERT(state != NULL);
11900
11901 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11902 /*
11903 * This probe was created in a generation for which this
11904 * enabling has previously created ECBs; we don't want to
11905 * enable it again, so just kick out.
11906 */
11907 return (DTRACE_MATCH_NEXT);
11908 }
11909
11910 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11911 return (DTRACE_MATCH_DONE);
11912
11913 if (dtrace_ecb_enable(ecb) < 0)
11914 return (DTRACE_MATCH_FAIL);
11915
11916 return (DTRACE_MATCH_NEXT);
11917 }
11918
11919 static dtrace_ecb_t *
dtrace_epid2ecb(dtrace_state_t * state,dtrace_epid_t id)11920 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11921 {
11922 dtrace_ecb_t *ecb;
11923 #pragma unused(ecb) /* __APPLE__ */
11924
11925 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11926
11927 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11928 return (NULL);
11929
11930 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11931 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11932
11933 return (state->dts_ecbs[id - 1]);
11934 }
11935
11936 static dtrace_aggregation_t *
dtrace_aggid2agg(dtrace_state_t * state,dtrace_aggid_t id)11937 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11938 {
11939 dtrace_aggregation_t *agg;
11940 #pragma unused(agg) /* __APPLE__ */
11941
11942 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11943
11944 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11945 return (NULL);
11946
11947 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11948 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11949 agg->dtag_id == id);
11950
11951 return (state->dts_aggregations[id - 1]);
11952 }
11953
11954 /*
11955 * DTrace Buffer Functions
11956 *
11957 * The following functions manipulate DTrace buffers. Most of these functions
11958 * are called in the context of establishing or processing consumer state;
11959 * exceptions are explicitly noted.
11960 */
11961
11962 /*
11963 * Note: called from cross call context. This function switches the two
11964 * buffers on a given CPU. The atomicity of this operation is assured by
11965 * disabling interrupts while the actual switch takes place; the disabling of
11966 * interrupts serializes the execution with any execution of dtrace_probe() on
11967 * the same CPU.
11968 */
11969 static void
dtrace_buffer_switch(dtrace_buffer_t * buf)11970 dtrace_buffer_switch(dtrace_buffer_t *buf)
11971 {
11972 caddr_t tomax = buf->dtb_tomax;
11973 caddr_t xamot = buf->dtb_xamot;
11974 dtrace_icookie_t cookie;
11975 hrtime_t now;
11976
11977 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11978 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11979
11980 cookie = dtrace_interrupt_disable();
11981 now = dtrace_gethrtime();
11982 buf->dtb_tomax = xamot;
11983 buf->dtb_xamot = tomax;
11984 buf->dtb_xamot_drops = buf->dtb_drops;
11985 buf->dtb_xamot_offset = buf->dtb_offset;
11986 buf->dtb_xamot_errors = buf->dtb_errors;
11987 buf->dtb_xamot_flags = buf->dtb_flags;
11988 buf->dtb_offset = 0;
11989 buf->dtb_drops = 0;
11990 buf->dtb_errors = 0;
11991 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11992 buf->dtb_interval = now - buf->dtb_switched;
11993 buf->dtb_switched = now;
11994 buf->dtb_cur_limit = buf->dtb_limit;
11995
11996 dtrace_interrupt_enable(cookie);
11997 }
11998
11999 /*
12000 * Note: called from cross call context. This function activates a buffer
12001 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
12002 * is guaranteed by the disabling of interrupts.
12003 */
12004 static void
dtrace_buffer_activate(dtrace_state_t * state)12005 dtrace_buffer_activate(dtrace_state_t *state)
12006 {
12007 dtrace_buffer_t *buf;
12008 dtrace_icookie_t cookie = dtrace_interrupt_disable();
12009
12010 buf = &state->dts_buffer[CPU->cpu_id];
12011
12012 if (buf->dtb_tomax != NULL) {
12013 /*
12014 * We might like to assert that the buffer is marked inactive,
12015 * but this isn't necessarily true: the buffer for the CPU
12016 * that processes the BEGIN probe has its buffer activated
12017 * manually. In this case, we take the (harmless) action
12018 * re-clearing the bit INACTIVE bit.
12019 */
12020 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
12021 }
12022
12023 dtrace_interrupt_enable(cookie);
12024 }
12025
12026 static int
dtrace_buffer_canalloc(size_t size)12027 dtrace_buffer_canalloc(size_t size)
12028 {
12029 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
12030 return (B_FALSE);
12031 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
12032 return (B_FALSE);
12033
12034 return (B_TRUE);
12035 }
12036
12037 static int
dtrace_buffer_alloc(dtrace_buffer_t * bufs,size_t limit,size_t size,int flags,processorid_t cpu)12038 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
12039 processorid_t cpu)
12040 {
12041 dtrace_cpu_t *cp;
12042 dtrace_buffer_t *buf;
12043 size_t size_before_alloc = dtrace_buffer_memory_inuse;
12044
12045 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12046 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12047
12048 if (size > (size_t)dtrace_nonroot_maxsize &&
12049 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12050 return (EFBIG);
12051
12052 cp = cpu_list;
12053
12054 do {
12055 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12056 continue;
12057
12058 buf = &bufs[cp->cpu_id];
12059
12060 /*
12061 * If there is already a buffer allocated for this CPU, it
12062 * is only possible that this is a DR event. In this case,
12063 * the buffer size must match our specified size.
12064 */
12065 if (buf->dtb_tomax != NULL) {
12066 ASSERT(buf->dtb_size == size);
12067 continue;
12068 }
12069
12070 ASSERT(buf->dtb_xamot == NULL);
12071
12072 /* DTrace, please do not eat all the memory. */
12073 if (dtrace_buffer_canalloc(size) == B_FALSE)
12074 goto err;
12075 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12076 goto err;
12077 dtrace_buffer_memory_inuse += size;
12078
12079 /* Unsure that limit is always lower than size */
12080 limit = limit == size ? limit - 1 : limit;
12081 buf->dtb_cur_limit = limit;
12082 buf->dtb_limit = limit;
12083 buf->dtb_size = size;
12084 buf->dtb_flags = flags;
12085 buf->dtb_offset = 0;
12086 buf->dtb_drops = 0;
12087
12088 if (flags & DTRACEBUF_NOSWITCH)
12089 continue;
12090
12091 /* DTrace, please do not eat all the memory. */
12092 if (dtrace_buffer_canalloc(size) == B_FALSE)
12093 goto err;
12094 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12095 goto err;
12096 dtrace_buffer_memory_inuse += size;
12097 } while ((cp = cp->cpu_next) != cpu_list);
12098
12099 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
12100
12101 return (0);
12102
12103 err:
12104 cp = cpu_list;
12105
12106 do {
12107 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12108 continue;
12109
12110 buf = &bufs[cp->cpu_id];
12111
12112 if (buf->dtb_xamot != NULL) {
12113 ASSERT(buf->dtb_tomax != NULL);
12114 ASSERT(buf->dtb_size == size);
12115 kmem_free(buf->dtb_xamot, size);
12116 }
12117
12118 if (buf->dtb_tomax != NULL) {
12119 ASSERT(buf->dtb_size == size);
12120 kmem_free(buf->dtb_tomax, size);
12121 }
12122
12123 buf->dtb_tomax = NULL;
12124 buf->dtb_xamot = NULL;
12125 buf->dtb_size = 0;
12126 } while ((cp = cp->cpu_next) != cpu_list);
12127
12128 /* Restore the size saved before allocating memory */
12129 dtrace_buffer_memory_inuse = size_before_alloc;
12130
12131 return (ENOMEM);
12132 }
12133
12134 /*
12135 * Note: called from probe context. This function just increments the drop
12136 * count on a buffer. It has been made a function to allow for the
12137 * possibility of understanding the source of mysterious drop counts. (A
12138 * problem for which one may be particularly disappointed that DTrace cannot
12139 * be used to understand DTrace.)
12140 */
12141 static void
dtrace_buffer_drop(dtrace_buffer_t * buf)12142 dtrace_buffer_drop(dtrace_buffer_t *buf)
12143 {
12144 buf->dtb_drops++;
12145 }
12146
12147 /*
12148 * Note: called from probe context. This function is called to reserve space
12149 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12150 * mstate. Returns the new offset in the buffer, or a negative value if an
12151 * error has occurred.
12152 */
12153 static intptr_t
dtrace_buffer_reserve(dtrace_buffer_t * buf,size_t needed,size_t align,dtrace_state_t * state,dtrace_mstate_t * mstate)12154 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12155 dtrace_state_t *state, dtrace_mstate_t *mstate)
12156 {
12157 intptr_t offs = buf->dtb_offset, soffs;
12158 intptr_t woffs;
12159 caddr_t tomax;
12160 size_t total_off;
12161
12162 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12163 return (-1);
12164
12165 if ((tomax = buf->dtb_tomax) == NULL) {
12166 dtrace_buffer_drop(buf);
12167 return (-1);
12168 }
12169
12170 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12171 while (offs & (align - 1)) {
12172 /*
12173 * Assert that our alignment is off by a number which
12174 * is itself sizeof (uint32_t) aligned.
12175 */
12176 ASSERT(!((align - (offs & (align - 1))) &
12177 (sizeof (uint32_t) - 1)));
12178 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12179 offs += sizeof (uint32_t);
12180 }
12181
12182 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12183 if (buf->dtb_cur_limit == buf->dtb_limit) {
12184 buf->dtb_cur_limit = buf->dtb_size;
12185
12186 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12187 /**
12188 * Set an AST on the current processor
12189 * so that we can wake up the process
12190 * outside of probe context, when we know
12191 * it is safe to do so
12192 */
12193 minor_t minor = getminor(state->dts_dev);
12194 ASSERT(minor < 32);
12195
12196 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12197 ast_dtrace_on();
12198 }
12199 if ((uint64_t)soffs > buf->dtb_size) {
12200 dtrace_buffer_drop(buf);
12201 return (-1);
12202 }
12203 }
12204
12205 if (mstate == NULL)
12206 return (offs);
12207
12208 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12209 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12210 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12211
12212 return (offs);
12213 }
12214
12215 if (buf->dtb_flags & DTRACEBUF_FILL) {
12216 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12217 (buf->dtb_flags & DTRACEBUF_FULL))
12218 return (-1);
12219 goto out;
12220 }
12221
12222 total_off = needed + (offs & (align - 1));
12223
12224 /*
12225 * For a ring buffer, life is quite a bit more complicated. Before
12226 * we can store any padding, we need to adjust our wrapping offset.
12227 * (If we've never before wrapped or we're not about to, no adjustment
12228 * is required.)
12229 */
12230 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12231 offs + total_off > buf->dtb_size) {
12232 woffs = buf->dtb_xamot_offset;
12233
12234 if (offs + total_off > buf->dtb_size) {
12235 /*
12236 * We can't fit in the end of the buffer. First, a
12237 * sanity check that we can fit in the buffer at all.
12238 */
12239 if (total_off > buf->dtb_size) {
12240 dtrace_buffer_drop(buf);
12241 return (-1);
12242 }
12243
12244 /*
12245 * We're going to be storing at the top of the buffer,
12246 * so now we need to deal with the wrapped offset. We
12247 * only reset our wrapped offset to 0 if it is
12248 * currently greater than the current offset. If it
12249 * is less than the current offset, it is because a
12250 * previous allocation induced a wrap -- but the
12251 * allocation didn't subsequently take the space due
12252 * to an error or false predicate evaluation. In this
12253 * case, we'll just leave the wrapped offset alone: if
12254 * the wrapped offset hasn't been advanced far enough
12255 * for this allocation, it will be adjusted in the
12256 * lower loop.
12257 */
12258 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12259 if (woffs >= offs)
12260 woffs = 0;
12261 } else {
12262 woffs = 0;
12263 }
12264
12265 /*
12266 * Now we know that we're going to be storing to the
12267 * top of the buffer and that there is room for us
12268 * there. We need to clear the buffer from the current
12269 * offset to the end (there may be old gunk there).
12270 */
12271 while ((uint64_t)offs < buf->dtb_size)
12272 tomax[offs++] = 0;
12273
12274 /*
12275 * We need to set our offset to zero. And because we
12276 * are wrapping, we need to set the bit indicating as
12277 * much. We can also adjust our needed space back
12278 * down to the space required by the ECB -- we know
12279 * that the top of the buffer is aligned.
12280 */
12281 offs = 0;
12282 total_off = needed;
12283 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12284 } else {
12285 /*
12286 * There is room for us in the buffer, so we simply
12287 * need to check the wrapped offset.
12288 */
12289 if (woffs < offs) {
12290 /*
12291 * The wrapped offset is less than the offset.
12292 * This can happen if we allocated buffer space
12293 * that induced a wrap, but then we didn't
12294 * subsequently take the space due to an error
12295 * or false predicate evaluation. This is
12296 * okay; we know that _this_ allocation isn't
12297 * going to induce a wrap. We still can't
12298 * reset the wrapped offset to be zero,
12299 * however: the space may have been trashed in
12300 * the previous failed probe attempt. But at
12301 * least the wrapped offset doesn't need to
12302 * be adjusted at all...
12303 */
12304 goto out;
12305 }
12306 }
12307
12308 while (offs + total_off > (size_t)woffs) {
12309 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12310 size_t size;
12311
12312 if (epid == DTRACE_EPIDNONE) {
12313 size = sizeof (uint32_t);
12314 } else {
12315 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12316 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12317
12318 size = state->dts_ecbs[epid - 1]->dte_size;
12319 }
12320
12321 ASSERT(woffs + size <= buf->dtb_size);
12322 ASSERT(size != 0);
12323
12324 if (woffs + size == buf->dtb_size) {
12325 /*
12326 * We've reached the end of the buffer; we want
12327 * to set the wrapped offset to 0 and break
12328 * out. However, if the offs is 0, then we're
12329 * in a strange edge-condition: the amount of
12330 * space that we want to reserve plus the size
12331 * of the record that we're overwriting is
12332 * greater than the size of the buffer. This
12333 * is problematic because if we reserve the
12334 * space but subsequently don't consume it (due
12335 * to a failed predicate or error) the wrapped
12336 * offset will be 0 -- yet the EPID at offset 0
12337 * will not be committed. This situation is
12338 * relatively easy to deal with: if we're in
12339 * this case, the buffer is indistinguishable
12340 * from one that hasn't wrapped; we need only
12341 * finish the job by clearing the wrapped bit,
12342 * explicitly setting the offset to be 0, and
12343 * zero'ing out the old data in the buffer.
12344 */
12345 if (offs == 0) {
12346 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12347 buf->dtb_offset = 0;
12348 woffs = total_off;
12349
12350 while ((uint64_t)woffs < buf->dtb_size)
12351 tomax[woffs++] = 0;
12352 }
12353
12354 woffs = 0;
12355 break;
12356 }
12357
12358 woffs += size;
12359 }
12360
12361 /*
12362 * We have a wrapped offset. It may be that the wrapped offset
12363 * has become zero -- that's okay.
12364 */
12365 buf->dtb_xamot_offset = woffs;
12366 }
12367
12368 out:
12369 /*
12370 * Now we can plow the buffer with any necessary padding.
12371 */
12372 while (offs & (align - 1)) {
12373 /*
12374 * Assert that our alignment is off by a number which
12375 * is itself sizeof (uint32_t) aligned.
12376 */
12377 ASSERT(!((align - (offs & (align - 1))) &
12378 (sizeof (uint32_t) - 1)));
12379 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12380 offs += sizeof (uint32_t);
12381 }
12382
12383 if (buf->dtb_flags & DTRACEBUF_FILL) {
12384 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12385 buf->dtb_flags |= DTRACEBUF_FULL;
12386 return (-1);
12387 }
12388 }
12389
12390 if (mstate == NULL)
12391 return (offs);
12392
12393 /*
12394 * For ring buffers and fill buffers, the scratch space is always
12395 * the inactive buffer.
12396 */
12397 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12398 mstate->dtms_scratch_size = buf->dtb_size;
12399 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12400
12401 return (offs);
12402 }
12403
12404 static void
dtrace_buffer_polish(dtrace_buffer_t * buf)12405 dtrace_buffer_polish(dtrace_buffer_t *buf)
12406 {
12407 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12408 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12409
12410 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12411 return;
12412
12413 /*
12414 * We need to polish the ring buffer. There are three cases:
12415 *
12416 * - The first (and presumably most common) is that there is no gap
12417 * between the buffer offset and the wrapped offset. In this case,
12418 * there is nothing in the buffer that isn't valid data; we can
12419 * mark the buffer as polished and return.
12420 *
12421 * - The second (less common than the first but still more common
12422 * than the third) is that there is a gap between the buffer offset
12423 * and the wrapped offset, and the wrapped offset is larger than the
12424 * buffer offset. This can happen because of an alignment issue, or
12425 * can happen because of a call to dtrace_buffer_reserve() that
12426 * didn't subsequently consume the buffer space. In this case,
12427 * we need to zero the data from the buffer offset to the wrapped
12428 * offset.
12429 *
12430 * - The third (and least common) is that there is a gap between the
12431 * buffer offset and the wrapped offset, but the wrapped offset is
12432 * _less_ than the buffer offset. This can only happen because a
12433 * call to dtrace_buffer_reserve() induced a wrap, but the space
12434 * was not subsequently consumed. In this case, we need to zero the
12435 * space from the offset to the end of the buffer _and_ from the
12436 * top of the buffer to the wrapped offset.
12437 */
12438 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12439 bzero(buf->dtb_tomax + buf->dtb_offset,
12440 buf->dtb_xamot_offset - buf->dtb_offset);
12441 }
12442
12443 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12444 bzero(buf->dtb_tomax + buf->dtb_offset,
12445 buf->dtb_size - buf->dtb_offset);
12446 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12447 }
12448 }
12449
12450 static void
dtrace_buffer_free(dtrace_buffer_t * bufs)12451 dtrace_buffer_free(dtrace_buffer_t *bufs)
12452 {
12453 int i;
12454
12455 for (i = 0; i < (int)NCPU; i++) {
12456 dtrace_buffer_t *buf = &bufs[i];
12457
12458 if (buf->dtb_tomax == NULL) {
12459 ASSERT(buf->dtb_xamot == NULL);
12460 ASSERT(buf->dtb_size == 0);
12461 continue;
12462 }
12463
12464 if (buf->dtb_xamot != NULL) {
12465 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12466 kmem_free(buf->dtb_xamot, buf->dtb_size);
12467
12468 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12469 dtrace_buffer_memory_inuse -= buf->dtb_size;
12470 }
12471
12472 kmem_free(buf->dtb_tomax, buf->dtb_size);
12473 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12474 dtrace_buffer_memory_inuse -= buf->dtb_size;
12475
12476 buf->dtb_size = 0;
12477 buf->dtb_tomax = NULL;
12478 buf->dtb_xamot = NULL;
12479 }
12480 }
12481
12482 /*
12483 * DTrace Enabling Functions
12484 */
12485 static dtrace_enabling_t *
dtrace_enabling_create(dtrace_vstate_t * vstate)12486 dtrace_enabling_create(dtrace_vstate_t *vstate)
12487 {
12488 dtrace_enabling_t *enab;
12489
12490 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12491 enab->dten_vstate = vstate;
12492
12493 return (enab);
12494 }
12495
12496 static void
dtrace_enabling_add(dtrace_enabling_t * enab,dtrace_ecbdesc_t * ecb)12497 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12498 {
12499 dtrace_ecbdesc_t **ndesc;
12500 size_t osize, nsize;
12501
12502 /*
12503 * We can't add to enablings after we've enabled them, or after we've
12504 * retained them.
12505 */
12506 ASSERT(enab->dten_probegen == 0);
12507 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12508
12509 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12510 if (ecb == NULL) return;
12511
12512 if (enab->dten_ndesc < enab->dten_maxdesc) {
12513 enab->dten_desc[enab->dten_ndesc++] = ecb;
12514 return;
12515 }
12516
12517 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12518
12519 if (enab->dten_maxdesc == 0) {
12520 enab->dten_maxdesc = 1;
12521 } else {
12522 enab->dten_maxdesc <<= 1;
12523 }
12524
12525 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12526
12527 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12528 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12529 bcopy(enab->dten_desc, ndesc, osize);
12530 kmem_free(enab->dten_desc, osize);
12531
12532 enab->dten_desc = ndesc;
12533 enab->dten_desc[enab->dten_ndesc++] = ecb;
12534 }
12535
12536 static void
dtrace_enabling_addlike(dtrace_enabling_t * enab,dtrace_ecbdesc_t * ecb,dtrace_probedesc_t * pd)12537 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12538 dtrace_probedesc_t *pd)
12539 {
12540 dtrace_ecbdesc_t *new;
12541 dtrace_predicate_t *pred;
12542 dtrace_actdesc_t *act;
12543
12544 /*
12545 * We're going to create a new ECB description that matches the
12546 * specified ECB in every way, but has the specified probe description.
12547 */
12548 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12549
12550 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12551 dtrace_predicate_hold(pred);
12552
12553 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12554 dtrace_actdesc_hold(act);
12555
12556 new->dted_action = ecb->dted_action;
12557 new->dted_pred = ecb->dted_pred;
12558 new->dted_probe = *pd;
12559 new->dted_uarg = ecb->dted_uarg;
12560
12561 dtrace_enabling_add(enab, new);
12562 }
12563
12564 static void
dtrace_enabling_dump(dtrace_enabling_t * enab)12565 dtrace_enabling_dump(dtrace_enabling_t *enab)
12566 {
12567 int i;
12568
12569 for (i = 0; i < enab->dten_ndesc; i++) {
12570 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12571
12572 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12573 desc->dtpd_provider, desc->dtpd_mod,
12574 desc->dtpd_func, desc->dtpd_name);
12575 }
12576 }
12577
12578 static void
dtrace_enabling_destroy(dtrace_enabling_t * enab)12579 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12580 {
12581 int i;
12582 dtrace_ecbdesc_t *ep;
12583 dtrace_vstate_t *vstate = enab->dten_vstate;
12584
12585 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12586
12587 for (i = 0; i < enab->dten_ndesc; i++) {
12588 dtrace_actdesc_t *act, *next;
12589 dtrace_predicate_t *pred;
12590
12591 ep = enab->dten_desc[i];
12592
12593 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12594 dtrace_predicate_release(pred, vstate);
12595
12596 for (act = ep->dted_action; act != NULL; act = next) {
12597 next = act->dtad_next;
12598 dtrace_actdesc_release(act, vstate);
12599 }
12600
12601 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12602 }
12603
12604 kmem_free(enab->dten_desc,
12605 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12606
12607 /*
12608 * If this was a retained enabling, decrement the dts_nretained count
12609 * and take it off of the dtrace_retained list.
12610 */
12611 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12612 dtrace_retained == enab) {
12613 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12614 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12615 enab->dten_vstate->dtvs_state->dts_nretained--;
12616 dtrace_retained_gen++;
12617 }
12618
12619 if (enab->dten_prev == NULL) {
12620 if (dtrace_retained == enab) {
12621 dtrace_retained = enab->dten_next;
12622
12623 if (dtrace_retained != NULL)
12624 dtrace_retained->dten_prev = NULL;
12625 }
12626 } else {
12627 ASSERT(enab != dtrace_retained);
12628 ASSERT(dtrace_retained != NULL);
12629 enab->dten_prev->dten_next = enab->dten_next;
12630 }
12631
12632 if (enab->dten_next != NULL) {
12633 ASSERT(dtrace_retained != NULL);
12634 enab->dten_next->dten_prev = enab->dten_prev;
12635 }
12636
12637 kmem_free(enab, sizeof (dtrace_enabling_t));
12638 }
12639
12640 static int
dtrace_enabling_retain(dtrace_enabling_t * enab)12641 dtrace_enabling_retain(dtrace_enabling_t *enab)
12642 {
12643 dtrace_state_t *state;
12644
12645 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12646 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12647 ASSERT(enab->dten_vstate != NULL);
12648
12649 state = enab->dten_vstate->dtvs_state;
12650 ASSERT(state != NULL);
12651
12652 /*
12653 * We only allow each state to retain dtrace_retain_max enablings.
12654 */
12655 if (state->dts_nretained >= dtrace_retain_max)
12656 return (ENOSPC);
12657
12658 state->dts_nretained++;
12659 dtrace_retained_gen++;
12660
12661 if (dtrace_retained == NULL) {
12662 dtrace_retained = enab;
12663 return (0);
12664 }
12665
12666 enab->dten_next = dtrace_retained;
12667 dtrace_retained->dten_prev = enab;
12668 dtrace_retained = enab;
12669
12670 return (0);
12671 }
12672
12673 static int
dtrace_enabling_replicate(dtrace_state_t * state,dtrace_probedesc_t * match,dtrace_probedesc_t * create)12674 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12675 dtrace_probedesc_t *create)
12676 {
12677 dtrace_enabling_t *new, *enab;
12678 int found = 0, err = ENOENT;
12679
12680 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12681 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12682 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12683 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12684 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12685
12686 new = dtrace_enabling_create(&state->dts_vstate);
12687
12688 /*
12689 * Iterate over all retained enablings, looking for enablings that
12690 * match the specified state.
12691 */
12692 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12693 int i;
12694
12695 /*
12696 * dtvs_state can only be NULL for helper enablings -- and
12697 * helper enablings can't be retained.
12698 */
12699 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12700
12701 if (enab->dten_vstate->dtvs_state != state)
12702 continue;
12703
12704 /*
12705 * Now iterate over each probe description; we're looking for
12706 * an exact match to the specified probe description.
12707 */
12708 for (i = 0; i < enab->dten_ndesc; i++) {
12709 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12710 dtrace_probedesc_t *pd = &ep->dted_probe;
12711
12712 /* APPLE NOTE: Darwin employs size bounded string operation. */
12713 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12714 continue;
12715
12716 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12717 continue;
12718
12719 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12720 continue;
12721
12722 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12723 continue;
12724
12725 /*
12726 * We have a winning probe! Add it to our growing
12727 * enabling.
12728 */
12729 found = 1;
12730 dtrace_enabling_addlike(new, ep, create);
12731 }
12732 }
12733
12734 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12735 dtrace_enabling_destroy(new);
12736 return (err);
12737 }
12738
12739 return (0);
12740 }
12741
12742 static void
dtrace_enabling_retract(dtrace_state_t * state)12743 dtrace_enabling_retract(dtrace_state_t *state)
12744 {
12745 dtrace_enabling_t *enab, *next;
12746
12747 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12748
12749 /*
12750 * Iterate over all retained enablings, destroy the enablings retained
12751 * for the specified state.
12752 */
12753 for (enab = dtrace_retained; enab != NULL; enab = next) {
12754 next = enab->dten_next;
12755
12756 /*
12757 * dtvs_state can only be NULL for helper enablings -- and
12758 * helper enablings can't be retained.
12759 */
12760 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12761
12762 if (enab->dten_vstate->dtvs_state == state) {
12763 ASSERT(state->dts_nretained > 0);
12764 dtrace_enabling_destroy(enab);
12765 }
12766 }
12767
12768 ASSERT(state->dts_nretained == 0);
12769 }
12770
12771 static int
dtrace_enabling_match(dtrace_enabling_t * enab,int * nmatched,dtrace_match_cond_t * cond)12772 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12773 {
12774 int i = 0;
12775 int total_matched = 0, matched = 0;
12776
12777 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12778 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12779
12780 for (i = 0; i < enab->dten_ndesc; i++) {
12781 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12782
12783 enab->dten_current = ep;
12784 enab->dten_error = 0;
12785
12786 /**
12787 * Before doing a dtrace_probe_enable, which is really
12788 * expensive, check that this enabling matches the matching precondition
12789 * if we have one
12790 */
12791 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12792 continue;
12793 }
12794 /*
12795 * If a provider failed to enable a probe then get out and
12796 * let the consumer know we failed.
12797 */
12798 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12799 return (EBUSY);
12800
12801 total_matched += matched;
12802
12803 if (enab->dten_error != 0) {
12804 /*
12805 * If we get an error half-way through enabling the
12806 * probes, we kick out -- perhaps with some number of
12807 * them enabled. Leaving enabled probes enabled may
12808 * be slightly confusing for user-level, but we expect
12809 * that no one will attempt to actually drive on in
12810 * the face of such errors. If this is an anonymous
12811 * enabling (indicated with a NULL nmatched pointer),
12812 * we cmn_err() a message. We aren't expecting to
12813 * get such an error -- such as it can exist at all,
12814 * it would be a result of corrupted DOF in the driver
12815 * properties.
12816 */
12817 if (nmatched == NULL) {
12818 cmn_err(CE_WARN, "dtrace_enabling_match() "
12819 "error on %p: %d", (void *)ep,
12820 enab->dten_error);
12821 }
12822
12823 return (enab->dten_error);
12824 }
12825
12826 ep->dted_probegen = dtrace_probegen;
12827 }
12828
12829 if (nmatched != NULL)
12830 *nmatched = total_matched;
12831
12832 return (0);
12833 }
12834
12835 static void
dtrace_enabling_matchall_with_cond(dtrace_match_cond_t * cond)12836 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12837 {
12838 dtrace_enabling_t *enab;
12839
12840 lck_mtx_lock(&cpu_lock);
12841 lck_mtx_lock(&dtrace_lock);
12842
12843 /*
12844 * Iterate over all retained enablings to see if any probes match
12845 * against them. We only perform this operation on enablings for which
12846 * we have sufficient permissions by virtue of being in the global zone
12847 * or in the same zone as the DTrace client. Because we can be called
12848 * after dtrace_detach() has been called, we cannot assert that there
12849 * are retained enablings. We can safely load from dtrace_retained,
12850 * however: the taskq_destroy() at the end of dtrace_detach() will
12851 * block pending our completion.
12852 */
12853
12854 /*
12855 * Darwin doesn't do zones.
12856 * Behave as if always in "global" zone."
12857 */
12858 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12859 (void) dtrace_enabling_match(enab, NULL, cond);
12860 }
12861
12862 lck_mtx_unlock(&dtrace_lock);
12863 lck_mtx_unlock(&cpu_lock);
12864
12865 }
12866
/*
 * Match every retained enabling with no match condition: simply calls
 * dtrace_enabling_matchall_with_cond() with a NULL condition.
 */
static void
dtrace_enabling_matchall(void)
{
	dtrace_enabling_matchall_with_cond(NULL);
}
12872
12873
12874
12875 /*
12876 * If an enabling is to be enabled without having matched probes (that is, if
12877 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12878 * enabling must be _primed_ by creating an ECB for every ECB description.
12879 * This must be done to assure that we know the number of speculations, the
12880 * number of aggregations, the minimum buffer size needed, etc. before we
12881 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12882 * enabling any probes, we create ECBs for every ECB decription, but with a
12883 * NULL probe -- which is exactly what this function does.
12884 */
12885 static void
dtrace_enabling_prime(dtrace_state_t * state)12886 dtrace_enabling_prime(dtrace_state_t *state)
12887 {
12888 dtrace_enabling_t *enab;
12889 int i;
12890
12891 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12892 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12893
12894 if (enab->dten_vstate->dtvs_state != state)
12895 continue;
12896
12897 /*
12898 * We don't want to prime an enabling more than once, lest
12899 * we allow a malicious user to induce resource exhaustion.
12900 * (The ECBs that result from priming an enabling aren't
12901 * leaked -- but they also aren't deallocated until the
12902 * consumer state is destroyed.)
12903 */
12904 if (enab->dten_primed)
12905 continue;
12906
12907 for (i = 0; i < enab->dten_ndesc; i++) {
12908 enab->dten_current = enab->dten_desc[i];
12909 (void) dtrace_probe_enable(NULL, enab, NULL);
12910 }
12911
12912 enab->dten_primed = 1;
12913 }
12914 }
12915
/*
 * Called to indicate that probes should be provided due to retained
 * enablings.  This is implemented in terms of dtrace_probe_provide(), but it
 * must take an initial lap through the enabling calling the dtps_provide()
 * entry point explicitly to allow for autocreated probes.
 *
 * If 'prv' is NULL, the walk starts at dtrace_provider and follows dtpv_next
 * over every provider; otherwise only 'prv' is visited.  Both dtrace_lock and
 * dtrace_provider_lock must be held on entry (asserted below).  dtrace_lock
 * is dropped around every call into a provider and around the final
 * dtrace_probe_provide() call, and is held again when this function returns.
 */
static void
dtrace_enabling_provide(dtrace_provider_t *prv)
{
	int i, all = 0;
	dtrace_probedesc_t desc;
	dtrace_genid_t gen;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
	LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);

	/* A NULL provider means "all providers": start at the list head. */
	if (prv == NULL) {
		all = 1;
		prv = dtrace_provider;
	}

	do {
		dtrace_enabling_t *enab;
		void *parg = prv->dtpv_arg;

retry:
		/*
		 * Snapshot the retained-enabling generation so that we can
		 * detect changes made while dtrace_lock is dropped below.
		 */
		gen = dtrace_retained_gen;
		for (enab = dtrace_retained; enab != NULL;
		    enab = enab->dten_next) {
			for (i = 0; i < enab->dten_ndesc; i++) {
				/*
				 * Copy the probe description by value before
				 * dropping the lock; the provider is handed
				 * the local copy, not the enabling's own.
				 */
				desc = enab->dten_desc[i]->dted_probe;
				lck_mtx_unlock(&dtrace_lock);
				prv->dtpv_pops.dtps_provide(parg, &desc);
				lck_mtx_lock(&dtrace_lock);
				/*
				 * Process the retained enablings again if
				 * they have changed while we weren't holding
				 * dtrace_lock.
				 */
				if (gen != dtrace_retained_gen)
					goto retry;
			}
		}
	} while (all && (prv = prv->dtpv_next) != NULL);

	/*
	 * Finish with the generic provide pass.  In the "all" case NULL is
	 * passed as the provider; otherwise the single provider given.
	 */
	lck_mtx_unlock(&dtrace_lock);
	dtrace_probe_provide(NULL, all ? NULL : prv);
	lck_mtx_lock(&dtrace_lock);
}
12965
12966 /*
12967 * DTrace DOF Functions
12968 */
12969 /*ARGSUSED*/
12970 static void
dtrace_dof_error(dof_hdr_t * dof,const char * str)12971 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12972 {
12973 #pragma unused(dof) /* __APPLE__ */
12974 if (dtrace_err_verbose)
12975 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12976
12977 #ifdef DTRACE_ERRDEBUG
12978 dtrace_errdebug(str);
12979 #endif
12980 }
12981
12982 /*
12983 * Create DOF out of a currently enabled state. Right now, we only create
12984 * DOF containing the run-time options -- but this could be expanded to create
12985 * complete DOF representing the enabled state.
12986 */
12987 static dof_hdr_t *
dtrace_dof_create(dtrace_state_t * state)12988 dtrace_dof_create(dtrace_state_t *state)
12989 {
12990 dof_hdr_t *dof;
12991 dof_sec_t *sec;
12992 dof_optdesc_t *opt;
12993 int i, len = sizeof (dof_hdr_t) +
12994 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12995 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12996
12997 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12998
12999 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
13000 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
13001 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
13002 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
13003 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
13004
13005 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
13006 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
13007 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
13008 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
13009 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
13010 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
13011
13012 dof->dofh_flags = 0;
13013 dof->dofh_hdrsize = sizeof (dof_hdr_t);
13014 dof->dofh_secsize = sizeof (dof_sec_t);
13015 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
13016 dof->dofh_secoff = sizeof (dof_hdr_t);
13017 dof->dofh_loadsz = len;
13018 dof->dofh_filesz = len;
13019 dof->dofh_pad = 0;
13020
13021 /*
13022 * Fill in the option section header...
13023 */
13024 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
13025 sec->dofs_type = DOF_SECT_OPTDESC;
13026 sec->dofs_align = sizeof (uint64_t);
13027 sec->dofs_flags = DOF_SECF_LOAD;
13028 sec->dofs_entsize = sizeof (dof_optdesc_t);
13029
13030 opt = (dof_optdesc_t *)((uintptr_t)sec +
13031 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
13032
13033 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
13034 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13035
13036 for (i = 0; i < DTRACEOPT_MAX; i++) {
13037 opt[i].dofo_option = i;
13038 opt[i].dofo_strtab = DOF_SECIDX_NONE;
13039 opt[i].dofo_value = state->dts_options[i];
13040 }
13041
13042 return (dof);
13043 }
13044
13045 static dof_hdr_t *
dtrace_dof_copyin(user_addr_t uarg,int * errp)13046 dtrace_dof_copyin(user_addr_t uarg, int *errp)
13047 {
13048 dof_hdr_t hdr, *dof;
13049
13050 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13051
13052 /*
13053 * First, we're going to copyin() the sizeof (dof_hdr_t).
13054 */
13055 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
13056 dtrace_dof_error(NULL, "failed to copyin DOF header");
13057 *errp = EFAULT;
13058 return (NULL);
13059 }
13060
13061 /*
13062 * Now we'll allocate the entire DOF and copy it in -- provided
13063 * that the length isn't outrageous.
13064 */
13065 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13066 dtrace_dof_error(&hdr, "load size exceeds maximum");
13067 *errp = E2BIG;
13068 return (NULL);
13069 }
13070
13071 if (hdr.dofh_loadsz < sizeof (hdr)) {
13072 dtrace_dof_error(&hdr, "invalid load size");
13073 *errp = EINVAL;
13074 return (NULL);
13075 }
13076
13077 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13078
13079 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
13080 dof->dofh_loadsz != hdr.dofh_loadsz) {
13081 kmem_free_aligned(dof, hdr.dofh_loadsz);
13082 *errp = EFAULT;
13083 return (NULL);
13084 }
13085
13086 return (dof);
13087 }
13088
13089 static dof_hdr_t *
dtrace_dof_copyin_from_proc(proc_t * p,user_addr_t uarg,int * errp)13090 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
13091 {
13092 dof_hdr_t hdr, *dof;
13093
13094 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13095
13096 /*
13097 * First, we're going to copyin() the sizeof (dof_hdr_t).
13098 */
13099 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
13100 dtrace_dof_error(NULL, "failed to copyin DOF header");
13101 *errp = EFAULT;
13102 return (NULL);
13103 }
13104
13105 /*
13106 * Now we'll allocate the entire DOF and copy it in -- provided
13107 * that the length isn't outrageous.
13108 */
13109 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13110 dtrace_dof_error(&hdr, "load size exceeds maximum");
13111 *errp = E2BIG;
13112 return (NULL);
13113 }
13114
13115 if (hdr.dofh_loadsz < sizeof (hdr)) {
13116 dtrace_dof_error(&hdr, "invalid load size");
13117 *errp = EINVAL;
13118 return (NULL);
13119 }
13120
13121 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13122
13123 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS ||
13124 dof->dofh_loadsz != hdr.dofh_loadsz) {
13125 kmem_free_aligned(dof, hdr.dofh_loadsz);
13126 *errp = EFAULT;
13127 return (NULL);
13128 }
13129
13130 return (dof);
13131 }
13132
/*
 * Release a DOF buffer with kmem_free_aligned().  The size passed to the
 * allocator is read from the buffer's own dofh_loadsz header field, so the
 * header must still hold the load size the buffer was allocated with.
 */
static void
dtrace_dof_destroy(dof_hdr_t *dof)
{
	kmem_free_aligned(dof, dof->dofh_loadsz);
}
13138
13139 static dof_hdr_t *
dtrace_dof_property(const char * name)13140 dtrace_dof_property(const char *name)
13141 {
13142 unsigned int len = 0;
13143 dof_hdr_t *dof;
13144
13145 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13146 return NULL;
13147 }
13148
13149 if (!PEReadNVRAMProperty(name, NULL, &len)) {
13150 return NULL;
13151 }
13152
13153 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
13154
13155 if (!PEReadNVRAMProperty(name, dof, &len)) {
13156 dtrace_dof_destroy(dof);
13157 dtrace_dof_error(NULL, "unreadable DOF");
13158 return NULL;
13159 }
13160
13161 if (len < sizeof (dof_hdr_t)) {
13162 dtrace_dof_destroy(dof);
13163 dtrace_dof_error(NULL, "truncated header");
13164 return (NULL);
13165 }
13166
13167 if (len < dof->dofh_loadsz) {
13168 dtrace_dof_destroy(dof);
13169 dtrace_dof_error(NULL, "truncated DOF");
13170 return (NULL);
13171 }
13172
13173 if (len != dof->dofh_loadsz) {
13174 dtrace_dof_destroy(dof);
13175 dtrace_dof_error(NULL, "invalid DOF size");
13176 return (NULL);
13177 }
13178
13179 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13180 dtrace_dof_destroy(dof);
13181 dtrace_dof_error(NULL, "oversized DOF");
13182 return (NULL);
13183 }
13184
13185 return (dof);
13186 }
13187
13188 /*
13189 * Return the dof_sec_t pointer corresponding to a given section index. If the
13190 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13191 * a type other than DOF_SECT_NONE is specified, the header is checked against
13192 * this type and NULL is returned if the types do not match.
13193 */
13194 static dof_sec_t *
dtrace_dof_sect(dof_hdr_t * dof,uint32_t type,dof_secidx_t i)13195 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13196 {
13197 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13198 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13199
13200 if (i >= dof->dofh_secnum) {
13201 dtrace_dof_error(dof, "referenced section index is invalid");
13202 return (NULL);
13203 }
13204
13205 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13206 dtrace_dof_error(dof, "referenced section is not loadable");
13207 return (NULL);
13208 }
13209
13210 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13211 dtrace_dof_error(dof, "referenced section is the wrong type");
13212 return (NULL);
13213 }
13214
13215 return (sec);
13216 }
13217
13218 static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_probedesc_t * desc)13219 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13220 {
13221 dof_probedesc_t *probe;
13222 dof_sec_t *strtab;
13223 uintptr_t daddr = (uintptr_t)dof;
13224 uintptr_t str;
13225 size_t size;
13226
13227 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13228 dtrace_dof_error(dof, "invalid probe section");
13229 return (NULL);
13230 }
13231
13232 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13233 dtrace_dof_error(dof, "bad alignment in probe description");
13234 return (NULL);
13235 }
13236
13237 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13238 dtrace_dof_error(dof, "truncated probe description");
13239 return (NULL);
13240 }
13241
13242 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13243 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13244
13245 if (strtab == NULL)
13246 return (NULL);
13247
13248 str = daddr + strtab->dofs_offset;
13249 size = strtab->dofs_size;
13250
13251 if (probe->dofp_provider >= strtab->dofs_size) {
13252 dtrace_dof_error(dof, "corrupt probe provider");
13253 return (NULL);
13254 }
13255
13256 (void) strncpy(desc->dtpd_provider,
13257 (char *)(str + probe->dofp_provider),
13258 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13259
13260 /* APPLE NOTE: Darwin employs size bounded string operation. */
13261 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13262
13263 if (probe->dofp_mod >= strtab->dofs_size) {
13264 dtrace_dof_error(dof, "corrupt probe module");
13265 return (NULL);
13266 }
13267
13268 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13269 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13270
13271 /* APPLE NOTE: Darwin employs size bounded string operation. */
13272 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13273
13274 if (probe->dofp_func >= strtab->dofs_size) {
13275 dtrace_dof_error(dof, "corrupt probe function");
13276 return (NULL);
13277 }
13278
13279 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13280 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13281
13282 /* APPLE NOTE: Darwin employs size bounded string operation. */
13283 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13284
13285 if (probe->dofp_name >= strtab->dofs_size) {
13286 dtrace_dof_error(dof, "corrupt probe name");
13287 return (NULL);
13288 }
13289
13290 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13291 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13292
13293 /* APPLE NOTE: Darwin employs size bounded string operation. */
13294 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13295
13296 return (desc);
13297 }
13298
/*
 * Build a dtrace_difo_t from the DIFO header section 'sec'.
 *
 * The DIFO header lists subsections by index (dofd_links[]).  Each linked
 * subsection of a recognized type (DIF text, integer table, string table,
 * variable table) is validated and copied into its own kmem_alloc()ed buffer
 * hung off the new DIFO.  The combined size of all linked subsections may
 * not exceed dtrace_difo_maxsize.  The finished object is checked with
 * dtrace_difo_validate() and initialized with dtrace_difo_init().
 *
 * Returns the new DIFO, or NULL on any failure, in which case everything
 * allocated here has been freed again.
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;
	size_t ttl = 0;			/* running total of subsection sizes */
	dof_difohdr_t *dofd;
	uintptr_t daddr = (uintptr_t)dof;
	size_t max_size = dtrace_difo_maxsize;
	uint_t i;
	int l, n;


	/*
	 * Table of the subsection types we know how to load.  For each type:
	 * the offsets within dtrace_difo_t of the buffer pointer and of the
	 * element count, the required entry size and alignment, and the
	 * message to report if a second subsection of that type shows up.
	 * The DOF_SECT_NONE entry terminates the table.
	 */
	static const struct {
		int section;
		int bufoffs;
		int lenoffs;
		int entsize;
		int align;
		const char *msg;
	} difo[] = {
		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
		sizeof (dif_instr_t), "multiple DIF sections" },

		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
		sizeof (uint64_t), "multiple integer tables" },

		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
		offsetof(dtrace_difo_t, dtdo_strlen), 0,
		sizeof (char), "multiple string tables" },

		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
		sizeof (uint_t), "multiple variable tables" },

		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
	};

	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
		dtrace_dof_error(dof, "invalid DIFO header section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in DIFO header");
		return (NULL);
	}

	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
	    sec->dofs_size % sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad size in DIFO header");
		return (NULL);
	}

	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	/*
	 * Number of links: whatever follows the fixed header, in units of
	 * dof_secidx_t, plus one -- presumably because dof_difohdr_t itself
	 * already contains the first link (TODO confirm against the
	 * structure definition).
	 */
	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

	/* Zeroed, so every buffer pointer and length starts out NULL/0. */
	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
	dp->dtdo_rtype = dofd->dofd_rtype;

	for (l = 0; l < n; l++) {
		dof_sec_t *subsec;
		void **bufp;
		uint32_t *lenp;

		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
		    dofd->dofd_links[l])) == NULL)
			goto err; /* invalid section link */

		if (ttl + subsec->dofs_size > max_size) {
			dtrace_dof_error(dof, "exceeds maximum size");
			goto err;
		}

		ttl += subsec->dofs_size;

		/* Find the table entry matching this subsection's type. */
		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {

			if (subsec->dofs_type != (uint32_t)difo[i].section)
				continue;

			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
				dtrace_dof_error(dof, "section not loaded");
				goto err;
			}

			if (subsec->dofs_align != (uint32_t)difo[i].align) {
				dtrace_dof_error(dof, "bad alignment");
				goto err;
			}

			/* Locate the DIFO's buffer/length fields by offset. */
			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

			/* A non-NULL buffer means this type was seen before. */
			if (*bufp != NULL) {
				dtrace_dof_error(dof, difo[i].msg);
				goto err;
			}

			if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
				dtrace_dof_error(dof, "entry size mismatch");
				goto err;
			}

			if (subsec->dofs_entsize != 0 &&
			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
				dtrace_dof_error(dof, "corrupt entry size");
				goto err;
			}

			/*
			 * Copy the subsection out of the DOF.  The length is
			 * stored in bytes first and then converted to an
			 * entry count for types with a non-zero entry size;
			 * for the string table (entry size 0) it stays in
			 * bytes.
			 */
			*lenp = subsec->dofs_size;
			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
			    *bufp, subsec->dofs_size);

			if (subsec->dofs_entsize != 0)
				*lenp /= subsec->dofs_entsize;

			break;
		}

		/*
		 * If we encounter a loadable DIFO sub-section that is not
		 * known to us, assume this is a broken program and fail.
		 */
		if (difo[i].section == DOF_SECT_NONE &&
		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "unrecognized DIFO subsection");
			goto err;
		}
	}

	if (dp->dtdo_buf == NULL) {
		/*
		 * We can't have a DIF object without DIF text.
		 */
		dtrace_dof_error(dof, "missing DIF text");
		goto err;
	}

	/*
	 * Before we validate the DIF object, run through the variable table
	 * looking for the strings -- if any of their size are under, we'll set
	 * their size to be the system-wide default string size.  Note that
	 * this should _not_ happen if the "strsize" option has been set --
	 * in this case, the compiler should have set the size to reflect the
	 * setting of the option.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_diftype_t *t = &v->dtdv_type;

		/* Only variables at or above DIF_VAR_OTHER_UBASE are adjusted. */
		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
			continue;

		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
			t->dtdt_size = dtrace_strsize_default;
	}

	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
		goto err;

	dtrace_difo_init(dp, vstate);
	return (dp);

err:
	/*
	 * Free whichever buffers were populated.  The lengths are entry
	 * counts (bytes for the string table), so scale them back to the
	 * byte sizes that were allocated.  Buffers never populated are
	 * passed as NULL with a zero size.
	 */
	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
	return (NULL);
}
13475
13476 static dtrace_predicate_t *
dtrace_dof_predicate(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13477 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13478 cred_t *cr)
13479 {
13480 dtrace_difo_t *dp;
13481
13482 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13483 return (NULL);
13484
13485 return (dtrace_predicate_create(dp));
13486 }
13487
13488 static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13489 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13490 cred_t *cr)
13491 {
13492 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13493 dof_actdesc_t *desc;
13494 dof_sec_t *difosec;
13495 size_t offs;
13496 uintptr_t daddr = (uintptr_t)dof;
13497 uint64_t arg;
13498 dtrace_actkind_t kind;
13499
13500 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13501 dtrace_dof_error(dof, "invalid action section");
13502 return (NULL);
13503 }
13504
13505 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13506 dtrace_dof_error(dof, "truncated action description");
13507 return (NULL);
13508 }
13509
13510 if (sec->dofs_align != sizeof (uint64_t)) {
13511 dtrace_dof_error(dof, "bad alignment in action description");
13512 return (NULL);
13513 }
13514
13515 if (sec->dofs_size < sec->dofs_entsize) {
13516 dtrace_dof_error(dof, "section entry size exceeds total size");
13517 return (NULL);
13518 }
13519
13520 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13521 dtrace_dof_error(dof, "bad entry size in action description");
13522 return (NULL);
13523 }
13524
13525 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13526 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13527 return (NULL);
13528 }
13529
13530 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13531 desc = (dof_actdesc_t *)(daddr +
13532 (uintptr_t)sec->dofs_offset + offs);
13533 kind = (dtrace_actkind_t)desc->dofa_kind;
13534
13535 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13536 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13537 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13538 {
13539 dof_sec_t *strtab;
13540 char *str, *fmt;
13541 uint64_t i;
13542
13543 /*
13544 * The argument to these actions is an index into the
13545 * DOF string table. For printf()-like actions, this
13546 * is the format string. For print(), this is the
13547 * CTF type of the expression result.
13548 */
13549 if ((strtab = dtrace_dof_sect(dof,
13550 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13551 goto err;
13552
13553 str = (char *)((uintptr_t)dof +
13554 (uintptr_t)strtab->dofs_offset);
13555
13556 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13557 if (str[i] == '\0')
13558 break;
13559 }
13560
13561 if (i >= strtab->dofs_size) {
13562 dtrace_dof_error(dof, "bogus format string");
13563 goto err;
13564 }
13565
13566 if (i == desc->dofa_arg) {
13567 dtrace_dof_error(dof, "empty format string");
13568 goto err;
13569 }
13570
13571 i -= desc->dofa_arg;
13572 fmt = kmem_alloc(i + 1, KM_SLEEP);
13573 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13574 arg = (uint64_t)(uintptr_t)fmt;
13575 } else {
13576 if (kind == DTRACEACT_PRINTA) {
13577 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13578 arg = 0;
13579 } else {
13580 arg = desc->dofa_arg;
13581 }
13582 }
13583
13584 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13585 desc->dofa_uarg, arg);
13586
13587 if (last != NULL) {
13588 last->dtad_next = act;
13589 } else {
13590 first = act;
13591 }
13592
13593 last = act;
13594
13595 if (desc->dofa_difo == DOF_SECIDX_NONE)
13596 continue;
13597
13598 if ((difosec = dtrace_dof_sect(dof,
13599 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13600 goto err;
13601
13602 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13603
13604 if (act->dtad_difo == NULL)
13605 goto err;
13606 }
13607
13608 ASSERT(first != NULL);
13609 return (first);
13610
13611 err:
13612 for (act = first; act != NULL; act = next) {
13613 next = act->dtad_next;
13614 dtrace_actdesc_release(act, vstate);
13615 }
13616
13617 return (NULL);
13618 }
13619
13620 static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13621 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13622 cred_t *cr)
13623 {
13624 dtrace_ecbdesc_t *ep;
13625 dof_ecbdesc_t *ecb;
13626 dtrace_probedesc_t *desc;
13627 dtrace_predicate_t *pred = NULL;
13628
13629 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13630 dtrace_dof_error(dof, "Non loadable section with ECB description");
13631 return (NULL);
13632 }
13633
13634 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13635 dtrace_dof_error(dof, "truncated ECB description");
13636 return (NULL);
13637 }
13638
13639 if (sec->dofs_align != sizeof (uint64_t)) {
13640 dtrace_dof_error(dof, "bad alignment in ECB description");
13641 return (NULL);
13642 }
13643
13644 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13645 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13646
13647 if (sec == NULL)
13648 return (NULL);
13649
13650 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13651 ep->dted_uarg = ecb->dofe_uarg;
13652 desc = &ep->dted_probe;
13653
13654 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13655 goto err;
13656
13657 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13658 if ((sec = dtrace_dof_sect(dof,
13659 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13660 goto err;
13661
13662 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13663 goto err;
13664
13665 ep->dted_pred.dtpdd_predicate = pred;
13666 }
13667
13668 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13669 if ((sec = dtrace_dof_sect(dof,
13670 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13671 goto err;
13672
13673 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13674
13675 if (ep->dted_action == NULL)
13676 goto err;
13677 }
13678
13679 return (ep);
13680
13681 err:
13682 if (pred != NULL)
13683 dtrace_predicate_release(pred, vstate);
13684 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13685 return (NULL);
13686 }
13687
13688 /*
13689 * APPLE NOTE: dyld handles dof relocation.
13690 * Darwin does not need dtrace_dof_relocate()
13691 */
13692
/*
 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
 * header:  it should be at the front of a memory region that is at least
 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
 * size.  It need not be validated in any other way.
 *
 * The function validates the header and every loadable section header, and
 * then turns each DOF_SECT_ECBDESC section into an ECB description that is
 * added to an enabling.  If *enabp is NULL on entry, a new enabling is
 * created with dtrace_enabling_create() and stored there; otherwise the
 * existing one is extended.  When 'noprobes' is non-zero, any provider,
 * probe, probe-argument or probe-offset section causes failure.  'ubase' is
 * unused on Darwin.
 *
 * Returns 0 on success and -1 on failure.  If the failure occurs while
 * converting an ECB description, the enabling is destroyed and *enabp is set
 * to NULL.  dtrace_lock must be held (asserted below).
 */
static int
dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
    dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
{
#pragma unused(ubase) /* __APPLE__ */
	uint64_t len = dof->dofh_loadsz, seclen;
	uintptr_t daddr = (uintptr_t)dof;
	dtrace_ecbdesc_t *ep;
	dtrace_enabling_t *enab;
	uint_t i;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
	ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));

	/*
	 * Check the DOF header identification bytes.  In addition to checking
	 * valid settings, we also verify that unused bits/bytes are zeroed so
	 * we can use them later without fear of regressing existing binaries.
	 */
	if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
	    DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
		dtrace_dof_error(dof, "DOF magic string mismatch");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
	    dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
		dtrace_dof_error(dof, "DOF has invalid data model");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
		dtrace_dof_error(dof, "DOF encoding mismatch");
		return (-1);
	}

	/*
	 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
	 */
	if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
		dtrace_dof_error(dof, "DOF version mismatch");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
		dtrace_dof_error(dof, "DOF uses unsupported instruction set");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
		dtrace_dof_error(dof, "DOF uses too many integer registers");
		return (-1);
	}

	if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
		dtrace_dof_error(dof, "DOF uses too many tuple registers");
		return (-1);
	}

	/* Every identification byte from DOF_ID_PAD onward must be zero. */
	for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
		if (dof->dofh_ident[i] != 0) {
			dtrace_dof_error(dof, "DOF has invalid ident byte set");
			return (-1);
		}
	}

	if (dof->dofh_flags & ~DOF_FL_VALID) {
		dtrace_dof_error(dof, "DOF has invalid flag bits set");
		return (-1);
	}

	/*
	 * Section headers are accessed as dof_sec_t below, so each slot must
	 * be at least that large.
	 */
	if (dof->dofh_secsize < sizeof(dof_sec_t)) {
		dtrace_dof_error(dof, "invalid section header size");
		return (-1);
	}

	/*
	 * Check that the section headers don't exceed the amount of DOF
	 * data.  Note that we cast the section size and number of sections
	 * to uint64_t's to prevent possible overflow in the multiplication.
	 */
	seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;

	/*
	 * Each term is checked against len on its own before the sum is, so
	 * that a wrapped sum cannot slip through.
	 */
	if (dof->dofh_secoff > len || seclen > len ||
	    dof->dofh_secoff + seclen > len) {
		dtrace_dof_error(dof, "truncated section headers");
		return (-1);
	}

	if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
		dtrace_dof_error(dof, "misaligned section headers");
		return (-1);
	}

	if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
		dtrace_dof_error(dof, "misaligned section size");
		return (-1);
	}

	/*
	 * Take an initial pass through the section headers to be sure that
	 * the headers don't have stray offsets.  If the 'noprobes' flag is
	 * set, do not permit sections relating to providers, probes, or args.
	 */
	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(daddr +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (noprobes) {
			switch (sec->dofs_type) {
			case DOF_SECT_PROVIDER:
			case DOF_SECT_PROBES:
			case DOF_SECT_PRARGS:
			case DOF_SECT_PROFFS:
				dtrace_dof_error(dof, "illegal sections "
				    "for enabling");
				return (-1);
			}
		}

		if (!(sec->dofs_flags & DOF_SECF_LOAD))
			continue; /* just ignore non-loadable sections */

		/* The alignment must not have more than one bit set. */
		if (sec->dofs_align & (sec->dofs_align - 1)) {
			dtrace_dof_error(dof, "bad section alignment");
			return (-1);
		}

		if (sec->dofs_offset & (sec->dofs_align - 1)) {
			dtrace_dof_error(dof, "misaligned section");
			return (-1);
		}

		/* As above: check each term before checking the sum. */
		if (sec->dofs_offset > len || sec->dofs_size > len ||
		    sec->dofs_offset + sec->dofs_size > len) {
			dtrace_dof_error(dof, "corrupt section header");
			return (-1);
		}

		/*
		 * A string table must end in a NUL byte.
		 * NOTE(review): for a string table with dofs_size == 0 this
		 * expression examines the byte just before the section
		 * (offset - 1) -- confirm whether a zero-length string table
		 * can reach this point and should be rejected explicitly.
		 */
		if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
		    sec->dofs_offset + sec->dofs_size - 1) != '\0') {
			dtrace_dof_error(dof, "non-terminating string table");
			return (-1);
		}
	}

	/*
	 * APPLE NOTE: We have no further relocation to perform.
	 * All dof values are relative offsets.
	 */

	/* Use the caller's enabling if one was supplied, else create one. */
	if ((enab = *enabp) == NULL)
		enab = *enabp = dtrace_enabling_create(vstate);

	/*
	 * Second pass: convert each ECB description section and add it to
	 * the enabling.
	 */
	for (i = 0; i < dof->dofh_secnum; i++) {
		dof_sec_t *sec = (dof_sec_t *)(daddr +
		    (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);

		if (sec->dofs_type != DOF_SECT_ECBDESC)
			continue;

		/*
		 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
		 * not all paths out of inlined dtrace_dof_ecbdesc
		 * are checked for the NULL return value.
		 * Check for NULL explicitly here.
		 */
		ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
		if (ep == NULL) {
			dtrace_enabling_destroy(enab);
			*enabp = NULL;
			return (-1);
		}

		dtrace_enabling_add(enab, ep);
	}

	return (0);
}
13878
13879 /*
13880 * Process DOF for any options. This routine assumes that the DOF has been
13881 * at least processed by dtrace_dof_slurp().
13882 */
13883 static int
dtrace_dof_options(dof_hdr_t * dof,dtrace_state_t * state)13884 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13885 {
13886 uint_t i;
13887 int rval;
13888 uint32_t entsize;
13889 size_t offs;
13890 dof_optdesc_t *desc;
13891
13892 for (i = 0; i < dof->dofh_secnum; i++) {
13893 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13894 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13895
13896 if (sec->dofs_type != DOF_SECT_OPTDESC)
13897 continue;
13898
13899 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13900 dtrace_dof_error(dof, "Non loadable option section");
13901 return (EINVAL);
13902 }
13903
13904 if (sec->dofs_align != sizeof (uint64_t)) {
13905 dtrace_dof_error(dof, "bad alignment in "
13906 "option description");
13907 return (EINVAL);
13908 }
13909
13910 if ((entsize = sec->dofs_entsize) == 0) {
13911 dtrace_dof_error(dof, "zeroed option entry size");
13912 return (EINVAL);
13913 }
13914
13915 if (entsize < sizeof (dof_optdesc_t)) {
13916 dtrace_dof_error(dof, "bad option entry size");
13917 return (EINVAL);
13918 }
13919
13920 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13921 desc = (dof_optdesc_t *)((uintptr_t)dof +
13922 (uintptr_t)sec->dofs_offset + offs);
13923
13924 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13925 dtrace_dof_error(dof, "non-zero option string");
13926 return (EINVAL);
13927 }
13928
13929 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13930 dtrace_dof_error(dof, "unset option");
13931 return (EINVAL);
13932 }
13933
13934 if ((rval = dtrace_state_option(state,
13935 desc->dofo_option, desc->dofo_value)) != 0) {
13936 dtrace_dof_error(dof, "rejected option");
13937 return (rval);
13938 }
13939 }
13940 }
13941
13942 return (0);
13943 }
13944
13945 /*
13946 * DTrace Consumer State Functions
13947 */
13948 static int
dtrace_dstate_init(dtrace_dstate_t * dstate,size_t size)13949 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13950 {
13951 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13952 void *base;
13953 uintptr_t limit;
13954 dtrace_dynvar_t *dvar, *next, *start;
13955
13956 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13957 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13958
13959 bzero(dstate, sizeof (dtrace_dstate_t));
13960
13961 if ((dstate->dtds_chunksize = chunksize) == 0)
13962 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13963
13964 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13965
13966 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13967 size = min_size;
13968
13969 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13970 return (ENOMEM);
13971
13972 dstate->dtds_size = size;
13973 dstate->dtds_base = base;
13974 dstate->dtds_percpu = zalloc_percpu(dtrace_state_pcpu_zone, Z_WAITOK | Z_ZERO);
13975
13976 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13977
13978 if (hashsize != 1 && (hashsize & 1))
13979 hashsize--;
13980
13981 dstate->dtds_hashsize = hashsize;
13982 dstate->dtds_hash = dstate->dtds_base;
13983
13984 /*
13985 * Set all of our hash buckets to point to the single sink, and (if
13986 * it hasn't already been set), set the sink's hash value to be the
13987 * sink sentinel value. The sink is needed for dynamic variable
13988 * lookups to know that they have iterated over an entire, valid hash
13989 * chain.
13990 */
13991 for (size_t i = 0; i < hashsize; i++)
13992 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13993
13994 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13995 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13996
13997 /*
13998 * Determine number of active CPUs. Divide free list evenly among
13999 * active CPUs.
14000 */
14001 start = (dtrace_dynvar_t *)
14002 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
14003 limit = (uintptr_t)base + size;
14004
14005 VERIFY((uintptr_t)start < limit);
14006 VERIFY((uintptr_t)start >= (uintptr_t)base);
14007
14008 maxper = (limit - (uintptr_t)start) / (int)NCPU;
14009 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
14010
14011 zpercpu_foreach_cpu(i) {
14012 dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
14013
14014 dcpu->dtdsc_free = dvar = start;
14015
14016 /*
14017 * If we don't even have enough chunks to make it once through
14018 * NCPUs, we're just going to allocate everything to the first
14019 * CPU. And if we're on the last CPU, we're going to allocate
14020 * whatever is left over. In either case, we set the limit to
14021 * be the limit of the dynamic variable space.
14022 */
14023 if (maxper == 0 || i == NCPU - 1) {
14024 limit = (uintptr_t)base + size;
14025 start = NULL;
14026 } else {
14027 limit = (uintptr_t)start + maxper;
14028 start = (dtrace_dynvar_t *)limit;
14029 }
14030
14031 VERIFY(limit <= (uintptr_t)base + size);
14032
14033 for (;;) {
14034 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14035 dstate->dtds_chunksize);
14036
14037 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14038 break;
14039
14040 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
14041 (uintptr_t)dvar <= (uintptr_t)base + size);
14042 dvar->dtdv_next = next;
14043 dvar = next;
14044 }
14045
14046 if (maxper == 0)
14047 break;
14048 }
14049
14050 return (0);
14051 }
14052
14053 static void
dtrace_dstate_fini(dtrace_dstate_t * dstate)14054 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14055 {
14056 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14057
14058 if (dstate->dtds_base == NULL)
14059 return;
14060
14061 kmem_free(dstate->dtds_base, dstate->dtds_size);
14062 zfree_percpu(dtrace_state_pcpu_zone, dstate->dtds_percpu);
14063 }
14064
14065 static void
dtrace_vstate_fini(dtrace_vstate_t * vstate)14066 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14067 {
14068 /*
14069 * Logical XOR, where are you?
14070 */
14071 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14072
14073 if (vstate->dtvs_nglobals > 0) {
14074 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14075 sizeof (dtrace_statvar_t *));
14076 }
14077
14078 if (vstate->dtvs_ntlocals > 0) {
14079 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14080 sizeof (dtrace_difv_t));
14081 }
14082
14083 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14084
14085 if (vstate->dtvs_nlocals > 0) {
14086 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14087 sizeof (dtrace_statvar_t *));
14088 }
14089 }
14090
14091 static void
dtrace_state_clean(dtrace_state_t * state)14092 dtrace_state_clean(dtrace_state_t *state)
14093 {
14094 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14095 return;
14096
14097 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14098 dtrace_speculation_clean(state);
14099 }
14100
14101 static void
dtrace_state_deadman(dtrace_state_t * state)14102 dtrace_state_deadman(dtrace_state_t *state)
14103 {
14104 hrtime_t now;
14105
14106 dtrace_sync();
14107
14108 now = dtrace_gethrtime();
14109
14110 if (state != dtrace_anon.dta_state &&
14111 now - state->dts_laststatus >= dtrace_deadman_user)
14112 return;
14113
14114 /*
14115 * We must be sure that dts_alive never appears to be less than the
14116 * value upon entry to dtrace_state_deadman(), and because we lack a
14117 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14118 * store INT64_MAX to it, followed by a memory barrier, followed by
14119 * the new value. This assures that dts_alive never appears to be
14120 * less than its true value, regardless of the order in which the
14121 * stores to the underlying storage are issued.
14122 */
14123 state->dts_alive = INT64_MAX;
14124 dtrace_membar_producer();
14125 state->dts_alive = now;
14126 }
14127
/*
 * Allocate and initialize a new consumer state.
 *
 * devp      - if non-NULL, supplies the minor (and major) number to use and
 *             is overwritten with the device number of the new state; if
 *             NULL, minor DTRACE_NCLIENTS - 1 and the major of dtrace_devi
 *             are used.
 * cr        - credential of the consumer, or NULL.
 * new_state - set to NULL on entry and to the new state on success.
 *
 * The caller must hold both dtrace_lock and cpu_lock.  Returns 0 on
 * success, or ERESTART if dtrace_state_allocate() could not provide a state
 * for the minor number.
 */
static int
dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
{
	minor_t minor;
	major_t major;
	char c[30];
	dtrace_state_t *state;
	dtrace_optval_t *opt;
	int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
	unsigned int cpu_it;

	LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
	LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);

	/* Cause restart */
	*new_state = NULL;

	if (devp != NULL) {
		minor = getminor(*devp);
	}
	else {
		minor = DTRACE_NCLIENTS - 1;
	}

	state = dtrace_state_allocate(minor);
	if (NULL == state) {
		printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
		return (ERESTART);	/* can't reacquire */
	}

	state->dts_epid = DTRACE_EPIDNONE + 1;

	/*
	 * Create the arena from which aggregation IDs are allocated; its
	 * name is derived from the minor number.
	 */
	(void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
	state->dts_aggid_arena = vmem_create(c, (void *)1, INT32_MAX, 1,
	    NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);

	if (devp != NULL) {
		major = getemajor(*devp);
	} else {
		major = ddi_driver_major(dtrace_devi);
	}

	state->dts_dev = makedev(major, minor);

	/* Report the device number actually used back to the caller. */
	if (devp != NULL)
		*devp = state->dts_dev;

	/*
	 * We allocate NCPU buffers.  On the one hand, this can be quite
	 * a bit of memory per instance (nearly 36K on a Starcat).  On the
	 * other hand, it saves an additional memory reference in the probe
	 * path.
	 */
	state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
	state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
	state->dts_buf_over_limit = 0;

	/*
	 * Allocate and initialise the per-process per-CPU random state.
	 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
	 * assumed to be seeded at this point (if from Fortuna seed file).
	 *
	 * Slot 0 is filled from read_random(); every later slot is derived
	 * from the slot before it.
	 */
	state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
	state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
	(void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
	for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
		state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
		/*
		 * Each CPU is assigned a 2^64 period, non-overlapping
		 * subsequence.
		 */
		dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
		    state->dts_rstate[cpu_it]);
	}

	state->dts_cleaner = CYCLIC_NONE;
	state->dts_deadman = CYCLIC_NONE;
	state->dts_vstate.dtvs_state = state;

	/* Start with every option unset, then apply the defaults below. */
	for (i = 0; i < DTRACEOPT_MAX; i++)
		state->dts_options[i] = DTRACEOPT_UNSET;

	/*
	 * Set the default options.
	 */
	opt = state->dts_options;
	opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
	opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
	opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
	opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
	opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
	opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
	opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
	opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
	opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
	opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
	opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
	opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
	opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
	opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
	opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;

	/*
	 * Depending on the user credentials, we set flag bits which alter probe
	 * visibility or the amount of destructiveness allowed.  In the case of
	 * actual anonymous tracing, or the possession of all privileges, all of
	 * the normal checks are bypassed.
	 */
#if defined(__APPLE__)
	/*
	 * Take a reference on any credential supplied and remember it in
	 * the state.
	 */
	if (cr != NULL) {
		kauth_cred_ref(cr);
		state->dts_cred.dcr_cred = cr;
	}
	/*
	 * Visibility and action bits are only assigned here when there is no
	 * credential or the credential passes the PRIV_ALL policy check; in
	 * any other case this function does not modify them.
	 */
	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
		if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
			/*
			 * Allow only proc credentials when DTrace is
			 * restricted by the current security policy
			 */
			state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
			state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
		}
		else {
			state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
			state->dts_cred.dcr_action = DTRACE_CRA_ALL;
		}
	}

#else
	if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
		state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
		state->dts_cred.dcr_action = DTRACE_CRA_ALL;
	}
	else {
		/*
		 * Set up the credentials for this instantiation.  We take a
		 * hold on the credential to prevent it from disappearing on
		 * us; this in turn prevents the zone_t referenced by this
		 * credential from disappearing.  This means that we can
		 * examine the credential and the zone from probe context.
		 */
		crhold(cr);
		state->dts_cred.dcr_cred = cr;

		/*
		 * CRA_PROC means "we have *some* privilege for dtrace" and
		 * unlocks the use of variables like pid, zonename, etc.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
		    PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
		}

		/*
		 * dtrace_user allows use of syscall and profile providers.
		 * If the user also has proc_owner and/or proc_zone, we
		 * extend the scope to include additional visibility and
		 * destructive power.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLPROC;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
			}

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
				state->dts_cred.dcr_visible |=
				    DTRACE_CRV_ALLZONE;

				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
			}

			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 *
			 * APPLE NOTE: Darwin doesn't do zones.
			 * Behave as if zone always has destructive privs.
			 */

			state->dts_cred.dcr_action |=
			    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
		}

		/*
		 * Holding the dtrace_kernel privilege also implies that
		 * the user has the dtrace_user privilege from a visibility
		 * perspective.  But without further privileges, some
		 * destructive actions are not available.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
			/*
			 * Make all probes in all zones visible.  However,
			 * this doesn't mean that all actions become available
			 * to all zones.
			 */
			state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
			    DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;

			state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
			    DTRACE_CRA_PROC;
			/*
			 * Holding proc_owner means that destructive actions
			 * for *this* zone are allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			/*
			 * Holding proc_zone means that destructive actions
			 * for this user/group ID in all zones is allowed.
			 */
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;

			/*
			 * If we have all privs in whatever zone this is,
			 * we can do destructive things to processes which
			 * have altered credentials.
			 *
			 * APPLE NOTE: Darwin doesn't do zones.
			 * Behave as if zone always has destructive privs.
			 */
			state->dts_cred.dcr_action |=
			    DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
		}

		/*
		 * Holding the dtrace_proc privilege gives control over fasttrap
		 * and pid providers.  We need to grant wider destructive
		 * privileges in the event that the user has proc_owner and/or
		 * proc_zone.
		 */
		if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;

			if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
				state->dts_cred.dcr_action |=
				    DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
		}
	}
#endif

	*new_state = state;
	return(0);  /* Success */
}
14383
14384 static int
dtrace_state_buffer(dtrace_state_t * state,dtrace_buffer_t * buf,int which)14385 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14386 {
14387 dtrace_optval_t *opt = state->dts_options, size;
14388 processorid_t cpu = 0;
14389 size_t limit = buf->dtb_size;
14390 int flags = 0, rval;
14391
14392 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14393 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14394 ASSERT(which < DTRACEOPT_MAX);
14395 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14396 (state == dtrace_anon.dta_state &&
14397 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14398
14399 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14400 return (0);
14401
14402 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14403 cpu = opt[DTRACEOPT_CPU];
14404
14405 if (which == DTRACEOPT_SPECSIZE)
14406 flags |= DTRACEBUF_NOSWITCH;
14407
14408 if (which == DTRACEOPT_BUFSIZE) {
14409 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14410 flags |= DTRACEBUF_RING;
14411
14412 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14413 flags |= DTRACEBUF_FILL;
14414
14415 if (state != dtrace_anon.dta_state ||
14416 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14417 flags |= DTRACEBUF_INACTIVE;
14418 }
14419
14420 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14421 /*
14422 * The size must be 8-byte aligned. If the size is not 8-byte
14423 * aligned, drop it down by the difference.
14424 */
14425 if (size & (sizeof (uint64_t) - 1))
14426 size -= size & (sizeof (uint64_t) - 1);
14427
14428 if (size < state->dts_reserve) {
14429 /*
14430 * Buffers always must be large enough to accommodate
14431 * their prereserved space. We return E2BIG instead
14432 * of ENOMEM in this case to allow for user-level
14433 * software to differentiate the cases.
14434 */
14435 return (E2BIG);
14436 }
14437 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14438 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14439
14440 if (rval != ENOMEM) {
14441 opt[which] = size;
14442 return (rval);
14443 }
14444
14445 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14446 return (rval);
14447 }
14448
14449 return (ENOMEM);
14450 }
14451
14452 static int
dtrace_state_buffers(dtrace_state_t * state)14453 dtrace_state_buffers(dtrace_state_t *state)
14454 {
14455 dtrace_speculation_t *spec = state->dts_speculations;
14456 int rval, i;
14457
14458 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14459 DTRACEOPT_BUFSIZE)) != 0)
14460 return (rval);
14461
14462 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14463 DTRACEOPT_AGGSIZE)) != 0)
14464 return (rval);
14465
14466 for (i = 0; i < state->dts_nspeculations; i++) {
14467 if ((rval = dtrace_state_buffer(state,
14468 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14469 return (rval);
14470 }
14471
14472 return (0);
14473 }
14474
14475 static void
dtrace_state_prereserve(dtrace_state_t * state)14476 dtrace_state_prereserve(dtrace_state_t *state)
14477 {
14478 dtrace_ecb_t *ecb;
14479 dtrace_probe_t *probe;
14480
14481 state->dts_reserve = 0;
14482
14483 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14484 return;
14485
14486 /*
14487 * If our buffer policy is a "fill" buffer policy, we need to set the
14488 * prereserved space to be the space required by the END probes.
14489 */
14490 probe = dtrace_probes[dtrace_probeid_end - 1];
14491 ASSERT(probe != NULL);
14492
14493 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14494 if (ecb->dte_state != state)
14495 continue;
14496
14497 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14498 }
14499 }
14500
/*
 * Start tracing for 'state'.
 *
 * Primes the retained enablings for the state, allocates the speculations
 * and all buffers, initializes the dynamic variable state, clamps several
 * rate/size options to their limits, installs the cleaner and deadman
 * cyclics, fires the BEGIN probe and finally activates the buffers on all
 * CPUs.
 *
 * state - the consumer state to start; must currently be inactive.
 * cpu   - out: set to the CPU on which the BEGIN probe was fired.  When an
 *         anonymous state is grabbed it is first set to
 *         dtrace_anon.dta_beganon.
 *
 * Acquires cpu_lock and then dtrace_lock, and drops both before returning.
 *
 * Returns 0 on success (including the case where an already-active
 * anonymous state was grabbed), or: EBUSY if the state is not inactive,
 * EACCES if destructive actions are present but not permitted, ENOMEM on
 * allocation failure or an over-large speculation count, ENOENT/EALREADY
 * for a failed anonymous grab, ENOSPC if a required buffer size is too
 * small, or whatever dtrace_state_buffers()/dtrace_dstate_init() returned.
 */
static int
dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
{
	dtrace_optval_t *opt = state->dts_options, sz, nspec;
	dtrace_speculation_t *spec;
	dtrace_buffer_t *buf;
	cyc_handler_t hdlr;
	cyc_time_t when;
	int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
	dtrace_icookie_t cookie;

	lck_mtx_lock(&cpu_lock);
	lck_mtx_lock(&dtrace_lock);

	if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
		rval = EBUSY;
		goto out;
	}

	/*
	 * Before we can perform any checks, we must prime all of the
	 * retained enablings that correspond to this state.
	 */
	dtrace_enabling_prime(state);

	if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
		rval = EACCES;
		goto out;
	}

	dtrace_state_prereserve(state);

	/*
	 * Now we want to do is try to allocate our speculations.
	 * We do not automatically resize the number of speculations; if
	 * this fails, we will fail the operation.
	 */
	nspec = opt[DTRACEOPT_NSPEC];
	ASSERT(nspec != DTRACEOPT_UNSET);

	/* dts_nspeculations is stored as an int below, so bound it first. */
	if (nspec > INT_MAX) {
		rval = ENOMEM;
		goto out;
	}

	spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);

	if (spec == NULL) {
		rval = ENOMEM;
		goto out;
	}

	state->dts_speculations = spec;
	state->dts_nspeculations = (int)nspec;

	/*
	 * Give each speculation its own array of per-CPU buffer headers.
	 * On failure, the 'err' path releases the ones allocated so far.
	 */
	for (i = 0; i < nspec; i++) {
		if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
			rval = ENOMEM;
			goto err;
		}

		spec[i].dtsp_buffer = buf;
	}

	if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
		if (dtrace_anon.dta_state == NULL) {
			rval = ENOENT;
			goto out;
		}

		if (state->dts_necbs != 0) {
			rval = EALREADY;
			goto out;
		}

		/*
		 * From here on 'state' refers to the grabbed anonymous
		 * state.  Note that 'opt' is not reassigned and so still
		 * points at the options of the grabbing state.
		 */
		state->dts_anon = dtrace_anon_grab();
		ASSERT(state->dts_anon != NULL);
		state = state->dts_anon;

		/*
		 * We want "grabanon" to be set in the grabbed state, so we'll
		 * copy that option value from the grabbing state into the
		 * grabbed state.
		 */
		state->dts_options[DTRACEOPT_GRABANON] =
		    opt[DTRACEOPT_GRABANON];

		*cpu = dtrace_anon.dta_beganon;

		/*
		 * If the anonymous state is active (as it almost certainly
		 * is if the anonymous enabling ultimately matched anything),
		 * we don't allow any further option processing -- but we
		 * don't return failure.
		 */
		if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
			goto out;
	}

	if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_AGGSIZE] != 0) {
		if (state->dts_aggregations == NULL) {
			/*
			 * We're not going to create an aggregation buffer
			 * because we don't have any ECBs that contain
			 * aggregations -- set this option to 0.
			 */
			opt[DTRACEOPT_AGGSIZE] = 0;
		} else {
			/*
			 * If we have an aggregation buffer, we must also have
			 * a buffer to use as scratch.
			 */
			if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
			    (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
				opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
			}
		}
	}

	if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
	    opt[DTRACEOPT_SPECSIZE] != 0) {
		if (!state->dts_speculates) {
			/*
			 * We're not going to create speculation buffers
			 * because we don't have any ECBs that actually
			 * speculate -- set the speculation size to 0.
			 */
			opt[DTRACEOPT_SPECSIZE] = 0;
		}
	}

	/*
	 * The bare minimum size for any buffer that we're actually going to
	 * do anything to is sizeof (uint64_t).
	 */
	sz = sizeof (uint64_t);

	if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
	    (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
	    (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
		/*
		 * A buffer size has been explicitly set to 0 (or to a size
		 * that will be adjusted to 0) and we need the space -- we
		 * need to return failure.  We return ENOSPC to differentiate
		 * it from failing to allocate a buffer due to failure to meet
		 * the reserve (for which we return E2BIG).
		 */
		rval = ENOSPC;
		goto out;
	}

	if ((rval = dtrace_state_buffers(state)) != 0)
		goto err;

	if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
		sz = dtrace_dstate_defsize;

	/*
	 * Try to set up the dynamic variable state, halving the requested
	 * size after each failure until it reaches zero -- unless the resize
	 * policy is manual, in which case the first failure is final.
	 */
	do {
		rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);

		if (rval == 0)
			break;

		if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
			goto err;
	} while (sz >>= 1);

	/* Record the size that was last tried. */
	opt[DTRACEOPT_DYNVARSIZE] = sz;

	if (rval != 0)
		goto err;

	/*
	 * Clamp the rate and size options to their permitted ranges.  A
	 * clean rate of zero is replaced by the maximum.
	 */
	if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
		opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;

	if (opt[DTRACEOPT_CLEANRATE] == 0)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;

	if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
		opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;

	if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
		opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;

	if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
		opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;

	if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
		opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;

	if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
		opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;

	/*
	 * Install the cleaner cyclic, which calls dtrace_state_clean() at
	 * the clean rate.
	 */
	hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = opt[DTRACEOPT_CLEANRATE];

	state->dts_cleaner = cyclic_add(&hdlr, &when);

	/*
	 * Install the deadman cyclic, which calls dtrace_state_deadman()
	 * every dtrace_deadman_interval.
	 */
	hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
	hdlr.cyh_arg = state;
	hdlr.cyh_level = CY_LOW_LEVEL;

	when.cyt_when = 0;
	when.cyt_interval = dtrace_deadman_interval;

	state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
	state->dts_deadman = cyclic_add(&hdlr, &when);

	state->dts_activity = DTRACE_ACTIVITY_WARMUP;

	/*
	 * Now it's time to actually fire the BEGIN probe.  We need to disable
	 * interrupts here both to record the CPU on which we fired the BEGIN
	 * probe (the data from this CPU will be processed first at user
	 * level) and to manually activate the buffer for this CPU.
	 */
	cookie = dtrace_interrupt_disable();
	*cpu = CPU->cpu_id;
	ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
	state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;

	dtrace_probe(dtrace_probeid_begin,
	    (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
	dtrace_interrupt_enable(cookie);
	/*
	 * We may have had an exit action from a BEGIN probe; only change our
	 * state to ACTIVE if we're still in WARMUP.
	 */
	ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
	    state->dts_activity == DTRACE_ACTIVITY_DRAINING);

	if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
		state->dts_activity = DTRACE_ACTIVITY_ACTIVE;

	/*
	 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
	 * want each CPU to transition its principal buffer out of the
	 * INACTIVE state.  Doing this assures that no CPU will suddenly begin
	 * processing an ECB halfway down a probe's ECB chain; all CPUs will
	 * atomically transition from processing none of a state's ECBs to
	 * processing all of them.
	 */
	dtrace_xcall(DTRACE_CPUALL,
	    (dtrace_xcall_t)dtrace_buffer_activate, state);
	goto out;

err:
	/*
	 * Failure after buffers may have been allocated: release the
	 * principal and aggregation buffers, then every speculation buffer
	 * together with the speculation array itself.
	 */
	dtrace_buffer_free(state->dts_buffer);
	dtrace_buffer_free(state->dts_aggbuffer);

	if ((nspec = state->dts_nspeculations) == 0) {
		ASSERT(state->dts_speculations == NULL);
		goto out;
	}

	spec = state->dts_speculations;
	ASSERT(spec != NULL);

	for (i = 0; i < state->dts_nspeculations; i++) {
		/*
		 * Buffers are populated in order, so the first NULL marks
		 * the end of what was allocated.
		 */
		if ((buf = spec[i].dtsp_buffer) == NULL)
			break;

		dtrace_buffer_free(buf);
		kmem_free(buf, bufsize);
	}

	kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
	state->dts_nspeculations = 0;
	state->dts_speculations = NULL;

out:
	/* Drop the locks in the reverse of the order they were taken. */
	lck_mtx_unlock(&dtrace_lock);
	lck_mtx_unlock(&cpu_lock);

	return (rval);
}
14785
14786 static int
dtrace_state_stop(dtrace_state_t * state,processorid_t * cpu)14787 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14788 {
14789 dtrace_icookie_t cookie;
14790
14791 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14792
14793 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14794 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14795 return (EINVAL);
14796
14797 /*
14798 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14799 * to be sure that every CPU has seen it. See below for the details
14800 * on why this is done.
14801 */
14802 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14803 dtrace_sync();
14804
14805 /*
14806 * By this point, it is impossible for any CPU to be still processing
14807 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14808 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14809 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14810 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14811 * iff we're in the END probe.
14812 */
14813 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14814 dtrace_sync();
14815 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14816
14817 /*
14818 * Finally, we can release the reserve and call the END probe. We
14819 * disable interrupts across calling the END probe to allow us to
14820 * return the CPU on which we actually called the END probe. This
14821 * allows user-land to be sure that this CPU's principal buffer is
14822 * processed last.
14823 */
14824 state->dts_reserve = 0;
14825
14826 cookie = dtrace_interrupt_disable();
14827 *cpu = CPU->cpu_id;
14828 dtrace_probe(dtrace_probeid_end,
14829 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14830 dtrace_interrupt_enable(cookie);
14831
14832 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14833 dtrace_sync();
14834
14835 return (0);
14836 }
14837
14838 static int
dtrace_state_option(dtrace_state_t * state,dtrace_optid_t option,dtrace_optval_t val)14839 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14840 dtrace_optval_t val)
14841 {
14842 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14843
14844 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14845 return (EBUSY);
14846
14847 if (option >= DTRACEOPT_MAX)
14848 return (EINVAL);
14849
14850 if (option != DTRACEOPT_CPU && val < 0)
14851 return (EINVAL);
14852
14853 switch (option) {
14854 case DTRACEOPT_DESTRUCTIVE:
14855 if (dtrace_destructive_disallow)
14856 return (EACCES);
14857
14858 state->dts_cred.dcr_destructive = 1;
14859 break;
14860
14861 case DTRACEOPT_BUFSIZE:
14862 case DTRACEOPT_DYNVARSIZE:
14863 case DTRACEOPT_AGGSIZE:
14864 case DTRACEOPT_SPECSIZE:
14865 case DTRACEOPT_STRSIZE:
14866 if (val < 0)
14867 return (EINVAL);
14868
14869 if (val >= LONG_MAX) {
14870 /*
14871 * If this is an otherwise negative value, set it to
14872 * the highest multiple of 128m less than LONG_MAX.
14873 * Technically, we're adjusting the size without
14874 * regard to the buffer resizing policy, but in fact,
14875 * this has no effect -- if we set the buffer size to
14876 * ~LONG_MAX and the buffer policy is ultimately set to
14877 * be "manual", the buffer allocation is guaranteed to
14878 * fail, if only because the allocation requires two
14879 * buffers. (We set the the size to the highest
14880 * multiple of 128m because it ensures that the size
14881 * will remain a multiple of a megabyte when
14882 * repeatedly halved -- all the way down to 15m.)
14883 */
14884 val = LONG_MAX - (1 << 27) + 1;
14885 }
14886 }
14887
14888 state->dts_options[option] = val;
14889
14890 return (0);
14891 }
14892
14893 static void
dtrace_state_destroy(dtrace_state_t * state)14894 dtrace_state_destroy(dtrace_state_t *state)
14895 {
14896 dtrace_ecb_t *ecb;
14897 dtrace_vstate_t *vstate = &state->dts_vstate;
14898 minor_t minor = getminor(state->dts_dev);
14899 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14900 dtrace_speculation_t *spec = state->dts_speculations;
14901 int nspec = state->dts_nspeculations;
14902 uint32_t match;
14903
14904 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14905 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14906
14907 /*
14908 * First, retract any retained enablings for this state.
14909 */
14910 dtrace_enabling_retract(state);
14911 ASSERT(state->dts_nretained == 0);
14912
14913 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14914 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14915 /*
14916 * We have managed to come into dtrace_state_destroy() on a
14917 * hot enabling -- almost certainly because of a disorderly
14918 * shutdown of a consumer. (That is, a consumer that is
14919 * exiting without having called dtrace_stop().) In this case,
14920 * we're going to set our activity to be KILLED, and then
14921 * issue a sync to be sure that everyone is out of probe
14922 * context before we start blowing away ECBs.
14923 */
14924 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14925 dtrace_sync();
14926 }
14927
14928 /*
14929 * Release the credential hold we took in dtrace_state_create().
14930 */
14931 if (state->dts_cred.dcr_cred != NULL)
14932 kauth_cred_unref(&state->dts_cred.dcr_cred);
14933
14934 /*
14935 * Now we can safely disable and destroy any enabled probes. Because
14936 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14937 * (especially if they're all enabled), we take two passes through the
14938 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14939 * in the second we disable whatever is left over.
14940 */
14941 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14942 for (i = 0; i < state->dts_necbs; i++) {
14943 if ((ecb = state->dts_ecbs[i]) == NULL)
14944 continue;
14945
14946 if (match && ecb->dte_probe != NULL) {
14947 dtrace_probe_t *probe = ecb->dte_probe;
14948 dtrace_provider_t *prov = probe->dtpr_provider;
14949
14950 if (!(prov->dtpv_priv.dtpp_flags & match))
14951 continue;
14952 }
14953
14954 dtrace_ecb_disable(ecb);
14955 dtrace_ecb_destroy(ecb);
14956 }
14957
14958 if (!match)
14959 break;
14960 }
14961
14962 /*
14963 * Before we free the buffers, perform one more sync to assure that
14964 * every CPU is out of probe context.
14965 */
14966 dtrace_sync();
14967
14968 dtrace_buffer_free(state->dts_buffer);
14969 dtrace_buffer_free(state->dts_aggbuffer);
14970
14971 for (i = 0; i < (int)NCPU; i++) {
14972 kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
14973 }
14974 kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14975
14976 for (i = 0; i < nspec; i++)
14977 dtrace_buffer_free(spec[i].dtsp_buffer);
14978
14979 if (state->dts_cleaner != CYCLIC_NONE)
14980 cyclic_remove(state->dts_cleaner);
14981
14982 if (state->dts_deadman != CYCLIC_NONE)
14983 cyclic_remove(state->dts_deadman);
14984
14985 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14986 dtrace_vstate_fini(vstate);
14987 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14988
14989 if (state->dts_aggregations != NULL) {
14990 #if DEBUG
14991 for (i = 0; i < state->dts_naggregations; i++)
14992 ASSERT(state->dts_aggregations[i] == NULL);
14993 #endif
14994 ASSERT(state->dts_naggregations > 0);
14995 kmem_free(state->dts_aggregations,
14996 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14997 }
14998
14999 kmem_free(state->dts_buffer, bufsize);
15000 kmem_free(state->dts_aggbuffer, bufsize);
15001
15002 for (i = 0; i < nspec; i++)
15003 kmem_free(spec[i].dtsp_buffer, bufsize);
15004
15005 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
15006
15007 dtrace_format_destroy(state);
15008
15009 vmem_destroy(state->dts_aggid_arena);
15010 dtrace_state_free(minor);
15011 }
15012
15013 /*
15014 * DTrace Anonymous Enabling Functions
15015 */
15016
15017 int
dtrace_keep_kernel_symbols(void)15018 dtrace_keep_kernel_symbols(void)
15019 {
15020 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
15021 return 0;
15022 }
15023
15024 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
15025 return 1;
15026
15027 return 0;
15028 }
15029
15030 static dtrace_state_t *
dtrace_anon_grab(void)15031 dtrace_anon_grab(void)
15032 {
15033 dtrace_state_t *state;
15034
15035 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15036
15037 if ((state = dtrace_anon.dta_state) == NULL) {
15038 ASSERT(dtrace_anon.dta_enabling == NULL);
15039 return (NULL);
15040 }
15041
15042 ASSERT(dtrace_anon.dta_enabling != NULL);
15043 ASSERT(dtrace_retained != NULL);
15044
15045 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15046 dtrace_anon.dta_enabling = NULL;
15047 dtrace_anon.dta_state = NULL;
15048
15049 return (state);
15050 }
15051
15052 static void
dtrace_anon_property(void)15053 dtrace_anon_property(void)
15054 {
15055 int i, rv;
15056 dtrace_state_t *state;
15057 dof_hdr_t *dof;
15058 char c[32]; /* enough for "dof-data-" + digits */
15059
15060 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15061 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15062
15063 for (i = 0; ; i++) {
15064 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15065
15066 dtrace_err_verbose = 1;
15067
15068 if ((dof = dtrace_dof_property(c)) == NULL) {
15069 dtrace_err_verbose = 0;
15070 break;
15071 }
15072
15073 #ifdef illumos
15074 /*
15075 * We want to create anonymous state, so we need to transition
15076 * the kernel debugger to indicate that DTrace is active. If
15077 * this fails (e.g. because the debugger has modified text in
15078 * some way), we won't continue with the processing.
15079 */
15080 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15081 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15082 "enabling ignored.");
15083 dtrace_dof_destroy(dof);
15084 break;
15085 }
15086 #endif
15087
15088 /*
15089 * If we haven't allocated an anonymous state, we'll do so now.
15090 */
15091 if ((state = dtrace_anon.dta_state) == NULL) {
15092 rv = dtrace_state_create(NULL, NULL, &state);
15093 dtrace_anon.dta_state = state;
15094 if (rv != 0 || state == NULL) {
15095 /*
15096 * This basically shouldn't happen: the only
15097 * failure mode from dtrace_state_create() is a
15098 * failure of ddi_soft_state_zalloc() that
15099 * itself should never happen. Still, the
15100 * interface allows for a failure mode, and
15101 * we want to fail as gracefully as possible:
15102 * we'll emit an error message and cease
15103 * processing anonymous state in this case.
15104 */
15105 cmn_err(CE_WARN, "failed to create "
15106 "anonymous state");
15107 dtrace_dof_destroy(dof);
15108 break;
15109 }
15110 }
15111
15112 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15113 &dtrace_anon.dta_enabling, 0, B_TRUE);
15114
15115 if (rv == 0)
15116 rv = dtrace_dof_options(dof, state);
15117
15118 dtrace_err_verbose = 0;
15119 dtrace_dof_destroy(dof);
15120
15121 if (rv != 0) {
15122 /*
15123 * This is malformed DOF; chuck any anonymous state
15124 * that we created.
15125 */
15126 ASSERT(dtrace_anon.dta_enabling == NULL);
15127 dtrace_state_destroy(state);
15128 dtrace_anon.dta_state = NULL;
15129 break;
15130 }
15131
15132 ASSERT(dtrace_anon.dta_enabling != NULL);
15133 }
15134
15135 if (dtrace_anon.dta_enabling != NULL) {
15136 int rval;
15137
15138 /*
15139 * dtrace_enabling_retain() can only fail because we are
15140 * trying to retain more enablings than are allowed -- but
15141 * we only have one anonymous enabling, and we are guaranteed
15142 * to be allowed at least one retained enabling; we assert
15143 * that dtrace_enabling_retain() returns success.
15144 */
15145 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15146 ASSERT(rval == 0);
15147
15148 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15149 }
15150 }
15151
15152 /*
15153 * DTrace Helper Functions
15154 */
15155 static void
dtrace_helper_trace(dtrace_helper_action_t * helper,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate,int where)15156 dtrace_helper_trace(dtrace_helper_action_t *helper,
15157 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15158 {
15159 uint32_t size, next, nnext;
15160 int i;
15161 dtrace_helptrace_t *ent;
15162 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15163
15164 if (!dtrace_helptrace_enabled)
15165 return;
15166
15167 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15168
15169 /*
15170 * What would a tracing framework be without its own tracing
15171 * framework? (Well, a hell of a lot simpler, for starters...)
15172 */
15173 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15174 sizeof (uint64_t) - sizeof (uint64_t);
15175
15176 /*
15177 * Iterate until we can allocate a slot in the trace buffer.
15178 */
15179 do {
15180 next = dtrace_helptrace_next;
15181
15182 if (next + size < dtrace_helptrace_bufsize) {
15183 nnext = next + size;
15184 } else {
15185 nnext = size;
15186 }
15187 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15188
15189 /*
15190 * We have our slot; fill it in.
15191 */
15192 if (nnext == size)
15193 next = 0;
15194
15195 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15196 ent->dtht_helper = helper;
15197 ent->dtht_where = where;
15198 ent->dtht_nlocals = vstate->dtvs_nlocals;
15199
15200 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15201 mstate->dtms_fltoffs : -1;
15202 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15203 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15204
15205 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15206 dtrace_statvar_t *svar;
15207
15208 if ((svar = vstate->dtvs_locals[i]) == NULL)
15209 continue;
15210
15211 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15212 ent->dtht_locals[i] =
15213 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15214 }
15215 }
15216
15217 __attribute__((noinline))
15218 static uint64_t
dtrace_helper(int which,dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t arg0,uint64_t arg1)15219 dtrace_helper(int which, dtrace_mstate_t *mstate,
15220 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15221 {
15222 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15223 uint64_t sarg0 = mstate->dtms_arg[0];
15224 uint64_t sarg1 = mstate->dtms_arg[1];
15225 uint64_t rval = 0;
15226 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15227 dtrace_helper_action_t *helper;
15228 dtrace_vstate_t *vstate;
15229 dtrace_difo_t *pred;
15230 int i, trace = dtrace_helptrace_enabled;
15231
15232 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15233
15234 if (helpers == NULL)
15235 return (0);
15236
15237 if ((helper = helpers->dthps_actions[which]) == NULL)
15238 return (0);
15239
15240 vstate = &helpers->dthps_vstate;
15241 mstate->dtms_arg[0] = arg0;
15242 mstate->dtms_arg[1] = arg1;
15243
15244 /*
15245 * Now iterate over each helper. If its predicate evaluates to 'true',
15246 * we'll call the corresponding actions. Note that the below calls
15247 * to dtrace_dif_emulate() may set faults in machine state. This is
15248 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15249 * the stored DIF offset with its own (which is the desired behavior).
15250 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15251 * from machine state; this is okay, too.
15252 */
15253 for (; helper != NULL; helper = helper->dtha_next) {
15254 if ((pred = helper->dtha_predicate) != NULL) {
15255 if (trace)
15256 dtrace_helper_trace(helper, mstate, vstate, 0);
15257
15258 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15259 goto next;
15260
15261 if (*flags & CPU_DTRACE_FAULT)
15262 goto err;
15263 }
15264
15265 for (i = 0; i < helper->dtha_nactions; i++) {
15266 if (trace)
15267 dtrace_helper_trace(helper,
15268 mstate, vstate, i + 1);
15269
15270 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15271 mstate, vstate, state);
15272
15273 if (*flags & CPU_DTRACE_FAULT)
15274 goto err;
15275 }
15276
15277 next:
15278 if (trace)
15279 dtrace_helper_trace(helper, mstate, vstate,
15280 DTRACE_HELPTRACE_NEXT);
15281 }
15282
15283 if (trace)
15284 dtrace_helper_trace(helper, mstate, vstate,
15285 DTRACE_HELPTRACE_DONE);
15286
15287 /*
15288 * Restore the arg0 that we saved upon entry.
15289 */
15290 mstate->dtms_arg[0] = sarg0;
15291 mstate->dtms_arg[1] = sarg1;
15292
15293 return (rval);
15294
15295 err:
15296 if (trace)
15297 dtrace_helper_trace(helper, mstate, vstate,
15298 DTRACE_HELPTRACE_ERR);
15299
15300 /*
15301 * Restore the arg0 that we saved upon entry.
15302 */
15303 mstate->dtms_arg[0] = sarg0;
15304 mstate->dtms_arg[1] = sarg1;
15305
15306 return (0);
15307 }
15308
15309 static void
dtrace_helper_action_destroy(dtrace_helper_action_t * helper,dtrace_vstate_t * vstate)15310 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15311 dtrace_vstate_t *vstate)
15312 {
15313 int i;
15314
15315 if (helper->dtha_predicate != NULL)
15316 dtrace_difo_release(helper->dtha_predicate, vstate);
15317
15318 for (i = 0; i < helper->dtha_nactions; i++) {
15319 ASSERT(helper->dtha_actions[i] != NULL);
15320 dtrace_difo_release(helper->dtha_actions[i], vstate);
15321 }
15322
15323 kmem_free(helper->dtha_actions,
15324 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15325 kmem_free(helper, sizeof (dtrace_helper_action_t));
15326 }
15327
15328 static int
dtrace_helper_destroygen(proc_t * p,int gen)15329 dtrace_helper_destroygen(proc_t* p, int gen)
15330 {
15331 dtrace_helpers_t *help = p->p_dtrace_helpers;
15332 dtrace_vstate_t *vstate;
15333 uint_t i;
15334
15335 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15336 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15337
15338 if (help == NULL || gen > help->dthps_generation)
15339 return (EINVAL);
15340
15341 vstate = &help->dthps_vstate;
15342
15343 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15344 dtrace_helper_action_t *last = NULL, *h, *next;
15345
15346 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15347 next = h->dtha_next;
15348
15349 if (h->dtha_generation == gen) {
15350 if (last != NULL) {
15351 last->dtha_next = next;
15352 } else {
15353 help->dthps_actions[i] = next;
15354 }
15355
15356 dtrace_helper_action_destroy(h, vstate);
15357 } else {
15358 last = h;
15359 }
15360 }
15361 }
15362
15363 /*
15364 * Interate until we've cleared out all helper providers with the
15365 * given generation number.
15366 */
15367 for (;;) {
15368 dtrace_helper_provider_t *prov = NULL;
15369
15370 /*
15371 * Look for a helper provider with the right generation. We
15372 * have to start back at the beginning of the list each time
15373 * because we drop dtrace_lock. It's unlikely that we'll make
15374 * more than two passes.
15375 */
15376 for (i = 0; i < help->dthps_nprovs; i++) {
15377 prov = help->dthps_provs[i];
15378
15379 if (prov->dthp_generation == gen)
15380 break;
15381 }
15382
15383 /*
15384 * If there were no matches, we're done.
15385 */
15386 if (i == help->dthps_nprovs)
15387 break;
15388
15389 /*
15390 * Move the last helper provider into this slot.
15391 */
15392 help->dthps_nprovs--;
15393 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15394 help->dthps_provs[help->dthps_nprovs] = NULL;
15395
15396 lck_mtx_unlock(&dtrace_lock);
15397
15398 /*
15399 * If we have a meta provider, remove this helper provider.
15400 */
15401 if (dtrace_meta_pid != NULL) {
15402 ASSERT(dtrace_deferred_pid == NULL);
15403 dtrace_helper_provider_remove(&prov->dthp_prov,
15404 p);
15405 }
15406
15407 dtrace_helper_provider_destroy(prov);
15408
15409 lck_mtx_lock(&dtrace_lock);
15410 }
15411
15412 return (0);
15413 }
15414
15415 static int
dtrace_helper_validate(dtrace_helper_action_t * helper)15416 dtrace_helper_validate(dtrace_helper_action_t *helper)
15417 {
15418 int err = 0, i;
15419 dtrace_difo_t *dp;
15420
15421 if ((dp = helper->dtha_predicate) != NULL)
15422 err += dtrace_difo_validate_helper(dp);
15423
15424 for (i = 0; i < helper->dtha_nactions; i++)
15425 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15426
15427 return (err == 0);
15428 }
15429
15430 static int
dtrace_helper_action_add(proc_t * p,int which,dtrace_ecbdesc_t * ep)15431 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15432 {
15433 dtrace_helpers_t *help;
15434 dtrace_helper_action_t *helper, *last;
15435 dtrace_actdesc_t *act;
15436 dtrace_vstate_t *vstate;
15437 dtrace_predicate_t *pred;
15438 int count = 0, nactions = 0, i;
15439
15440 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15441 return (EINVAL);
15442
15443 help = p->p_dtrace_helpers;
15444 last = help->dthps_actions[which];
15445 vstate = &help->dthps_vstate;
15446
15447 for (count = 0; last != NULL; last = last->dtha_next) {
15448 count++;
15449 if (last->dtha_next == NULL)
15450 break;
15451 }
15452
15453 /*
15454 * If we already have dtrace_helper_actions_max helper actions for this
15455 * helper action type, we'll refuse to add a new one.
15456 */
15457 if (count >= dtrace_helper_actions_max)
15458 return (ENOSPC);
15459
15460 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15461 helper->dtha_generation = help->dthps_generation;
15462
15463 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15464 ASSERT(pred->dtp_difo != NULL);
15465 dtrace_difo_hold(pred->dtp_difo);
15466 helper->dtha_predicate = pred->dtp_difo;
15467 }
15468
15469 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15470 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15471 goto err;
15472
15473 if (act->dtad_difo == NULL)
15474 goto err;
15475
15476 nactions++;
15477 }
15478
15479 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15480 (helper->dtha_nactions = nactions), KM_SLEEP);
15481
15482 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15483 dtrace_difo_hold(act->dtad_difo);
15484 helper->dtha_actions[i++] = act->dtad_difo;
15485 }
15486
15487 if (!dtrace_helper_validate(helper))
15488 goto err;
15489
15490 if (last == NULL) {
15491 help->dthps_actions[which] = helper;
15492 } else {
15493 last->dtha_next = helper;
15494 }
15495
15496 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15497 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15498 dtrace_helptrace_next = 0;
15499 }
15500
15501 return (0);
15502 err:
15503 dtrace_helper_action_destroy(helper, vstate);
15504 return (EINVAL);
15505 }
15506
15507 static void
dtrace_helper_provider_register(proc_t * p,dtrace_helpers_t * help,dof_helper_t * dofhp)15508 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15509 dof_helper_t *dofhp)
15510 {
15511 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15512 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15513
15514 lck_mtx_lock(&dtrace_lock);
15515
15516 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15517 /*
15518 * If the dtrace module is loaded but not attached, or if
15519 * there aren't isn't a meta provider registered to deal with
15520 * these provider descriptions, we need to postpone creating
15521 * the actual providers until later.
15522 */
15523
15524 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15525 dtrace_deferred_pid != help) {
15526 help->dthps_deferred = 1;
15527 help->dthps_pid = proc_getpid(p);
15528 help->dthps_next = dtrace_deferred_pid;
15529 help->dthps_prev = NULL;
15530 if (dtrace_deferred_pid != NULL)
15531 dtrace_deferred_pid->dthps_prev = help;
15532 dtrace_deferred_pid = help;
15533 }
15534
15535 lck_mtx_unlock(&dtrace_lock);
15536
15537 } else if (dofhp != NULL) {
15538 /*
15539 * If the dtrace module is loaded and we have a particular
15540 * helper provider description, pass that off to the
15541 * meta provider.
15542 */
15543
15544 lck_mtx_unlock(&dtrace_lock);
15545
15546 dtrace_helper_provide(dofhp, p);
15547
15548 } else {
15549 /*
15550 * Otherwise, just pass all the helper provider descriptions
15551 * off to the meta provider.
15552 */
15553
15554 uint_t i;
15555 lck_mtx_unlock(&dtrace_lock);
15556
15557 for (i = 0; i < help->dthps_nprovs; i++) {
15558 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15559 p);
15560 }
15561 }
15562 }
15563
15564 static int
dtrace_helper_provider_add(proc_t * p,dof_helper_t * dofhp,int gen)15565 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15566 {
15567 dtrace_helpers_t *help;
15568 dtrace_helper_provider_t *hprov, **tmp_provs;
15569 uint_t tmp_maxprovs, i;
15570
15571 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15572 help = p->p_dtrace_helpers;
15573 ASSERT(help != NULL);
15574
15575 /*
15576 * If we already have dtrace_helper_providers_max helper providers,
15577 * we're refuse to add a new one.
15578 */
15579 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15580 return (ENOSPC);
15581
15582 /*
15583 * Check to make sure this isn't a duplicate.
15584 */
15585 for (i = 0; i < help->dthps_nprovs; i++) {
15586 if (dofhp->dofhp_addr ==
15587 help->dthps_provs[i]->dthp_prov.dofhp_addr)
15588 return (EALREADY);
15589 }
15590
15591 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15592 hprov->dthp_prov = *dofhp;
15593 hprov->dthp_ref = 1;
15594 hprov->dthp_generation = gen;
15595
15596 /*
15597 * Allocate a bigger table for helper providers if it's already full.
15598 */
15599 if (help->dthps_maxprovs == help->dthps_nprovs) {
15600 tmp_maxprovs = help->dthps_maxprovs;
15601 tmp_provs = help->dthps_provs;
15602
15603 if (help->dthps_maxprovs == 0)
15604 help->dthps_maxprovs = 2;
15605 else
15606 help->dthps_maxprovs *= 2;
15607 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15608 help->dthps_maxprovs = dtrace_helper_providers_max;
15609
15610 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15611
15612 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15613 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15614
15615 if (tmp_provs != NULL) {
15616 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15617 sizeof (dtrace_helper_provider_t *));
15618 kmem_free(tmp_provs, tmp_maxprovs *
15619 sizeof (dtrace_helper_provider_t *));
15620 }
15621 }
15622
15623 help->dthps_provs[help->dthps_nprovs] = hprov;
15624 help->dthps_nprovs++;
15625
15626 return (0);
15627 }
15628
15629 static void
dtrace_helper_provider_destroy(dtrace_helper_provider_t * hprov)15630 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15631 {
15632 lck_mtx_lock(&dtrace_lock);
15633
15634 if (--hprov->dthp_ref == 0) {
15635 dof_hdr_t *dof;
15636 lck_mtx_unlock(&dtrace_lock);
15637 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15638 dtrace_dof_destroy(dof);
15639 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15640 } else {
15641 lck_mtx_unlock(&dtrace_lock);
15642 }
15643 }
15644
15645 static int
dtrace_helper_provider_validate(dof_hdr_t * dof,dof_sec_t * sec)15646 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15647 {
15648 uintptr_t daddr = (uintptr_t)dof;
15649 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15650 dof_provider_t *provider;
15651 dof_probe_t *probe;
15652 uint8_t *arg;
15653 char *strtab, *typestr;
15654 dof_stridx_t typeidx;
15655 size_t typesz;
15656 uint_t nprobes, j, k;
15657
15658 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15659
15660 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15661 dtrace_dof_error(dof, "misaligned section offset");
15662 return (-1);
15663 }
15664
15665 /*
15666 * The section needs to be large enough to contain the DOF provider
15667 * structure appropriate for the given version.
15668 */
15669 if (sec->dofs_size <
15670 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15671 offsetof(dof_provider_t, dofpv_prenoffs) :
15672 sizeof (dof_provider_t))) {
15673 dtrace_dof_error(dof, "provider section too small");
15674 return (-1);
15675 }
15676
15677 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15678 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15679 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15680 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15681 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15682
15683 if (str_sec == NULL || prb_sec == NULL ||
15684 arg_sec == NULL || off_sec == NULL)
15685 return (-1);
15686
15687 enoff_sec = NULL;
15688
15689 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15690 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15691 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15692 provider->dofpv_prenoffs)) == NULL)
15693 return (-1);
15694
15695 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15696
15697 if (provider->dofpv_name >= str_sec->dofs_size ||
15698 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15699 dtrace_dof_error(dof, "invalid provider name");
15700 return (-1);
15701 }
15702
15703 if (prb_sec->dofs_entsize == 0 ||
15704 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15705 dtrace_dof_error(dof, "invalid entry size");
15706 return (-1);
15707 }
15708
15709 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15710 dtrace_dof_error(dof, "misaligned entry size");
15711 return (-1);
15712 }
15713
15714 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15715 dtrace_dof_error(dof, "invalid entry size");
15716 return (-1);
15717 }
15718
15719 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15720 dtrace_dof_error(dof, "misaligned section offset");
15721 return (-1);
15722 }
15723
15724 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15725 dtrace_dof_error(dof, "invalid entry size");
15726 return (-1);
15727 }
15728
15729 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15730
15731 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15732
15733 /*
15734 * Take a pass through the probes to check for errors.
15735 */
15736 for (j = 0; j < nprobes; j++) {
15737 probe = (dof_probe_t *)(uintptr_t)(daddr +
15738 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15739
15740 if (probe->dofpr_func >= str_sec->dofs_size) {
15741 dtrace_dof_error(dof, "invalid function name");
15742 return (-1);
15743 }
15744
15745 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15746 dtrace_dof_error(dof, "function name too long");
15747 return (-1);
15748 }
15749
15750 if (probe->dofpr_name >= str_sec->dofs_size ||
15751 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15752 dtrace_dof_error(dof, "invalid probe name");
15753 return (-1);
15754 }
15755
15756 /*
15757 * The offset count must not wrap the index, and the offsets
15758 * must also not overflow the section's data.
15759 */
15760 if (probe->dofpr_offidx + probe->dofpr_noffs <
15761 probe->dofpr_offidx ||
15762 (probe->dofpr_offidx + probe->dofpr_noffs) *
15763 off_sec->dofs_entsize > off_sec->dofs_size) {
15764 dtrace_dof_error(dof, "invalid probe offset");
15765 return (-1);
15766 }
15767
15768 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15769 /*
15770 * If there's no is-enabled offset section, make sure
15771 * there aren't any is-enabled offsets. Otherwise
15772 * perform the same checks as for probe offsets
15773 * (immediately above).
15774 */
15775 if (enoff_sec == NULL) {
15776 if (probe->dofpr_enoffidx != 0 ||
15777 probe->dofpr_nenoffs != 0) {
15778 dtrace_dof_error(dof, "is-enabled "
15779 "offsets with null section");
15780 return (-1);
15781 }
15782 } else if (probe->dofpr_enoffidx +
15783 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15784 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15785 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15786 dtrace_dof_error(dof, "invalid is-enabled "
15787 "offset");
15788 return (-1);
15789 }
15790
15791 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15792 dtrace_dof_error(dof, "zero probe and "
15793 "is-enabled offsets");
15794 return (-1);
15795 }
15796 } else if (probe->dofpr_noffs == 0) {
15797 dtrace_dof_error(dof, "zero probe offsets");
15798 return (-1);
15799 }
15800
15801 if (probe->dofpr_argidx + probe->dofpr_xargc <
15802 probe->dofpr_argidx ||
15803 (probe->dofpr_argidx + probe->dofpr_xargc) *
15804 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15805 dtrace_dof_error(dof, "invalid args");
15806 return (-1);
15807 }
15808
15809 typeidx = probe->dofpr_nargv;
15810 typestr = strtab + probe->dofpr_nargv;
15811 for (k = 0; k < probe->dofpr_nargc; k++) {
15812 if (typeidx >= str_sec->dofs_size) {
15813 dtrace_dof_error(dof, "bad "
15814 "native argument type");
15815 return (-1);
15816 }
15817
15818 typesz = strlen(typestr) + 1;
15819 if (typesz > DTRACE_ARGTYPELEN) {
15820 dtrace_dof_error(dof, "native "
15821 "argument type too long");
15822 return (-1);
15823 }
15824 typeidx += typesz;
15825 typestr += typesz;
15826 }
15827
15828 typeidx = probe->dofpr_xargv;
15829 typestr = strtab + probe->dofpr_xargv;
15830 for (k = 0; k < probe->dofpr_xargc; k++) {
15831 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15832 dtrace_dof_error(dof, "bad "
15833 "native argument index");
15834 return (-1);
15835 }
15836
15837 if (typeidx >= str_sec->dofs_size) {
15838 dtrace_dof_error(dof, "bad "
15839 "translated argument type");
15840 return (-1);
15841 }
15842
15843 typesz = strlen(typestr) + 1;
15844 if (typesz > DTRACE_ARGTYPELEN) {
15845 dtrace_dof_error(dof, "translated argument "
15846 "type too long");
15847 return (-1);
15848 }
15849
15850 typeidx += typesz;
15851 typestr += typesz;
15852 }
15853 }
15854
15855 return (0);
15856 }
15857
15858 static int
dtrace_helper_slurp(proc_t * p,dof_hdr_t * dof,dof_helper_t * dhp)15859 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15860 {
15861 dtrace_helpers_t *help;
15862 dtrace_vstate_t *vstate;
15863 dtrace_enabling_t *enab = NULL;
15864 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15865 uintptr_t daddr = (uintptr_t)dof;
15866
15867 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15868 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15869
15870 if ((help = p->p_dtrace_helpers) == NULL)
15871 help = dtrace_helpers_create(p);
15872
15873 vstate = &help->dthps_vstate;
15874
15875 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15876 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15877 dtrace_dof_destroy(dof);
15878 return (rv);
15879 }
15880
15881 /*
15882 * Look for helper providers and validate their descriptions.
15883 */
15884 if (dhp != NULL) {
15885 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15886 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15887 dof->dofh_secoff + i * dof->dofh_secsize);
15888
15889 if (sec->dofs_type != DOF_SECT_PROVIDER)
15890 continue;
15891
15892 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15893 dtrace_enabling_destroy(enab);
15894 dtrace_dof_destroy(dof);
15895 return (-1);
15896 }
15897
15898 nprovs++;
15899 }
15900 }
15901
15902 /*
15903 * Now we need to walk through the ECB descriptions in the enabling.
15904 */
15905 for (i = 0; i < enab->dten_ndesc; i++) {
15906 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15907 dtrace_probedesc_t *desc = &ep->dted_probe;
15908
15909 /* APPLE NOTE: Darwin employs size bounded string operation. */
15910 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15911 continue;
15912
15913 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15914 continue;
15915
15916 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15917 continue;
15918
15919 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15920 ep)) != 0) {
15921 /*
15922 * Adding this helper action failed -- we are now going
15923 * to rip out the entire generation and return failure.
15924 */
15925 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15926 dtrace_enabling_destroy(enab);
15927 dtrace_dof_destroy(dof);
15928 return (-1);
15929 }
15930
15931 nhelpers++;
15932 }
15933
15934 if (nhelpers < enab->dten_ndesc)
15935 dtrace_dof_error(dof, "unmatched helpers");
15936
15937 gen = help->dthps_generation++;
15938 dtrace_enabling_destroy(enab);
15939
15940 if (dhp != NULL && nprovs > 0) {
15941 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15942 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15943 lck_mtx_unlock(&dtrace_lock);
15944 dtrace_helper_provider_register(p, help, dhp);
15945 lck_mtx_lock(&dtrace_lock);
15946
15947 destroy = 0;
15948 }
15949 }
15950
15951 if (destroy)
15952 dtrace_dof_destroy(dof);
15953
15954 return (gen);
15955 }
15956
15957 /*
15958 * APPLE NOTE: DTrace lazy dof implementation
15959 *
15960 * DTrace user static probes (USDT probes) and helper actions are loaded
15961 * in a process by processing dof sections. The dof sections are passed
15962 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
15963 * expensive to process dof for a process that will never use it. There
15964 * is a memory cost (allocating the providers/probes), and a cpu cost
15965 * (creating the providers/probes).
15966 *
15967 * To reduce this cost, we use "lazy dof". The normal procedure for
15968 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
15969 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
15970 * used, each process retains the dof_ioctl_data_t block, instead of
15971 * copying in the data it points to.
15972 *
15973 * The dof_ioctl_data_t blocks are managed as if they were the actual
15974 * processed dof; on fork the block is copied to the child, on exec and
15975 * exit the block is freed.
15976 *
15977 * If the process loads library(s) containing additional dof, the
15978 * new dof_ioctl_data_t is merged with the existing block.
15979 *
15980 * There are a few catches that make this slightly more difficult.
15981 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15982 * identifier value for each dof in the block. In non-lazy dof terms,
15983 * this is the generation that dof was loaded in. If we hand back
15984 * a UID for a lazy dof, that same UID must be able to unload the
15985 * dof once it has become non-lazy. To meet this requirement, the
15986 * code that loads lazy dof requires that the UID's for dof(s) in
15987 * the lazy dof be sorted, and in ascending order. It is okay to skip
15988 * UID's, I.E., 1 -> 5 -> 6 is legal.
15989 *
15990 * Once a process has become non-lazy, it will stay non-lazy. All
15991 * future dof operations for that process will be non-lazy, even
15992 * if the dof mode transitions back to lazy.
15993 *
15994 * Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15995 * That way if the lazy check fails due to transitioning to non-lazy, the
15996 * right thing is done with the newly faulted in dof.
15997 */
15998
15999 /*
16000 * This method is a bit squicky. It must handle:
16001 *
16002 * dof should not be lazy.
16003 * dof should have been handled lazily, but there was an error
16004 * dof was handled lazily, and needs to be freed.
16005 * dof was handled lazily, and must not be freed.
16006 *
16007 *
16008 * Returns EACCESS if dof should be handled non-lazily.
16009 *
16010 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
16011 *
16012 * If the dofs data is claimed by this method, dofs_claimed will be set.
16013 * Callers should not free claimed dofs.
16014 */
16015 static int
dtrace_lazy_dofs_add(proc_t * p,dof_ioctl_data_t * incoming_dofs,int * dofs_claimed)16016 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
16017 {
16018 ASSERT(p);
16019 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
16020
16021 int rval = 0;
16022 *dofs_claimed = 0;
16023
16024 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16025
16026 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16027 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16028
16029 /*
16030 * Any existing helpers force non-lazy behavior.
16031 */
16032 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16033 dtrace_sprlock(p);
16034
16035 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16036 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
16037 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
16038
16039 /*
16040 * Range check...
16041 */
16042 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
16043 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
16044 rval = EINVAL;
16045 goto unlock;
16046 }
16047
16048 /*
16049 * Each dof being added must be assigned a unique generation.
16050 */
16051 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
16052 for (i=0; i<incoming_dofs->dofiod_count; i++) {
16053 /*
16054 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
16055 */
16056 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
16057 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
16058 }
16059
16060
16061 if (existing_dofs) {
16062 /*
16063 * Merge the existing and incoming dofs
16064 */
16065 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
16066 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
16067
16068 bcopy(&existing_dofs->dofiod_helpers[0],
16069 &merged_dofs->dofiod_helpers[0],
16070 sizeof(dof_helper_t) * existing_dofs_count);
16071 bcopy(&incoming_dofs->dofiod_helpers[0],
16072 &merged_dofs->dofiod_helpers[existing_dofs_count],
16073 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
16074
16075 merged_dofs->dofiod_count = merged_dofs_count;
16076
16077 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16078
16079 p->p_dtrace_lazy_dofs = merged_dofs;
16080 } else {
16081 /*
16082 * Claim the incoming dofs
16083 */
16084 *dofs_claimed = 1;
16085 p->p_dtrace_lazy_dofs = incoming_dofs;
16086 }
16087
16088 #if DEBUG
16089 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16090 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16091 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16092 }
16093 #endif /* DEBUG */
16094
16095 unlock:
16096 dtrace_sprunlock(p);
16097 } else {
16098 rval = EACCES;
16099 }
16100
16101 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16102
16103 return rval;
16104 }
16105
16106 /*
16107 * Returns:
16108 *
16109 * EINVAL: lazy dof is enabled, but the requested generation was not found.
16110 * EACCES: This removal needs to be handled non-lazily.
16111 */
16112 static int
dtrace_lazy_dofs_remove(proc_t * p,int generation)16113 dtrace_lazy_dofs_remove(proc_t *p, int generation)
16114 {
16115 int rval = EINVAL;
16116
16117 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16118
16119 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16120 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16121
16122 /*
16123 * Any existing helpers force non-lazy behavior.
16124 */
16125 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16126 dtrace_sprlock(p);
16127
16128 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16129
16130 if (existing_dofs) {
16131 int index, existing_dofs_count = existing_dofs->dofiod_count;
16132 for (index=0; index<existing_dofs_count; index++) {
16133 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
16134 dof_ioctl_data_t* removed_dofs = NULL;
16135
16136 /*
16137 * If there is only 1 dof, we'll delete it and swap in NULL.
16138 */
16139 if (existing_dofs_count > 1) {
16140 int removed_dofs_count = existing_dofs_count - 1;
16141 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
16142
16143 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
16144 removed_dofs->dofiod_count = removed_dofs_count;
16145
16146 /*
16147 * copy the remaining data.
16148 */
16149 if (index > 0) {
16150 bcopy(&existing_dofs->dofiod_helpers[0],
16151 &removed_dofs->dofiod_helpers[0],
16152 index * sizeof(dof_helper_t));
16153 }
16154
16155 if (index < existing_dofs_count-1) {
16156 bcopy(&existing_dofs->dofiod_helpers[index+1],
16157 &removed_dofs->dofiod_helpers[index],
16158 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
16159 }
16160 }
16161
16162 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16163
16164 p->p_dtrace_lazy_dofs = removed_dofs;
16165
16166 rval = KERN_SUCCESS;
16167
16168 break;
16169 }
16170 }
16171
16172 #if DEBUG
16173 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16174 if (all_dofs) {
16175 unsigned int i;
16176 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16177 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16178 }
16179 }
16180 #endif
16181
16182 }
16183 dtrace_sprunlock(p);
16184 } else {
16185 rval = EACCES;
16186 }
16187
16188 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16189
16190 return rval;
16191 }
16192
16193 void
dtrace_lazy_dofs_destroy(proc_t * p)16194 dtrace_lazy_dofs_destroy(proc_t *p)
16195 {
16196 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16197 dtrace_sprlock(p);
16198
16199 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16200
16201 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16202 p->p_dtrace_lazy_dofs = NULL;
16203
16204 dtrace_sprunlock(p);
16205 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16206
16207 if (lazy_dofs) {
16208 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16209 }
16210 }
16211
16212 static int
dtrace_lazy_dofs_proc_iterate_filter(proc_t * p,void * ignored)16213 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16214 {
16215 #pragma unused(ignored)
16216 /*
16217 * Okay to NULL test without taking the sprlock.
16218 */
16219 return p->p_dtrace_lazy_dofs != NULL;
16220 }
16221
16222 static void
dtrace_lazy_dofs_process(proc_t * p)16223 dtrace_lazy_dofs_process(proc_t *p) {
16224 /*
16225 * It is possible this process may exit during our attempt to
16226 * fault in the dof. We could fix this by holding locks longer,
16227 * but the errors are benign.
16228 */
16229 dtrace_sprlock(p);
16230
16231
16232 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16233 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16234
16235 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16236 p->p_dtrace_lazy_dofs = NULL;
16237
16238 dtrace_sprunlock(p);
16239 lck_mtx_lock(&dtrace_meta_lock);
16240 /*
16241 * Process each dof_helper_t
16242 */
16243 if (lazy_dofs != NULL) {
16244 unsigned int i;
16245 int rval;
16246
16247 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16248 /*
16249 * When loading lazy dof, we depend on the generations being sorted in ascending order.
16250 */
16251 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16252
16253 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16254
16255 /*
16256 * We stored the generation in dofhp_dof. Save it, and restore the original value.
16257 */
16258 int generation = dhp->dofhp_dof;
16259 dhp->dofhp_dof = dhp->dofhp_addr;
16260
16261 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16262
16263 if (dof != NULL) {
16264 dtrace_helpers_t *help;
16265
16266 lck_mtx_lock(&dtrace_lock);
16267
16268 /*
16269 * This must be done with the dtrace_lock held
16270 */
16271 if ((help = p->p_dtrace_helpers) == NULL)
16272 help = dtrace_helpers_create(p);
16273
16274 /*
16275 * If the generation value has been bumped, someone snuck in
16276 * when we released the dtrace lock. We have to dump this generation,
16277 * there is no safe way to load it.
16278 */
16279 if (help->dthps_generation <= generation) {
16280 help->dthps_generation = generation;
16281
16282 /*
16283 * dtrace_helper_slurp() takes responsibility for the dof --
16284 * it may free it now or it may save it and free it later.
16285 */
16286 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16287 dtrace_dof_error(NULL, "returned value did not match expected generation");
16288 }
16289 }
16290
16291 lck_mtx_unlock(&dtrace_lock);
16292 }
16293 }
16294 lck_mtx_unlock(&dtrace_meta_lock);
16295 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16296 } else {
16297 lck_mtx_unlock(&dtrace_meta_lock);
16298 }
16299 }
16300
16301 static int
dtrace_lazy_dofs_proc_iterate_doit(proc_t * p,void * ignored)16302 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16303 {
16304 #pragma unused(ignored)
16305
16306 dtrace_lazy_dofs_process(p);
16307
16308 return PROC_RETURNED;
16309 }
16310
16311 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16312
16313 static int
dtrace_lazy_dofs_duplicate(proc_t * parent,proc_t * child)16314 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16315 {
16316 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16317 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16318 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16319
16320 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16321 dtrace_sprlock(parent);
16322
16323 /*
16324 * We need to make sure that the transition to lazy dofs -> helpers
16325 * was atomic for our parent
16326 */
16327 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16328 /*
16329 * In theory we should hold the child sprlock, but this is safe...
16330 */
16331 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16332
16333 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16334 dof_ioctl_data_t* child_dofs = NULL;
16335 if (parent_dofs) {
16336 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16337 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16338 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16339 }
16340
16341 dtrace_sprunlock(parent);
16342
16343 if (child_dofs) {
16344 dtrace_sprlock(child);
16345 child->p_dtrace_lazy_dofs = child_dofs;
16346 dtrace_sprunlock(child);
16347 /**
16348 * We process the DOF at this point if the mode is set to
16349 * LAZY_OFF. This can happen if DTrace is still processing the
16350 * DOF of other process (which can happen because the
16351 * protected pager can have a huge latency)
16352 * but has not processed our parent yet
16353 */
16354 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16355 dtrace_lazy_dofs_process(child);
16356 }
16357 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16358
16359 return DTRACE_LAZY_DOFS_DUPLICATED;
16360 }
16361 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16362
16363 return 0;
16364 }
16365
16366 static dtrace_helpers_t *
dtrace_helpers_create(proc_t * p)16367 dtrace_helpers_create(proc_t *p)
16368 {
16369 dtrace_helpers_t *help;
16370
16371 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16372 ASSERT(p->p_dtrace_helpers == NULL);
16373
16374 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16375 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16376 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16377
16378 p->p_dtrace_helpers = help;
16379 dtrace_helpers++;
16380
16381 return (help);
16382 }
16383
16384 static void
dtrace_helpers_destroy(proc_t * p)16385 dtrace_helpers_destroy(proc_t* p)
16386 {
16387 dtrace_helpers_t *help;
16388 dtrace_vstate_t *vstate;
16389 uint_t i;
16390
16391 lck_mtx_lock(&dtrace_meta_lock);
16392 lck_mtx_lock(&dtrace_lock);
16393
16394 ASSERT(p->p_dtrace_helpers != NULL);
16395 ASSERT(dtrace_helpers > 0);
16396
16397 help = p->p_dtrace_helpers;
16398 vstate = &help->dthps_vstate;
16399
16400 /*
16401 * We're now going to lose the help from this process.
16402 */
16403 p->p_dtrace_helpers = NULL;
16404 dtrace_sync();
16405
16406 /*
16407 * Destory the helper actions.
16408 */
16409 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16410 dtrace_helper_action_t *h, *next;
16411
16412 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16413 next = h->dtha_next;
16414 dtrace_helper_action_destroy(h, vstate);
16415 h = next;
16416 }
16417 }
16418
16419 lck_mtx_unlock(&dtrace_lock);
16420
16421 /*
16422 * Destroy the helper providers.
16423 */
16424 if (help->dthps_maxprovs > 0) {
16425 if (dtrace_meta_pid != NULL) {
16426 ASSERT(dtrace_deferred_pid == NULL);
16427
16428 for (i = 0; i < help->dthps_nprovs; i++) {
16429 dtrace_helper_provider_remove(
16430 &help->dthps_provs[i]->dthp_prov, p);
16431 }
16432 } else {
16433 lck_mtx_lock(&dtrace_lock);
16434 ASSERT(help->dthps_deferred == 0 ||
16435 help->dthps_next != NULL ||
16436 help->dthps_prev != NULL ||
16437 help == dtrace_deferred_pid);
16438
16439 /*
16440 * Remove the helper from the deferred list.
16441 */
16442 if (help->dthps_next != NULL)
16443 help->dthps_next->dthps_prev = help->dthps_prev;
16444 if (help->dthps_prev != NULL)
16445 help->dthps_prev->dthps_next = help->dthps_next;
16446 if (dtrace_deferred_pid == help) {
16447 dtrace_deferred_pid = help->dthps_next;
16448 ASSERT(help->dthps_prev == NULL);
16449 }
16450
16451 lck_mtx_unlock(&dtrace_lock);
16452 }
16453
16454
16455 for (i = 0; i < help->dthps_nprovs; i++) {
16456 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16457 }
16458
16459 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16460 sizeof (dtrace_helper_provider_t *));
16461 }
16462
16463 lck_mtx_lock(&dtrace_lock);
16464
16465 dtrace_vstate_fini(&help->dthps_vstate);
16466 kmem_free(help->dthps_actions,
16467 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16468 kmem_free(help, sizeof (dtrace_helpers_t));
16469
16470 --dtrace_helpers;
16471 lck_mtx_unlock(&dtrace_lock);
16472 lck_mtx_unlock(&dtrace_meta_lock);
16473 }
16474
16475 static void
dtrace_helpers_duplicate(proc_t * from,proc_t * to)16476 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16477 {
16478 dtrace_helpers_t *help, *newhelp;
16479 dtrace_helper_action_t *helper, *new, *last;
16480 dtrace_difo_t *dp;
16481 dtrace_vstate_t *vstate;
16482 uint_t i;
16483 int j, sz, hasprovs = 0;
16484
16485 lck_mtx_lock(&dtrace_meta_lock);
16486 lck_mtx_lock(&dtrace_lock);
16487 ASSERT(from->p_dtrace_helpers != NULL);
16488 ASSERT(dtrace_helpers > 0);
16489
16490 help = from->p_dtrace_helpers;
16491 newhelp = dtrace_helpers_create(to);
16492 ASSERT(to->p_dtrace_helpers != NULL);
16493
16494 newhelp->dthps_generation = help->dthps_generation;
16495 vstate = &newhelp->dthps_vstate;
16496
16497 /*
16498 * Duplicate the helper actions.
16499 */
16500 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16501 if ((helper = help->dthps_actions[i]) == NULL)
16502 continue;
16503
16504 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16505 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16506 KM_SLEEP);
16507 new->dtha_generation = helper->dtha_generation;
16508
16509 if ((dp = helper->dtha_predicate) != NULL) {
16510 dp = dtrace_difo_duplicate(dp, vstate);
16511 new->dtha_predicate = dp;
16512 }
16513
16514 new->dtha_nactions = helper->dtha_nactions;
16515 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16516 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16517
16518 for (j = 0; j < new->dtha_nactions; j++) {
16519 dtrace_difo_t *dpj = helper->dtha_actions[j];
16520
16521 ASSERT(dpj != NULL);
16522 dpj = dtrace_difo_duplicate(dpj, vstate);
16523 new->dtha_actions[j] = dpj;
16524 }
16525
16526 if (last != NULL) {
16527 last->dtha_next = new;
16528 } else {
16529 newhelp->dthps_actions[i] = new;
16530 }
16531
16532 last = new;
16533 }
16534 }
16535
16536 /*
16537 * Duplicate the helper providers and register them with the
16538 * DTrace framework.
16539 */
16540 if (help->dthps_nprovs > 0) {
16541 newhelp->dthps_nprovs = help->dthps_nprovs;
16542 newhelp->dthps_maxprovs = help->dthps_nprovs;
16543 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16544 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16545 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16546 newhelp->dthps_provs[i] = help->dthps_provs[i];
16547 newhelp->dthps_provs[i]->dthp_ref++;
16548 }
16549
16550 hasprovs = 1;
16551 }
16552
16553 lck_mtx_unlock(&dtrace_lock);
16554
16555 if (hasprovs)
16556 dtrace_helper_provider_register(to, newhelp, NULL);
16557
16558 lck_mtx_unlock(&dtrace_meta_lock);
16559 }
16560
16561 /**
16562 * DTrace Process functions
16563 */
16564
16565 void
dtrace_proc_fork(proc_t * parent_proc,proc_t * child_proc,int spawn)16566 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16567 {
16568 /*
16569 * This code applies to new processes who are copying the task
16570 * and thread state and address spaces of their parent process.
16571 */
16572 if (!spawn) {
16573 /*
16574 * APPLE NOTE: Solaris does a sprlock() and drops the
16575 * proc_lock here. We're cheating a bit and only taking
16576 * the p_dtrace_sprlock lock. A full sprlock would
16577 * task_suspend the parent.
16578 */
16579 dtrace_sprlock(parent_proc);
16580
16581 /*
16582 * Remove all DTrace tracepoints from the child process. We
16583 * need to do this _before_ duplicating USDT providers since
16584 * any associated probes may be immediately enabled.
16585 */
16586 if (parent_proc->p_dtrace_count > 0) {
16587 dtrace_fasttrap_fork(parent_proc, child_proc);
16588 }
16589
16590 dtrace_sprunlock(parent_proc);
16591
16592 /*
16593 * Duplicate any lazy dof(s). This must be done while NOT
16594 * holding the parent sprlock! Lock ordering is
16595 * dtrace_dof_mode_lock, then sprlock. It is imperative we
16596 * always call dtrace_lazy_dofs_duplicate, rather than null
16597 * check and call if !NULL. If we NULL test, during lazy dof
16598 * faulting we can race with the faulting code and proceed
16599 * from here to beyond the helpers copy. The lazy dof
16600 * faulting will then fail to copy the helpers to the child
16601 * process. We return if we duplicated lazy dofs as a process
16602 * can only have one at the same time to avoid a race between
16603 * a dtrace client and dtrace_proc_fork where a process would
16604 * end up with both lazy dofs and helpers.
16605 */
16606 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16607 return;
16608 }
16609
16610 /*
16611 * Duplicate any helper actions and providers if they haven't
16612 * already.
16613 */
16614 #if !defined(__APPLE__)
16615 /*
16616 * The SFORKING
16617 * we set above informs the code to enable USDT probes that
16618 * sprlock() may fail because the child is being forked.
16619 */
16620 #endif
16621 /*
16622 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16623 * never fails to find the child. We do not set SFORKING.
16624 */
16625 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16626 (*dtrace_helpers_fork)(parent_proc, child_proc);
16627 }
16628 }
16629 }
16630
16631 void
dtrace_proc_exec(proc_t * p)16632 dtrace_proc_exec(proc_t *p)
16633 {
16634 /*
16635 * Invalidate any predicate evaluation already cached for this thread by DTrace.
16636 * That's because we've just stored to p_comm and DTrace refers to that when it
16637 * evaluates the "execname" special variable. uid and gid may have changed as well.
16638 */
16639 dtrace_set_thread_predcache(current_thread(), 0);
16640
16641 /*
16642 * Free any outstanding lazy dof entries. It is imperative we
16643 * always call dtrace_lazy_dofs_destroy, rather than null check
16644 * and call if !NULL. If we NULL test, during lazy dof faulting
16645 * we can race with the faulting code and proceed from here to
16646 * beyond the helpers cleanup. The lazy dof faulting will then
16647 * install new helpers which no longer belong to this process!
16648 */
16649 dtrace_lazy_dofs_destroy(p);
16650
16651
16652 /*
16653 * Clean up any DTrace helpers for the process.
16654 */
16655 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16656 (*dtrace_helpers_cleanup)(p);
16657 }
16658
16659 /*
16660 * Cleanup the DTrace provider associated with this process.
16661 */
16662 proc_lock(p);
16663 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16664 (*dtrace_fasttrap_exec_ptr)(p);
16665 }
16666 proc_unlock(p);
16667 }
16668
16669 void
dtrace_proc_exit(proc_t * p)16670 dtrace_proc_exit(proc_t *p)
16671 {
16672 /*
16673 * Free any outstanding lazy dof entries. It is imperative we
16674 * always call dtrace_lazy_dofs_destroy, rather than null check
16675 * and call if !NULL. If we NULL test, during lazy dof faulting
16676 * we can race with the faulting code and proceed from here to
16677 * beyond the helpers cleanup. The lazy dof faulting will then
16678 * install new helpers which will never be cleaned up, and leak.
16679 */
16680 dtrace_lazy_dofs_destroy(p);
16681
16682 /*
16683 * Clean up any DTrace helper actions or probes for the process.
16684 */
16685 if (p->p_dtrace_helpers != NULL) {
16686 (*dtrace_helpers_cleanup)(p);
16687 }
16688
16689 /*
16690 * Clean up any DTrace probes associated with this process.
16691 */
16692 /*
16693 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16694 * call this after dtrace_helpers_cleanup()
16695 */
16696 proc_lock(p);
16697 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16698 (*dtrace_fasttrap_exit_ptr)(p);
16699 }
16700 proc_unlock(p);
16701 }
16702
16703 /*
16704 * DTrace Hook Functions
16705 */
16706
16707 /*
16708 * APPLE NOTE: dtrace_modctl_* routines for kext support.
16709 * Used to manipulate the modctl list within dtrace xnu.
16710 */
16711
16712 modctl_t *dtrace_modctl_list;
16713
16714 static void
dtrace_modctl_add(struct modctl * newctl)16715 dtrace_modctl_add(struct modctl * newctl)
16716 {
16717 struct modctl *nextp, *prevp;
16718
16719 ASSERT(newctl != NULL);
16720 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16721
16722 // Insert new module at the front of the list,
16723
16724 newctl->mod_next = dtrace_modctl_list;
16725 dtrace_modctl_list = newctl;
16726
16727 /*
16728 * If a module exists with the same name, then that module
16729 * must have been unloaded with enabled probes. We will move
16730 * the unloaded module to the new module's stale chain and
16731 * then stop traversing the list.
16732 */
16733
16734 prevp = newctl;
16735 nextp = newctl->mod_next;
16736
16737 while (nextp != NULL) {
16738 if (nextp->mod_loaded) {
16739 /* This is a loaded module. Keep traversing. */
16740 prevp = nextp;
16741 nextp = nextp->mod_next;
16742 continue;
16743 }
16744 else {
16745 /* Found an unloaded module */
16746 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16747 /* Names don't match. Keep traversing. */
16748 prevp = nextp;
16749 nextp = nextp->mod_next;
16750 continue;
16751 }
16752 else {
16753 /* We found a stale entry, move it. We're done. */
16754 prevp->mod_next = nextp->mod_next;
16755 newctl->mod_stale = nextp;
16756 nextp->mod_next = NULL;
16757 break;
16758 }
16759 }
16760 }
16761 }
16762
16763 static modctl_t *
dtrace_modctl_lookup(struct kmod_info * kmod)16764 dtrace_modctl_lookup(struct kmod_info * kmod)
16765 {
16766 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16767
16768 struct modctl * ctl;
16769
16770 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16771 if (ctl->mod_id == kmod->id)
16772 return(ctl);
16773 }
16774 return (NULL);
16775 }
16776
16777 /*
16778 * This routine is called from dtrace_module_unloaded().
16779 * It removes a modctl structure and its stale chain
16780 * from the kext shadow list.
16781 */
16782 static void
dtrace_modctl_remove(struct modctl * ctl)16783 dtrace_modctl_remove(struct modctl * ctl)
16784 {
16785 ASSERT(ctl != NULL);
16786 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16787 modctl_t *prevp, *nextp, *curp;
16788
16789 // Remove stale chain first
16790 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16791 nextp = curp->mod_stale;
16792 /* There should NEVER be user symbols allocated at this point */
16793 ASSERT(curp->mod_user_symbols == NULL);
16794 kmem_free(curp, sizeof(modctl_t));
16795 }
16796
16797 prevp = NULL;
16798 curp = dtrace_modctl_list;
16799
16800 while (curp != ctl) {
16801 prevp = curp;
16802 curp = curp->mod_next;
16803 }
16804
16805 if (prevp != NULL) {
16806 prevp->mod_next = ctl->mod_next;
16807 }
16808 else {
16809 dtrace_modctl_list = ctl->mod_next;
16810 }
16811
16812 /* There should NEVER be user symbols allocated at this point */
16813 ASSERT(ctl->mod_user_symbols == NULL);
16814
16815 kmem_free (ctl, sizeof(modctl_t));
16816 }
16817
16818 /*
16819 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16820 * when the kext is loaded in memory, but before calling the
16821 * kext's start routine.
16822 *
16823 * Return 0 on success
16824 * Return -1 on failure
16825 */
16826
16827 static int
dtrace_module_loaded(struct kmod_info * kmod,uint32_t flag)16828 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16829 {
16830 dtrace_provider_t *prv;
16831
16832 /*
16833 * If kernel symbols have been disabled, return immediately
16834 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16835 */
16836 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16837 return 0;
16838
16839 #if CONFIG_SPTM
16840 /* Opt-out the SPTM/TXM fake kexts from being loaded by DTrace. */
16841 extern kmod_info_t g_sptm_kmod_info, g_txm_kmod_info;
16842 if ((kmod == &g_sptm_kmod_info) || (kmod == &g_txm_kmod_info)) {
16843 return 0;
16844 }
16845 #endif
16846
16847 struct modctl *ctl = NULL;
16848 if (!kmod || kmod->address == 0 || kmod->size == 0)
16849 return(-1);
16850
16851 lck_mtx_lock(&dtrace_provider_lock);
16852 lck_mtx_lock(&mod_lock);
16853
16854 /*
16855 * Have we seen this kext before?
16856 */
16857
16858 ctl = dtrace_modctl_lookup(kmod);
16859
16860 if (ctl != NULL) {
16861 /* bail... we already have this kext in the modctl list */
16862 lck_mtx_unlock(&mod_lock);
16863 lck_mtx_unlock(&dtrace_provider_lock);
16864 if (dtrace_err_verbose)
16865 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16866 return(-1);
16867 }
16868 else {
16869 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16870 if (ctl == NULL) {
16871 if (dtrace_err_verbose)
16872 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16873 lck_mtx_unlock(&mod_lock);
16874 lck_mtx_unlock(&dtrace_provider_lock);
16875 return (-1);
16876 }
16877 ctl->mod_next = NULL;
16878 ctl->mod_stale = NULL;
16879 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16880 ctl->mod_loadcnt = kmod->id;
16881 ctl->mod_nenabled = 0;
16882 ctl->mod_address = kmod->address;
16883 ctl->mod_size = kmod->size;
16884 ctl->mod_id = kmod->id;
16885 ctl->mod_loaded = 1;
16886 ctl->mod_flags = 0;
16887 ctl->mod_user_symbols = NULL;
16888 ctl->mod_sdtprobecnt = 0;
16889 ctl->mod_sdtdesc = NULL;
16890
16891 /*
16892 * Find the UUID for this module, if it has one
16893 */
16894 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16895 struct load_command* load_cmd = (struct load_command *)&header[1];
16896 uint32_t i;
16897 for (i = 0; i < header->ncmds; i++) {
16898 if (load_cmd->cmd == LC_UUID) {
16899 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16900 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16901 ctl->mod_flags |= MODCTL_HAS_UUID;
16902 break;
16903 }
16904 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16905 }
16906
16907 if (ctl->mod_address == g_kernel_kmod_info.address) {
16908 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16909 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16910 }
16911 /*
16912 * Static kexts have a UUID that is not used for symbolication, as all their
16913 * symbols are in kernel
16914 */
16915 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16916 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16917 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16918 }
16919 }
16920 dtrace_modctl_add(ctl);
16921
16922 /*
16923 * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16924 */
16925 lck_mtx_lock(&dtrace_lock);
16926
16927 /*
16928 * DTrace must decide if it will instrument modules lazily via
16929 * userspace symbols (default mode), or instrument immediately via
16930 * kernel symbols (non-default mode)
16931 *
16932 * When in default/lazy mode, DTrace will only support modules
16933 * built with a valid UUID.
16934 *
16935 * Overriding the default can be done explicitly in one of
16936 * the following two ways.
16937 *
16938 * A module can force symbols from kernel space using the plist key,
16939 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16940 * we fall through and instrument this module now.
16941 *
16942 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16943 * from kernel space (see dtrace_impl.h). If this system state is set
16944 * to a non-userspace mode, we fall through and instrument the module now.
16945 */
16946
16947 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16948 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16949 {
16950 /* Load SDT section for module. Symbol related data will be handled lazily. */
16951 sdt_load_machsect(ctl);
16952
16953 /* We will instrument the module lazily -- this is the default */
16954 lck_mtx_unlock(&dtrace_lock);
16955 lck_mtx_unlock(&mod_lock);
16956 lck_mtx_unlock(&dtrace_provider_lock);
16957 return 0;
16958 }
16959
16960 /* We will instrument the module immediately using kernel symbols */
16961 if (!(flag & KMOD_DTRACE_NO_KERNEL_SYMS)) {
16962 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16963 }
16964
16965 /* Load SDT section for module. Symbol related data will be handled lazily. */
16966 sdt_load_machsect(ctl);
16967
16968 lck_mtx_unlock(&dtrace_lock);
16969
16970 /*
16971 * We're going to call each providers per-module provide operation
16972 * specifying only this module.
16973 */
16974 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16975 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16976
16977 /*
16978 * APPLE NOTE: The contract with the kext loader is that once this function
16979 * has completed, it may delete kernel symbols at will.
16980 * We must set this while still holding the mod_lock.
16981 */
16982 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16983
16984 lck_mtx_unlock(&mod_lock);
16985 lck_mtx_unlock(&dtrace_provider_lock);
16986
16987 /*
16988 * If we have any retained enablings, we need to match against them.
16989 * Enabling probes requires that cpu_lock be held, and we cannot hold
16990 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16991 * module. (In particular, this happens when loading scheduling
16992 * classes.) So if we have any retained enablings, we need to dispatch
16993 * our task queue to do the match for us.
16994 */
16995 lck_mtx_lock(&dtrace_lock);
16996
16997 if (dtrace_retained == NULL) {
16998 lck_mtx_unlock(&dtrace_lock);
16999 return 0;
17000 }
17001
17002 /* APPLE NOTE!
17003 *
17004 * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
17005 * holds it for any reason. Thus the comment above is invalid, we can directly invoke
17006 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
17007 * the delay call as well.
17008 */
17009 lck_mtx_unlock(&dtrace_lock);
17010
17011 dtrace_enabling_matchall();
17012
17013 return 0;
17014 }
17015
17016 /*
17017 * Return 0 on success
17018 * Return -1 on failure
17019 */
17020 static int
dtrace_module_unloaded(struct kmod_info * kmod)17021 dtrace_module_unloaded(struct kmod_info *kmod)
17022 {
17023 dtrace_probe_t template, *probe, *first, *next;
17024 dtrace_provider_t *prov;
17025 struct modctl *ctl = NULL;
17026 struct modctl *syncctl = NULL;
17027 struct modctl *nextsyncctl = NULL;
17028 int syncmode = 0;
17029
17030 lck_mtx_lock(&dtrace_provider_lock);
17031 lck_mtx_lock(&mod_lock);
17032 lck_mtx_lock(&dtrace_lock);
17033
17034 if (kmod == NULL) {
17035 syncmode = 1;
17036 }
17037 else {
17038 ctl = dtrace_modctl_lookup(kmod);
17039 if (ctl == NULL)
17040 {
17041 lck_mtx_unlock(&dtrace_lock);
17042 lck_mtx_unlock(&mod_lock);
17043 lck_mtx_unlock(&dtrace_provider_lock);
17044 return (-1);
17045 }
17046 ctl->mod_loaded = 0;
17047 ctl->mod_address = 0;
17048 ctl->mod_size = 0;
17049 }
17050
17051 if (dtrace_bymod == NULL) {
17052 /*
17053 * The DTrace module is loaded (obviously) but not attached;
17054 * we don't have any work to do.
17055 */
17056 if (ctl != NULL)
17057 (void)dtrace_modctl_remove(ctl);
17058 lck_mtx_unlock(&dtrace_lock);
17059 lck_mtx_unlock(&mod_lock);
17060 lck_mtx_unlock(&dtrace_provider_lock);
17061 return(0);
17062 }
17063
17064 /* Syncmode set means we target and traverse entire modctl list. */
17065 if (syncmode)
17066 nextsyncctl = dtrace_modctl_list;
17067
17068 syncloop:
17069 if (syncmode)
17070 {
17071 /* find a stale modctl struct */
17072 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
17073 if (syncctl->mod_address == 0)
17074 break;
17075 }
17076 if (syncctl==NULL)
17077 {
17078 /* We have no more work to do */
17079 lck_mtx_unlock(&dtrace_lock);
17080 lck_mtx_unlock(&mod_lock);
17081 lck_mtx_unlock(&dtrace_provider_lock);
17082 return(0);
17083 }
17084 else {
17085 /* keep track of next syncctl in case this one is removed */
17086 nextsyncctl = syncctl->mod_next;
17087 ctl = syncctl;
17088 }
17089 }
17090
17091 template.dtpr_mod = ctl->mod_modname;
17092
17093 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
17094 probe != NULL; probe = probe->dtpr_nextmod) {
17095 if (probe->dtpr_ecb != NULL) {
17096 /*
17097 * This shouldn't _actually_ be possible -- we're
17098 * unloading a module that has an enabled probe in it.
17099 * (It's normally up to the provider to make sure that
17100 * this can't happen.) However, because dtps_enable()
17101 * doesn't have a failure mode, there can be an
17102 * enable/unload race. Upshot: we don't want to
17103 * assert, but we're not going to disable the
17104 * probe, either.
17105 */
17106
17107
17108 if (syncmode) {
17109 /* We're syncing, let's look at next in list */
17110 goto syncloop;
17111 }
17112
17113 lck_mtx_unlock(&dtrace_lock);
17114 lck_mtx_unlock(&mod_lock);
17115 lck_mtx_unlock(&dtrace_provider_lock);
17116
17117 if (dtrace_err_verbose) {
17118 cmn_err(CE_WARN, "unloaded module '%s' had "
17119 "enabled probes", ctl->mod_modname);
17120 }
17121 return(-1);
17122 }
17123 }
17124
17125 probe = first;
17126
17127 for (first = NULL; probe != NULL; probe = next) {
17128 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
17129
17130 dtrace_probes[probe->dtpr_id - 1] = NULL;
17131 probe->dtpr_provider->dtpv_probe_count--;
17132
17133 next = probe->dtpr_nextmod;
17134 dtrace_hash_remove(dtrace_byprov, probe);
17135 dtrace_hash_remove(dtrace_bymod, probe);
17136 dtrace_hash_remove(dtrace_byfunc, probe);
17137 dtrace_hash_remove(dtrace_byname, probe);
17138
17139 if (first == NULL) {
17140 first = probe;
17141 probe->dtpr_nextmod = NULL;
17142 } else {
17143 probe->dtpr_nextmod = first;
17144 first = probe;
17145 }
17146 }
17147
17148 /*
17149 * We've removed all of the module's probes from the hash chains and
17150 * from the probe array. Now issue a dtrace_sync() to be sure that
17151 * everyone has cleared out from any probe array processing.
17152 */
17153 dtrace_sync();
17154
17155 for (probe = first; probe != NULL; probe = first) {
17156 first = probe->dtpr_nextmod;
17157 prov = probe->dtpr_provider;
17158 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
17159 probe->dtpr_arg);
17160 dtrace_strunref(probe->dtpr_mod);
17161 dtrace_strunref(probe->dtpr_func);
17162 dtrace_strunref(probe->dtpr_name);
17163 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
17164
17165 zfree(dtrace_probe_t_zone, probe);
17166 }
17167
17168 dtrace_modctl_remove(ctl);
17169
17170 if (syncmode)
17171 goto syncloop;
17172
17173 lck_mtx_unlock(&dtrace_lock);
17174 lck_mtx_unlock(&mod_lock);
17175 lck_mtx_unlock(&dtrace_provider_lock);
17176
17177 return(0);
17178 }
17179
17180 void
dtrace_suspend(void)17181 dtrace_suspend(void)
17182 {
17183 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17184 }
17185
17186 void
dtrace_resume(void)17187 dtrace_resume(void)
17188 {
17189 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17190 }
17191
17192 static int
dtrace_cpu_setup(cpu_setup_t what,processorid_t cpu)17193 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17194 {
17195 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17196 lck_mtx_lock(&dtrace_lock);
17197
17198 switch (what) {
17199 case CPU_CONFIG: {
17200 dtrace_state_t *state;
17201 dtrace_optval_t *opt, rs, c;
17202
17203 /*
17204 * For now, we only allocate a new buffer for anonymous state.
17205 */
17206 if ((state = dtrace_anon.dta_state) == NULL)
17207 break;
17208
17209 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17210 break;
17211
17212 opt = state->dts_options;
17213 c = opt[DTRACEOPT_CPU];
17214
17215 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17216 break;
17217
17218 /*
17219 * Regardless of what the actual policy is, we're going to
17220 * temporarily set our resize policy to be manual. We're
17221 * also going to temporarily set our CPU option to denote
17222 * the newly configured CPU.
17223 */
17224 rs = opt[DTRACEOPT_BUFRESIZE];
17225 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17226 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17227
17228 (void) dtrace_state_buffers(state);
17229
17230 opt[DTRACEOPT_BUFRESIZE] = rs;
17231 opt[DTRACEOPT_CPU] = c;
17232
17233 break;
17234 }
17235
17236 case CPU_UNCONFIG:
17237 /*
17238 * We don't free the buffer in the CPU_UNCONFIG case. (The
17239 * buffer will be freed when the consumer exits.)
17240 */
17241 break;
17242
17243 default:
17244 break;
17245 }
17246
17247 lck_mtx_unlock(&dtrace_lock);
17248 return (0);
17249 }
17250
17251 static void
dtrace_cpu_setup_initial(processorid_t cpu)17252 dtrace_cpu_setup_initial(processorid_t cpu)
17253 {
17254 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17255 }
17256
17257 static void
dtrace_toxrange_add(uintptr_t base,uintptr_t limit)17258 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17259 {
17260 if (dtrace_toxranges >= dtrace_toxranges_max) {
17261 int osize, nsize;
17262 dtrace_toxrange_t *range;
17263
17264 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17265
17266 if (osize == 0) {
17267 ASSERT(dtrace_toxrange == NULL);
17268 ASSERT(dtrace_toxranges_max == 0);
17269 dtrace_toxranges_max = 1;
17270 } else {
17271 dtrace_toxranges_max <<= 1;
17272 }
17273
17274 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17275 range = kmem_zalloc(nsize, KM_SLEEP);
17276
17277 if (dtrace_toxrange != NULL) {
17278 ASSERT(osize != 0);
17279 bcopy(dtrace_toxrange, range, osize);
17280 kmem_free(dtrace_toxrange, osize);
17281 }
17282
17283 dtrace_toxrange = range;
17284 }
17285
17286 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17287 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17288
17289 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17290 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17291 dtrace_toxranges++;
17292 }
17293
17294 /*
17295 * DTrace Driver Cookbook Functions
17296 */
17297 /*ARGSUSED*/
17298 static int
dtrace_attach(dev_info_t * devi)17299 dtrace_attach(dev_info_t *devi)
17300 {
17301 dtrace_provider_id_t id;
17302 dtrace_state_t *state = NULL;
17303 dtrace_enabling_t *enab;
17304
17305 lck_mtx_lock(&cpu_lock);
17306 lck_mtx_lock(&dtrace_provider_lock);
17307 lck_mtx_lock(&dtrace_lock);
17308
17309 /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
17310 dtrace_devi = devi;
17311
17312 dtrace_modload = dtrace_module_loaded;
17313 dtrace_modunload = dtrace_module_unloaded;
17314 dtrace_cpu_init = dtrace_cpu_setup_initial;
17315 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17316 dtrace_helpers_fork = dtrace_helpers_duplicate;
17317 dtrace_cpustart_init = dtrace_suspend;
17318 dtrace_cpustart_fini = dtrace_resume;
17319 dtrace_debugger_init = dtrace_suspend;
17320 dtrace_debugger_fini = dtrace_resume;
17321
17322 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17323
17324 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17325
17326 dtrace_arena = vmem_create("dtrace", (void *)1, INT32_MAX, 1,
17327 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17328
17329 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17330
17331 dtrace_nprobes = dtrace_nprobes_default;
17332 dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
17333 KM_SLEEP);
17334
17335 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
17336 0, /* unused */
17337 offsetof(dtrace_probe_t, dtpr_nextprov),
17338 offsetof(dtrace_probe_t, dtpr_prevprov));
17339
17340 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
17341 offsetof(dtrace_probe_t, dtpr_mod),
17342 offsetof(dtrace_probe_t, dtpr_nextmod),
17343 offsetof(dtrace_probe_t, dtpr_prevmod));
17344
17345 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
17346 offsetof(dtrace_probe_t, dtpr_func),
17347 offsetof(dtrace_probe_t, dtpr_nextfunc),
17348 offsetof(dtrace_probe_t, dtpr_prevfunc));
17349
17350 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
17351 offsetof(dtrace_probe_t, dtpr_name),
17352 offsetof(dtrace_probe_t, dtpr_nextname),
17353 offsetof(dtrace_probe_t, dtpr_prevname));
17354
17355 if (dtrace_retain_max < 1) {
17356 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17357 "setting to 1", dtrace_retain_max);
17358 dtrace_retain_max = 1;
17359 }
17360
17361 /*
17362 * Now discover our toxic ranges.
17363 */
17364 dtrace_toxic_ranges(dtrace_toxrange_add);
17365
17366 /*
17367 * Before we register ourselves as a provider to our own framework,
17368 * we would like to assert that dtrace_provider is NULL -- but that's
17369 * not true if we were loaded as a dependency of a DTrace provider.
17370 * Once we've registered, we can assert that dtrace_provider is our
17371 * pseudo provider.
17372 */
17373 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17374 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17375
17376 ASSERT(dtrace_provider != NULL);
17377 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17378
17379 #if defined (__x86_64__)
17380 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17381 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17382 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17383 dtrace_provider, NULL, NULL, "END", 0, NULL);
17384 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17385 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17386 #elif defined(__arm64__)
17387 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17388 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
17389 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17390 dtrace_provider, NULL, NULL, "END", 1, NULL);
17391 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17392 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
17393 #else
17394 #error Unknown Architecture
17395 #endif
17396
17397 dtrace_anon_property();
17398 lck_mtx_unlock(&cpu_lock);
17399
17400 /*
17401 * If DTrace helper tracing is enabled, we need to allocate the
17402 * trace buffer and initialize the values.
17403 */
17404 if (dtrace_helptrace_enabled) {
17405 ASSERT(dtrace_helptrace_buffer == NULL);
17406 dtrace_helptrace_buffer =
17407 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17408 dtrace_helptrace_next = 0;
17409 }
17410
17411 /*
17412 * If there are already providers, we must ask them to provide their
17413 * probes, and then match any anonymous enabling against them. Note
17414 * that there should be no other retained enablings at this time:
17415 * the only retained enablings at this time should be the anonymous
17416 * enabling.
17417 */
17418 if (dtrace_anon.dta_enabling != NULL) {
17419 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17420
17421 /*
17422 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
17423 */
17424 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17425 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17426 }
17427
17428 dtrace_enabling_provide(NULL);
17429 state = dtrace_anon.dta_state;
17430
17431 /*
17432 * We couldn't hold cpu_lock across the above call to
17433 * dtrace_enabling_provide(), but we must hold it to actually
17434 * enable the probes. We have to drop all of our locks, pick
17435 * up cpu_lock, and regain our locks before matching the
17436 * retained anonymous enabling.
17437 */
17438 lck_mtx_unlock(&dtrace_lock);
17439 lck_mtx_unlock(&dtrace_provider_lock);
17440
17441 lck_mtx_lock(&cpu_lock);
17442 lck_mtx_lock(&dtrace_provider_lock);
17443 lck_mtx_lock(&dtrace_lock);
17444
17445 if ((enab = dtrace_anon.dta_enabling) != NULL)
17446 (void) dtrace_enabling_match(enab, NULL, NULL);
17447
17448 lck_mtx_unlock(&cpu_lock);
17449 }
17450
17451 lck_mtx_unlock(&dtrace_lock);
17452 lck_mtx_unlock(&dtrace_provider_lock);
17453
17454 if (state != NULL) {
17455 /*
17456 * If we created any anonymous state, set it going now.
17457 */
17458 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17459 }
17460
17461 return (DDI_SUCCESS);
17462 }
17463
17464 /*ARGSUSED*/
17465 static int
dtrace_open(dev_t * devp,int flag,int otyp,cred_t * cred_p)17466 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17467 {
17468 #pragma unused(flag, otyp)
17469 dtrace_state_t *state;
17470 uint32_t priv;
17471 uid_t uid;
17472 zoneid_t zoneid;
17473 int rv;
17474
17475 if (minor(*devp) < 0 || minor(*devp) >= DTRACE_NCLIENTS)
17476 return (ENXIO);
17477
17478 /* APPLE: Darwin puts Helper on its own major device. */
17479
17480 /*
17481 * If no DTRACE_PRIV_* bits are set in the credential, then the
17482 * caller lacks sufficient permission to do anything with DTrace.
17483 */
17484 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17485 if (priv == DTRACE_PRIV_NONE)
17486 return (EACCES);
17487
17488 /*
17489 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17490 * It certainly can't be later than now!
17491 */
17492 fasttrap_init();
17493
17494 /*
17495 * Ask all providers to provide all their probes.
17496 */
17497 lck_mtx_lock(&dtrace_provider_lock);
17498 dtrace_probe_provide(NULL, NULL);
17499 lck_mtx_unlock(&dtrace_provider_lock);
17500
17501 lck_mtx_lock(&cpu_lock);
17502 lck_mtx_lock(&dtrace_lock);
17503 dtrace_opens++;
17504 dtrace_membar_producer();
17505
17506 #ifdef illumos
17507 /*
17508 * If the kernel debugger is active (that is, if the kernel debugger
17509 * modified text in some way), we won't allow the open.
17510 */
17511 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17512 dtrace_opens--;
17513 lck_mtx_unlock(&dtrace_lock);
17514 lck_mtx_unlock(&cpu_lock);
17515 return (EBUSY);
17516 }
17517 #endif
17518
17519 rv = dtrace_state_create(devp, cred_p, &state);
17520 lck_mtx_unlock(&cpu_lock);
17521
17522 if (rv != 0 || state == NULL) {
17523 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17524 #ifdef illumos
17525 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17526 #endif
17527 }
17528 lck_mtx_unlock(&dtrace_lock);
17529 /* propagate EAGAIN or ERESTART */
17530 return (rv);
17531 }
17532
17533 lck_mtx_unlock(&dtrace_lock);
17534
17535 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17536
17537 /*
17538 * If we are currently lazy, transition states.
17539 *
17540 * Unlike dtrace_close, we do not need to check the
17541 * value of dtrace_opens, as any positive value (and
17542 * we count as 1) means we transition states.
17543 */
17544 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17545 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17546 /*
17547 * We do not need to hold the exclusive lock while processing
17548 * DOF on processes. We do need to make sure the mode does not get
17549 * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
17550 * (which should not happen anyway since it only happens in
17551 * dtrace_close). There is no way imcomplete USDT probes can be
17552 * activate by any DTrace clients here since they all have to
17553 * call dtrace_open and be blocked on dtrace_dof_mode_lock
17554 */
17555 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17556 /*
17557 * Iterate all existing processes and load lazy dofs.
17558 */
17559 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17560 dtrace_lazy_dofs_proc_iterate_doit,
17561 NULL,
17562 dtrace_lazy_dofs_proc_iterate_filter,
17563 NULL);
17564
17565 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17566 }
17567 else {
17568 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17569 }
17570
17571
17572 /*
17573 * Update kernel symbol state.
17574 *
17575 * We must own the provider and dtrace locks.
17576 *
17577 * NOTE! It may appear there is a race by setting this value so late
17578 * after dtrace_probe_provide. However, any kext loaded after the
17579 * call to probe provide and before we set LAZY_OFF will be marked as
17580 * eligible for symbols from userspace. The same dtrace that is currently
17581 * calling dtrace_open() (this call!) will get a list of kexts needing
17582 * symbols and fill them in, thus closing the race window.
17583 *
17584 * We want to set this value only after it certain it will succeed, as
17585 * this significantly reduces the complexity of error exits.
17586 */
17587 lck_mtx_lock(&dtrace_lock);
17588 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17589 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17590 }
17591 lck_mtx_unlock(&dtrace_lock);
17592
17593 /* Suspend cluster powerdown while DTrace device is opened. */
17594 suspend_cluster_powerdown();
17595 return (0);
17596 }
17597
17598 /*ARGSUSED*/
17599 static int
dtrace_close(dev_t dev,int flag,int otyp,cred_t * cred_p)17600 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17601 {
17602 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17603 minor_t minor = getminor(dev);
17604 dtrace_state_t *state;
17605
17606 /* APPLE NOTE: Darwin puts Helper on its own major device. */
17607 state = dtrace_state_get(minor);
17608
17609 lck_mtx_lock(&cpu_lock);
17610 lck_mtx_lock(&dtrace_lock);
17611
17612 if (state->dts_anon) {
17613 /*
17614 * There is anonymous state. Destroy that first.
17615 */
17616 ASSERT(dtrace_anon.dta_state == NULL);
17617 dtrace_state_destroy(state->dts_anon);
17618 }
17619
17620 dtrace_state_destroy(state);
17621 ASSERT(dtrace_opens > 0);
17622
17623 /*
17624 * Only relinquish control of the kernel debugger interface when there
17625 * are no consumers and no anonymous enablings.
17626 */
17627 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17628 #ifdef illumos
17629 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17630 #endif
17631 }
17632
17633 lck_mtx_unlock(&dtrace_lock);
17634 lck_mtx_unlock(&cpu_lock);
17635
17636 /*
17637 * Lock ordering requires the dof mode lock be taken before
17638 * the dtrace_lock.
17639 */
17640 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17641 lck_mtx_lock(&dtrace_lock);
17642
17643 if (dtrace_opens == 0) {
17644 /*
17645 * If we are currently lazy-off, and this is the last close, transition to
17646 * lazy state.
17647 */
17648 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17649 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17650 }
17651
17652 /*
17653 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17654 */
17655 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17656 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17657 }
17658 }
17659
17660 lck_mtx_unlock(&dtrace_lock);
17661 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17662
17663 /*
17664 * Kext probes may be retained past the end of the kext's lifespan. The
17665 * probes are kept until the last reference to them has been removed.
17666 * Since closing an active dtrace context is likely to drop that last reference,
17667 * lets take a shot at cleaning out the orphaned probes now.
17668 */
17669 dtrace_module_unloaded(NULL);
17670
17671 /* State is gone so resume cluster powerdown. */
17672 resume_cluster_powerdown();
17673 return (0);
17674 }
17675
17676 /*ARGSUSED*/
17677 static int
dtrace_ioctl_helper(u_long cmd,caddr_t arg,int * rv)17678 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17679 {
17680 #pragma unused(rv)
17681 /*
17682 * Safe to check this outside the dof mode lock
17683 */
17684 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17685 return KERN_SUCCESS;
17686
17687 switch (cmd) {
17688 #if defined (__arm64__)
17689 case DTRACEHIOC_ADDDOF_U32:
17690 case DTRACEHIOC_ADDDOF_U64:
17691 #else
17692 case DTRACEHIOC_ADDDOF:
17693 #endif /* __arm64__*/
17694 {
17695 dof_helper_t *dhp = NULL;
17696 size_t dof_ioctl_data_size;
17697 dof_ioctl_data_t* multi_dof;
17698 unsigned int i;
17699 int rval = 0;
17700 user_addr_t user_address = *(user_addr_t*)arg;
17701 uint64_t dof_count;
17702 int multi_dof_claimed = 0;
17703 proc_t* p = current_proc();
17704
17705 /*
17706 * If this is a restricted process and dtrace is restricted,
17707 * do not allow DOFs to be registered
17708 */
17709 if (dtrace_is_restricted() &&
17710 !dtrace_are_restrictions_relaxed() &&
17711 !dtrace_can_attach_to_proc(current_proc())) {
17712 return (EACCES);
17713 }
17714
17715 /*
17716 * Read the number of DOF sections being passed in.
17717 */
17718 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17719 &dof_count,
17720 sizeof(dof_count))) {
17721 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17722 return (EFAULT);
17723 }
17724
17725 /*
17726 * Range check the count.
17727 */
17728 if (dof_count == 0 || dof_count > 1024) {
17729 dtrace_dof_error(NULL, "dofiod_count is not valid");
17730 return (EINVAL);
17731 }
17732
17733 /*
17734 * Allocate a correctly sized structure and copyin the data.
17735 */
17736 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17737 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17738 return (ENOMEM);
17739
17740 /* NOTE! We can no longer exit this method via return */
17741 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17742 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17743 rval = EFAULT;
17744 goto cleanup;
17745 }
17746
17747 /*
17748 * Check that the count didn't change between the first copyin and the second.
17749 */
17750 if (multi_dof->dofiod_count != dof_count) {
17751 rval = EINVAL;
17752 goto cleanup;
17753 }
17754
17755 /*
17756 * Try to process lazily first.
17757 */
17758 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17759
17760 /*
17761 * If rval is EACCES, we must be non-lazy.
17762 */
17763 if (rval == EACCES) {
17764 rval = 0;
17765 /*
17766 * Process each dof_helper_t
17767 */
17768 i = 0;
17769 do {
17770 dhp = &multi_dof->dofiod_helpers[i];
17771
17772 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17773
17774 if (dof != NULL) {
17775 lck_mtx_lock(&dtrace_meta_lock);
17776 lck_mtx_lock(&dtrace_lock);
17777
17778 /*
17779 * dtrace_helper_slurp() takes responsibility for the dof --
17780 * it may free it now or it may save it and free it later.
17781 */
17782 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17783 rval = EINVAL;
17784 }
17785
17786 lck_mtx_unlock(&dtrace_lock);
17787 lck_mtx_unlock(&dtrace_meta_lock);
17788 }
17789 } while (++i < multi_dof->dofiod_count && rval == 0);
17790 }
17791
17792 /*
17793 * We need to copyout the multi_dof struct, because it contains
17794 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17795 *
17796 * This could certainly be better optimized.
17797 */
17798 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17799 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17800 /* Don't overwrite pre-existing error code */
17801 if (rval == 0) rval = EFAULT;
17802 }
17803
17804 cleanup:
17805 /*
17806 * If we had to allocate struct memory, free it.
17807 */
17808 if (multi_dof != NULL && !multi_dof_claimed) {
17809 kmem_free(multi_dof, dof_ioctl_data_size);
17810 }
17811
17812 return rval;
17813 }
17814
17815 case DTRACEHIOC_REMOVE: {
17816 int generation = *(int*)arg;
17817 proc_t* p = current_proc();
17818
17819 /*
17820 * Try lazy first.
17821 */
17822 int rval = dtrace_lazy_dofs_remove(p, generation);
17823
17824 /*
17825 * EACCES means non-lazy
17826 */
17827 if (rval == EACCES) {
17828 lck_mtx_lock(&dtrace_meta_lock);
17829 lck_mtx_lock(&dtrace_lock);
17830 rval = dtrace_helper_destroygen(p, generation);
17831 lck_mtx_unlock(&dtrace_lock);
17832 lck_mtx_unlock(&dtrace_meta_lock);
17833 }
17834
17835 return (rval);
17836 }
17837
17838 default:
17839 break;
17840 }
17841
17842 return ENOTTY;
17843 }
17844
17845 /*ARGSUSED*/
17846 static int
dtrace_ioctl(dev_t dev,u_long cmd,user_addr_t arg,int md,cred_t * cr,int * rv)17847 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17848 {
17849 #pragma unused(md)
17850 minor_t minor = getminor(dev);
17851 dtrace_state_t *state;
17852 int rval;
17853
17854 /* Darwin puts Helper on its own major device. */
17855
17856 state = dtrace_state_get(minor);
17857
17858 if (state->dts_anon) {
17859 ASSERT(dtrace_anon.dta_state == NULL);
17860 state = state->dts_anon;
17861 }
17862
17863 switch (cmd) {
17864 case DTRACEIOC_PROVIDER: {
17865 dtrace_providerdesc_t pvd;
17866 dtrace_provider_t *pvp;
17867
17868 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17869 return (EFAULT);
17870
17871 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17872 lck_mtx_lock(&dtrace_provider_lock);
17873
17874 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17875 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17876 break;
17877 }
17878
17879 lck_mtx_unlock(&dtrace_provider_lock);
17880
17881 if (pvp == NULL)
17882 return (ESRCH);
17883
17884 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17885 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17886 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17887 return (EFAULT);
17888
17889 return (0);
17890 }
17891
17892 case DTRACEIOC_EPROBE: {
17893 dtrace_eprobedesc_t epdesc;
17894 dtrace_ecb_t *ecb;
17895 dtrace_action_t *act;
17896 void *buf;
17897 size_t size;
17898 uintptr_t dest;
17899 int nrecs;
17900
17901 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17902 return (EFAULT);
17903
17904 lck_mtx_lock(&dtrace_lock);
17905
17906 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17907 lck_mtx_unlock(&dtrace_lock);
17908 return (EINVAL);
17909 }
17910
17911 if (ecb->dte_probe == NULL) {
17912 lck_mtx_unlock(&dtrace_lock);
17913 return (EINVAL);
17914 }
17915
17916 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17917 epdesc.dtepd_uarg = ecb->dte_uarg;
17918 epdesc.dtepd_size = ecb->dte_size;
17919
17920 nrecs = epdesc.dtepd_nrecs;
17921 epdesc.dtepd_nrecs = 0;
17922 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17923 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17924 continue;
17925
17926 epdesc.dtepd_nrecs++;
17927 }
17928
17929 /*
17930 * Now that we have the size, we need to allocate a temporary
17931 * buffer in which to store the complete description. We need
17932 * the temporary buffer to be able to drop dtrace_lock()
17933 * across the copyout(), below.
17934 */
17935 size = sizeof (dtrace_eprobedesc_t) +
17936 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17937
17938 buf = kmem_alloc(size, KM_SLEEP);
17939 dest = (uintptr_t)buf;
17940
17941 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17942 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17943
17944 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17945 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17946 continue;
17947
17948 if (nrecs-- == 0)
17949 break;
17950
17951 bcopy(&act->dta_rec, (void *)dest,
17952 sizeof (dtrace_recdesc_t));
17953 dest += sizeof (dtrace_recdesc_t);
17954 }
17955
17956 lck_mtx_unlock(&dtrace_lock);
17957
17958 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17959 kmem_free(buf, size);
17960 return (EFAULT);
17961 }
17962
17963 kmem_free(buf, size);
17964 return (0);
17965 }
17966
17967 case DTRACEIOC_AGGDESC: {
17968 dtrace_aggdesc_t aggdesc;
17969 dtrace_action_t *act;
17970 dtrace_aggregation_t *agg;
17971 int nrecs;
17972 uint32_t offs;
17973 dtrace_recdesc_t *lrec;
17974 void *buf;
17975 size_t size;
17976 uintptr_t dest;
17977
17978 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17979 return (EFAULT);
17980
17981 lck_mtx_lock(&dtrace_lock);
17982
17983 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17984 lck_mtx_unlock(&dtrace_lock);
17985 return (EINVAL);
17986 }
17987
17988 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17989
17990 nrecs = aggdesc.dtagd_nrecs;
17991 aggdesc.dtagd_nrecs = 0;
17992
17993 offs = agg->dtag_base;
17994 lrec = &agg->dtag_action.dta_rec;
17995 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17996
17997 for (act = agg->dtag_first; ; act = act->dta_next) {
17998 ASSERT(act->dta_intuple ||
17999 DTRACEACT_ISAGG(act->dta_kind));
18000
18001 /*
18002 * If this action has a record size of zero, it
18003 * denotes an argument to the aggregating action.
18004 * Because the presence of this record doesn't (or
18005 * shouldn't) affect the way the data is interpreted,
18006 * we don't copy it out to save user-level the
18007 * confusion of dealing with a zero-length record.
18008 */
18009 if (act->dta_rec.dtrd_size == 0) {
18010 ASSERT(agg->dtag_hasarg);
18011 continue;
18012 }
18013
18014 aggdesc.dtagd_nrecs++;
18015
18016 if (act == &agg->dtag_action)
18017 break;
18018 }
18019
18020 /*
18021 * Now that we have the size, we need to allocate a temporary
18022 * buffer in which to store the complete description. We need
18023 * the temporary buffer to be able to drop dtrace_lock()
18024 * across the copyout(), below.
18025 */
18026 size = sizeof (dtrace_aggdesc_t) +
18027 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
18028
18029 buf = kmem_alloc(size, KM_SLEEP);
18030 dest = (uintptr_t)buf;
18031
18032 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
18033 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
18034
18035 for (act = agg->dtag_first; ; act = act->dta_next) {
18036 dtrace_recdesc_t rec = act->dta_rec;
18037
18038 /*
18039 * See the comment in the above loop for why we pass
18040 * over zero-length records.
18041 */
18042 if (rec.dtrd_size == 0) {
18043 ASSERT(agg->dtag_hasarg);
18044 continue;
18045 }
18046
18047 if (nrecs-- == 0)
18048 break;
18049
18050 rec.dtrd_offset -= offs;
18051 bcopy(&rec, (void *)dest, sizeof (rec));
18052 dest += sizeof (dtrace_recdesc_t);
18053
18054 if (act == &agg->dtag_action)
18055 break;
18056 }
18057
18058 lck_mtx_unlock(&dtrace_lock);
18059
18060 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18061 kmem_free(buf, size);
18062 return (EFAULT);
18063 }
18064
18065 kmem_free(buf, size);
18066 return (0);
18067 }
18068
18069 case DTRACEIOC_ENABLE: {
18070 dof_hdr_t *dof;
18071 dtrace_enabling_t *enab = NULL;
18072 dtrace_vstate_t *vstate;
18073 int err = 0;
18074
18075 *rv = 0;
18076
18077 /*
18078 * If a NULL argument has been passed, we take this as our
18079 * cue to reevaluate our enablings.
18080 */
18081 if (arg == 0) {
18082 dtrace_enabling_matchall();
18083
18084 return (0);
18085 }
18086
18087 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
18088 return (rval);
18089
18090 lck_mtx_lock(&cpu_lock);
18091 lck_mtx_lock(&dtrace_lock);
18092 vstate = &state->dts_vstate;
18093
18094 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18095 lck_mtx_unlock(&dtrace_lock);
18096 lck_mtx_unlock(&cpu_lock);
18097 dtrace_dof_destroy(dof);
18098 return (EBUSY);
18099 }
18100
18101 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
18102 lck_mtx_unlock(&dtrace_lock);
18103 lck_mtx_unlock(&cpu_lock);
18104 dtrace_dof_destroy(dof);
18105 return (EINVAL);
18106 }
18107
18108 if ((rval = dtrace_dof_options(dof, state)) != 0) {
18109 dtrace_enabling_destroy(enab);
18110 lck_mtx_unlock(&dtrace_lock);
18111 lck_mtx_unlock(&cpu_lock);
18112 dtrace_dof_destroy(dof);
18113 return (rval);
18114 }
18115
18116 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
18117 err = dtrace_enabling_retain(enab);
18118 } else {
18119 dtrace_enabling_destroy(enab);
18120 }
18121
18122 lck_mtx_unlock(&dtrace_lock);
18123 lck_mtx_unlock(&cpu_lock);
18124 dtrace_dof_destroy(dof);
18125
18126 return (err);
18127 }
18128
18129 case DTRACEIOC_REPLICATE: {
18130 dtrace_repldesc_t desc;
18131 dtrace_probedesc_t *match = &desc.dtrpd_match;
18132 dtrace_probedesc_t *create = &desc.dtrpd_create;
18133 int err;
18134
18135 if (copyin(arg, &desc, sizeof (desc)) != 0)
18136 return (EFAULT);
18137
18138 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18139 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18140 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18141 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18142
18143 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18144 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18145 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18146 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18147
18148 lck_mtx_lock(&dtrace_lock);
18149 err = dtrace_enabling_replicate(state, match, create);
18150 lck_mtx_unlock(&dtrace_lock);
18151
18152 return (err);
18153 }
18154
18155 case DTRACEIOC_PROBEMATCH:
18156 case DTRACEIOC_PROBES: {
18157 dtrace_probe_t *probe = NULL;
18158 dtrace_probedesc_t desc;
18159 dtrace_probekey_t pkey;
18160 dtrace_id_t i;
18161 int m = 0;
18162 uint32_t priv;
18163 uid_t uid;
18164 zoneid_t zoneid;
18165
18166 if (copyin(arg, &desc, sizeof (desc)) != 0)
18167 return (EFAULT);
18168
18169 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18170 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18171 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18172 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18173
18174 /*
18175 * Before we attempt to match this probe, we want to give
18176 * all providers the opportunity to provide it.
18177 */
18178 if (desc.dtpd_id == DTRACE_IDNONE) {
18179 lck_mtx_lock(&dtrace_provider_lock);
18180 dtrace_probe_provide(&desc, NULL);
18181 lck_mtx_unlock(&dtrace_provider_lock);
18182 desc.dtpd_id++;
18183 }
18184
18185 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18186
18187 lck_mtx_lock(&dtrace_lock);
18188
18189 if (cmd == DTRACEIOC_PROBEMATCH) {
18190 dtrace_probekey(&desc, &pkey);
18191 pkey.dtpk_id = DTRACE_IDNONE;
18192
18193 /* Quiet compiler warning */
18194 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18195 if ((probe = dtrace_probes[i - 1]) != NULL &&
18196 (m = dtrace_match_probe(probe, &pkey,
18197 priv, uid, zoneid)) != 0)
18198 break;
18199 }
18200
18201 if (m < 0) {
18202 lck_mtx_unlock(&dtrace_lock);
18203 return (EINVAL);
18204 }
18205 dtrace_probekey_release(&pkey);
18206
18207 } else {
18208 /* Quiet compiler warning */
18209 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18210 if ((probe = dtrace_probes[i - 1]) != NULL &&
18211 dtrace_match_priv(probe, priv, uid, zoneid))
18212 break;
18213 }
18214 }
18215
18216 if (probe == NULL) {
18217 lck_mtx_unlock(&dtrace_lock);
18218 return (ESRCH);
18219 }
18220
18221 dtrace_probe_description(probe, &desc);
18222 lck_mtx_unlock(&dtrace_lock);
18223
18224 if (copyout(&desc, arg, sizeof (desc)) != 0)
18225 return (EFAULT);
18226
18227 return (0);
18228 }
18229
18230 case DTRACEIOC_PROBEARG: {
18231 dtrace_argdesc_t desc;
18232 dtrace_probe_t *probe;
18233 dtrace_provider_t *prov;
18234
18235 if (copyin(arg, &desc, sizeof (desc)) != 0)
18236 return (EFAULT);
18237
18238 if (desc.dtargd_id == DTRACE_IDNONE)
18239 return (EINVAL);
18240
18241 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18242 return (EINVAL);
18243
18244 lck_mtx_lock(&dtrace_provider_lock);
18245 lck_mtx_lock(&mod_lock);
18246 lck_mtx_lock(&dtrace_lock);
18247
18248 /* Quiet compiler warning */
18249 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18250 lck_mtx_unlock(&dtrace_lock);
18251 lck_mtx_unlock(&mod_lock);
18252 lck_mtx_unlock(&dtrace_provider_lock);
18253 return (EINVAL);
18254 }
18255
18256 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18257 lck_mtx_unlock(&dtrace_lock);
18258 lck_mtx_unlock(&mod_lock);
18259 lck_mtx_unlock(&dtrace_provider_lock);
18260 return (EINVAL);
18261 }
18262
18263 lck_mtx_unlock(&dtrace_lock);
18264
18265 prov = probe->dtpr_provider;
18266
18267 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18268 /*
18269 * There isn't any typed information for this probe.
18270 * Set the argument number to DTRACE_ARGNONE.
18271 */
18272 desc.dtargd_ndx = DTRACE_ARGNONE;
18273 } else {
18274 desc.dtargd_native[0] = '\0';
18275 desc.dtargd_xlate[0] = '\0';
18276 desc.dtargd_mapping = desc.dtargd_ndx;
18277
18278 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18279 probe->dtpr_id, probe->dtpr_arg, &desc);
18280 }
18281
18282 lck_mtx_unlock(&mod_lock);
18283 lck_mtx_unlock(&dtrace_provider_lock);
18284
18285 if (copyout(&desc, arg, sizeof (desc)) != 0)
18286 return (EFAULT);
18287
18288 return (0);
18289 }
18290
18291 case DTRACEIOC_GO: {
18292 processorid_t cpuid;
18293 rval = dtrace_state_go(state, &cpuid);
18294
18295 if (rval != 0)
18296 return (rval);
18297
18298 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18299 return (EFAULT);
18300
18301 return (0);
18302 }
18303
18304 case DTRACEIOC_STOP: {
18305 processorid_t cpuid;
18306
18307 lck_mtx_lock(&dtrace_lock);
18308 rval = dtrace_state_stop(state, &cpuid);
18309 lck_mtx_unlock(&dtrace_lock);
18310
18311 if (rval != 0)
18312 return (rval);
18313
18314 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18315 return (EFAULT);
18316
18317 return (0);
18318 }
18319
18320 case DTRACEIOC_DOFGET: {
18321 dof_hdr_t hdr, *dof;
18322 uint64_t len;
18323
18324 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18325 return (EFAULT);
18326
18327 lck_mtx_lock(&dtrace_lock);
18328 dof = dtrace_dof_create(state);
18329 lck_mtx_unlock(&dtrace_lock);
18330
18331 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18332 rval = copyout(dof, arg, len);
18333 dtrace_dof_destroy(dof);
18334
18335 return (rval == 0 ? 0 : EFAULT);
18336 }
18337
18338 case DTRACEIOC_SLEEP: {
18339 int64_t time;
18340 uint64_t abstime;
18341 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18342
18343 if (copyin(arg, &time, sizeof(time)) != 0)
18344 return (EFAULT);
18345
18346 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18347 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18348
18349 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18350 if (state->dts_buf_over_limit > 0) {
18351 clear_wait(current_thread(), THREAD_INTERRUPTED);
18352 rvalue = DTRACE_WAKE_BUF_LIMIT;
18353 } else {
18354 thread_block(THREAD_CONTINUE_NULL);
18355 if (state->dts_buf_over_limit > 0) {
18356 rvalue = DTRACE_WAKE_BUF_LIMIT;
18357 }
18358 }
18359 }
18360
18361 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18362 return (EFAULT);
18363
18364 return (0);
18365 }
18366
18367 case DTRACEIOC_SIGNAL: {
18368 wakeup(state);
18369 return (0);
18370 }
18371
18372 case DTRACEIOC_AGGSNAP:
18373 case DTRACEIOC_BUFSNAP: {
18374 dtrace_bufdesc_t desc;
18375 caddr_t cached;
18376 boolean_t over_limit;
18377 dtrace_buffer_t *buf;
18378
18379 if (copyin(arg, &desc, sizeof (desc)) != 0)
18380 return (EFAULT);
18381
18382 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18383 return (EINVAL);
18384
18385 lck_mtx_lock(&dtrace_lock);
18386
18387 if (cmd == DTRACEIOC_BUFSNAP) {
18388 buf = &state->dts_buffer[desc.dtbd_cpu];
18389 } else {
18390 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18391 }
18392
18393 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18394 size_t sz = buf->dtb_offset;
18395
18396 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18397 lck_mtx_unlock(&dtrace_lock);
18398 return (EBUSY);
18399 }
18400
18401 /*
18402 * If this buffer has already been consumed, we're
18403 * going to indicate that there's nothing left here
18404 * to consume.
18405 */
18406 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18407 lck_mtx_unlock(&dtrace_lock);
18408
18409 desc.dtbd_size = 0;
18410 desc.dtbd_drops = 0;
18411 desc.dtbd_errors = 0;
18412 desc.dtbd_oldest = 0;
18413 sz = sizeof (desc);
18414
18415 if (copyout(&desc, arg, sz) != 0)
18416 return (EFAULT);
18417
18418 return (0);
18419 }
18420
18421 /*
18422 * If this is a ring buffer that has wrapped, we want
18423 * to copy the whole thing out.
18424 */
18425 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18426 dtrace_buffer_polish(buf);
18427 sz = buf->dtb_size;
18428 }
18429
18430 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18431 lck_mtx_unlock(&dtrace_lock);
18432 return (EFAULT);
18433 }
18434
18435 desc.dtbd_size = sz;
18436 desc.dtbd_drops = buf->dtb_drops;
18437 desc.dtbd_errors = buf->dtb_errors;
18438 desc.dtbd_oldest = buf->dtb_xamot_offset;
18439 desc.dtbd_timestamp = dtrace_gethrtime();
18440
18441 lck_mtx_unlock(&dtrace_lock);
18442
18443 if (copyout(&desc, arg, sizeof (desc)) != 0)
18444 return (EFAULT);
18445
18446 buf->dtb_flags |= DTRACEBUF_CONSUMED;
18447
18448 return (0);
18449 }
18450
18451 if (buf->dtb_tomax == NULL) {
18452 ASSERT(buf->dtb_xamot == NULL);
18453 lck_mtx_unlock(&dtrace_lock);
18454 return (ENOENT);
18455 }
18456
18457 cached = buf->dtb_tomax;
18458 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18459
18460 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18461
18462 dtrace_xcall(desc.dtbd_cpu,
18463 (dtrace_xcall_t)dtrace_buffer_switch, buf);
18464
18465 state->dts_errors += buf->dtb_xamot_errors;
18466
18467 /*
18468 * If the buffers did not actually switch, then the cross call
18469 * did not take place -- presumably because the given CPU is
18470 * not in the ready set. If this is the case, we'll return
18471 * ENOENT.
18472 */
18473 if (buf->dtb_tomax == cached) {
18474 ASSERT(buf->dtb_xamot != cached);
18475 lck_mtx_unlock(&dtrace_lock);
18476 return (ENOENT);
18477 }
18478
18479 ASSERT(cached == buf->dtb_xamot);
18480 /*
18481 * At this point we know the buffer have switched, so we
18482 * can decrement the over limit count if the buffer was over
18483 * its limit. The new buffer might already be over its limit
18484 * yet, but we don't care since we're guaranteed not to be
18485 * checking the buffer over limit count at this point.
18486 */
18487 if (over_limit) {
18488 uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18489 #pragma unused(old)
18490
18491 /*
18492 * Verify that we didn't underflow the value
18493 */
18494 ASSERT(old != 0);
18495 }
18496
18497 /*
18498 * We have our snapshot; now copy it out.
18499 */
18500 if (dtrace_buffer_copyout(buf->dtb_xamot,
18501 (user_addr_t)desc.dtbd_data,
18502 buf->dtb_xamot_offset) != 0) {
18503 lck_mtx_unlock(&dtrace_lock);
18504 return (EFAULT);
18505 }
18506
18507 desc.dtbd_size = buf->dtb_xamot_offset;
18508 desc.dtbd_drops = buf->dtb_xamot_drops;
18509 desc.dtbd_errors = buf->dtb_xamot_errors;
18510 desc.dtbd_oldest = 0;
18511 desc.dtbd_timestamp = buf->dtb_switched;
18512
18513 lck_mtx_unlock(&dtrace_lock);
18514
18515 /*
18516 * Finally, copy out the buffer description.
18517 */
18518 if (copyout(&desc, arg, sizeof (desc)) != 0)
18519 return (EFAULT);
18520
18521 return (0);
18522 }
18523
18524 case DTRACEIOC_CONF: {
18525 dtrace_conf_t conf;
18526
18527 bzero(&conf, sizeof (conf));
18528 conf.dtc_difversion = DIF_VERSION;
18529 conf.dtc_difintregs = DIF_DIR_NREGS;
18530 conf.dtc_diftupregs = DIF_DTR_NREGS;
18531 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18532
18533 if (copyout(&conf, arg, sizeof (conf)) != 0)
18534 return (EFAULT);
18535
18536 return (0);
18537 }
18538
18539 case DTRACEIOC_STATUS: {
18540 dtrace_status_t stat;
18541 dtrace_dstate_t *dstate;
18542 int j;
18543 uint64_t nerrs;
18544
18545 /*
18546 * See the comment in dtrace_state_deadman() for the reason
18547 * for setting dts_laststatus to INT64_MAX before setting
18548 * it to the correct value.
18549 */
18550 state->dts_laststatus = INT64_MAX;
18551 dtrace_membar_producer();
18552 state->dts_laststatus = dtrace_gethrtime();
18553
18554 bzero(&stat, sizeof (stat));
18555
18556 lck_mtx_lock(&dtrace_lock);
18557
18558 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18559 lck_mtx_unlock(&dtrace_lock);
18560 return (ENOENT);
18561 }
18562
18563 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18564 stat.dtst_exiting = 1;
18565
18566 nerrs = state->dts_errors;
18567 dstate = &state->dts_vstate.dtvs_dynvars;
18568
18569 zpercpu_foreach_cpu(i) {
18570 dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
18571
18572 stat.dtst_dyndrops += dcpu->dtdsc_drops;
18573 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18574 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18575
18576 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18577 stat.dtst_filled++;
18578
18579 nerrs += state->dts_buffer[i].dtb_errors;
18580
18581 for (j = 0; j < state->dts_nspeculations; j++) {
18582 dtrace_speculation_t *spec;
18583 dtrace_buffer_t *buf;
18584
18585 spec = &state->dts_speculations[j];
18586 buf = &spec->dtsp_buffer[i];
18587 stat.dtst_specdrops += buf->dtb_xamot_drops;
18588 }
18589 }
18590
18591 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18592 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18593 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18594 stat.dtst_dblerrors = state->dts_dblerrors;
18595 stat.dtst_killed =
18596 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18597 stat.dtst_errors = nerrs;
18598
18599 lck_mtx_unlock(&dtrace_lock);
18600
18601 if (copyout(&stat, arg, sizeof (stat)) != 0)
18602 return (EFAULT);
18603
18604 return (0);
18605 }
18606
18607 case DTRACEIOC_FORMAT: {
18608 dtrace_fmtdesc_t fmt;
18609 char *str;
18610 int len;
18611
18612 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18613 return (EFAULT);
18614
18615 lck_mtx_lock(&dtrace_lock);
18616
18617 if (fmt.dtfd_format == 0 ||
18618 fmt.dtfd_format > state->dts_nformats) {
18619 lck_mtx_unlock(&dtrace_lock);
18620 return (EINVAL);
18621 }
18622
18623 /*
18624 * Format strings are allocated contiguously and they are
18625 * never freed; if a format index is less than the number
18626 * of formats, we can assert that the format map is non-NULL
18627 * and that the format for the specified index is non-NULL.
18628 */
18629 ASSERT(state->dts_formats != NULL);
18630 str = state->dts_formats[fmt.dtfd_format - 1]->dtf_str;
18631 ASSERT(str != NULL);
18632
18633 len = strlen(str) + 1;
18634
18635 if (len > fmt.dtfd_length) {
18636 fmt.dtfd_length = len;
18637
18638 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18639 lck_mtx_unlock(&dtrace_lock);
18640 return (EINVAL);
18641 }
18642 } else {
18643 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18644 lck_mtx_unlock(&dtrace_lock);
18645 return (EINVAL);
18646 }
18647 }
18648
18649 lck_mtx_unlock(&dtrace_lock);
18650 return (0);
18651 }
18652
18653 case DTRACEIOC_MODUUIDSLIST: {
18654 size_t module_uuids_list_size;
18655 dtrace_module_uuids_list_t* uuids_list;
18656 uint64_t dtmul_count;
18657
18658 /*
18659 * Security restrictions make this operation illegal, if this is enabled DTrace
18660 * must refuse to provide any fbt probes.
18661 */
18662 if (dtrace_fbt_probes_restricted()) {
18663 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18664 return (EPERM);
18665 }
18666
18667 /*
18668 * Fail if the kernel symbol mode makes this operation illegal.
18669 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18670 * for them without holding the dtrace_lock.
18671 */
18672 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18673 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18674 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18675 return (EPERM);
18676 }
18677
18678 /*
18679 * Read the number of symbolsdesc structs being passed in.
18680 */
18681 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18682 &dtmul_count, sizeof(dtmul_count)) != 0) {
18683 cmn_err(CE_WARN, "failed to copyin dtmul_count");
18684 return (EFAULT);
18685 }
18686
18687 /*
18688 * Range check the count. More than 2k kexts is probably an error.
18689 */
18690 if (dtmul_count > 2048) {
18691 cmn_err(CE_WARN, "dtmul_count is not valid");
18692 return (EINVAL);
18693 }
18694
18695 /*
18696 * For all queries, we return EINVAL when the user specified
18697 * count does not match the actual number of modules we find
18698 * available.
18699 *
18700 * If the user specified count is zero, then this serves as a
18701 * simple query to count the available modules in need of symbols.
18702 */
18703
18704 rval = 0;
18705
18706 if (dtmul_count == 0)
18707 {
18708 lck_mtx_lock(&mod_lock);
18709 struct modctl* ctl = dtrace_modctl_list;
18710 while (ctl) {
18711 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18712 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18713 dtmul_count++;
18714 rval = EINVAL;
18715 }
18716 ctl = ctl->mod_next;
18717 }
18718 lck_mtx_unlock(&mod_lock);
18719
18720 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18721 return (EFAULT);
18722 else
18723 return (rval);
18724 }
18725
18726 /*
18727 * If we reach this point, then we have a request for full list data.
18728 * Allocate a correctly sized structure and copyin the data.
18729 */
18730 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18731 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18732 return (ENOMEM);
18733
18734 /* NOTE! We can no longer exit this method via return */
18735 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18736 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18737 rval = EFAULT;
18738 goto moduuidslist_cleanup;
18739 }
18740
18741 /*
18742 * Check that the count didn't change between the first copyin and the second.
18743 */
18744 if (uuids_list->dtmul_count != dtmul_count) {
18745 rval = EINVAL;
18746 goto moduuidslist_cleanup;
18747 }
18748
18749 /*
18750 * Build the list of UUID's that need symbols
18751 */
18752 lck_mtx_lock(&mod_lock);
18753
18754 dtmul_count = 0;
18755
18756 struct modctl* ctl = dtrace_modctl_list;
18757 while (ctl) {
18758 /*
18759 * We assume that userspace symbols will be "better" than kernel level symbols,
18760 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
18761 * are available, add user syms if the module might use them.
18762 */
18763 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18764 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18765 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18766 if (dtmul_count++ < uuids_list->dtmul_count) {
18767 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18768 }
18769 }
18770 ctl = ctl->mod_next;
18771 }
18772
18773 lck_mtx_unlock(&mod_lock);
18774
18775 if (uuids_list->dtmul_count < dtmul_count)
18776 rval = EINVAL;
18777
18778 uuids_list->dtmul_count = dtmul_count;
18779
18780 /*
18781 * Copyout the symbols list (or at least the count!)
18782 */
18783 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18784 cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
18785 rval = EFAULT;
18786 }
18787
18788 moduuidslist_cleanup:
18789 /*
18790 * If we had to allocate struct memory, free it.
18791 */
18792 if (uuids_list != NULL) {
18793 kmem_free(uuids_list, module_uuids_list_size);
18794 }
18795
18796 return rval;
18797 }
18798
18799 case DTRACEIOC_PROVMODSYMS: {
18800 size_t module_symbols_size;
18801 dtrace_module_symbols_t* module_symbols;
18802 uint64_t dtmodsyms_count;
18803
18804 /*
18805 * Security restrictions make this operation illegal, if this is enabled DTrace
18806 * must refuse to provide any fbt probes.
18807 */
18808 if (dtrace_fbt_probes_restricted()) {
18809 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18810 return (EPERM);
18811 }
18812
18813 /*
18814 * Fail if the kernel symbol mode makes this operation illegal.
18815 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18816 * for them without holding the dtrace_lock.
18817 */
18818 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18819 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18820 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18821 return (EPERM);
18822 }
18823
18824 /*
18825 * Read the number of module symbols structs being passed in.
18826 */
18827 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18828 &dtmodsyms_count, sizeof(dtmodsyms_count)) != 0) {
18829 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18830 return (EFAULT);
18831 }
18832
18833 /* Ensure that we have at least one symbol. */
18834 if (dtmodsyms_count == 0) {
18835 cmn_err(CE_WARN, "Invalid dtmodsyms_count value");
18836 return (EINVAL);
18837 }
18838
18839 /* Safely calculate size we need for copyin buffer. */
18840 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18841 if (module_symbols_size == 0 || module_symbols_size > (size_t)dtrace_copy_maxsize()) {
18842 cmn_err(CE_WARN, "Invalid module_symbols_size %ld", module_symbols_size);
18843 return (EINVAL);
18844 }
18845
18846 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18847 return (ENOMEM);
18848
18849 rval = 0;
18850
18851 /* NOTE! We can no longer exit this method via return */
18852 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18853 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18854 rval = EFAULT;
18855 goto module_symbols_cleanup;
18856 }
18857
18858 /*
18859 * Check that the count didn't change between the first copyin and the second.
18860 */
18861 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18862 rval = EINVAL;
18863 goto module_symbols_cleanup;
18864 }
18865
18866 /*
18867 * Find the modctl to add symbols to.
18868 */
18869 lck_mtx_lock(&dtrace_provider_lock);
18870 lck_mtx_lock(&mod_lock);
18871
18872 struct modctl* ctl = dtrace_modctl_list;
18873 while (ctl) {
18874 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18875 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18876 dtrace_provider_t *prv;
18877 ctl->mod_user_symbols = module_symbols;
18878
18879 /*
18880 * We're going to call each providers per-module provide operation
18881 * specifying only this module.
18882 */
18883 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18884 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18885 /*
18886 * We gave every provider a chance to provide with the user syms, go ahead and clear them
18887 */
18888 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18889 }
18890 ctl = ctl->mod_next;
18891 }
18892
18893 lck_mtx_unlock(&mod_lock);
18894 lck_mtx_unlock(&dtrace_provider_lock);
18895
18896 module_symbols_cleanup:
18897 /*
18898 * If we had to allocate struct memory, free it.
18899 */
18900 if (module_symbols != NULL) {
18901 kmem_free(module_symbols, module_symbols_size);
18902 }
18903
18904 return rval;
18905 }
18906
18907 case DTRACEIOC_PROCWAITFOR: {
18908 dtrace_procdesc_t pdesc = {
18909 .p_name = {0},
18910 .p_pid = -1
18911 };
18912
18913 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18914 goto proc_waitfor_error;
18915
18916 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18917 goto proc_waitfor_error;
18918
18919 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18920 goto proc_waitfor_error;
18921
18922 return 0;
18923
18924 proc_waitfor_error:
18925 /* The process was suspended, revert this since the client will not do it. */
18926 if (pdesc.p_pid != -1) {
18927 proc_t *proc = proc_find(pdesc.p_pid);
18928 if (proc != PROC_NULL) {
18929 task_pidresume(proc_task(proc));
18930 proc_rele(proc);
18931 }
18932 }
18933
18934 return rval;
18935 }
18936
18937 default:
18938 break;
18939 }
18940
18941 return (ENOTTY);
18942 }
18943
18944 /*
18945 * APPLE NOTE: dtrace_detach not implemented
18946 */
18947 #if !defined(__APPLE__)
18948 /*ARGSUSED*/
18949 static int
dtrace_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)18950 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18951 {
18952 dtrace_state_t *state;
18953
18954 switch (cmd) {
18955 case DDI_DETACH:
18956 break;
18957
18958 case DDI_SUSPEND:
18959 return (DDI_SUCCESS);
18960
18961 default:
18962 return (DDI_FAILURE);
18963 }
18964
18965 lck_mtx_lock(&cpu_lock);
18966 lck_mtx_lock(&dtrace_provider_lock);
18967 lck_mtx_lock(&dtrace_lock);
18968
18969 ASSERT(dtrace_opens == 0);
18970
18971 if (dtrace_helpers > 0) {
18972 lck_mtx_unlock(&dtrace_lock);
18973 lck_mtx_unlock(&dtrace_provider_lock);
18974 lck_mtx_unlock(&cpu_lock);
18975 return (DDI_FAILURE);
18976 }
18977
18978 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18979 lck_mtx_unlock(&dtrace_lock);
18980 lck_mtx_unlock(&dtrace_provider_lock);
18981 lck_mtx_unlock(&cpu_lock);
18982 return (DDI_FAILURE);
18983 }
18984
18985 dtrace_provider = NULL;
18986
18987 if ((state = dtrace_anon_grab()) != NULL) {
18988 /*
18989 * If there were ECBs on this state, the provider should
18990 * have not been allowed to detach; assert that there is
18991 * none.
18992 */
18993 ASSERT(state->dts_necbs == 0);
18994 dtrace_state_destroy(state);
18995
18996 /*
18997 * If we're being detached with anonymous state, we need to
18998 * indicate to the kernel debugger that DTrace is now inactive.
18999 */
19000 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
19001 }
19002
19003 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
19004 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
19005 dtrace_cpu_init = NULL;
19006 dtrace_helpers_cleanup = NULL;
19007 dtrace_helpers_fork = NULL;
19008 dtrace_cpustart_init = NULL;
19009 dtrace_cpustart_fini = NULL;
19010 dtrace_debugger_init = NULL;
19011 dtrace_debugger_fini = NULL;
19012 dtrace_kreloc_init = NULL;
19013 dtrace_kreloc_fini = NULL;
19014 dtrace_modload = NULL;
19015 dtrace_modunload = NULL;
19016
19017 lck_mtx_unlock(&cpu_lock);
19018
19019 if (dtrace_helptrace_enabled) {
19020 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
19021 dtrace_helptrace_buffer = NULL;
19022 }
19023
19024 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
19025 dtrace_probes = NULL;
19026 dtrace_nprobes = 0;
19027
19028 dtrace_hash_destroy(dtrace_strings);
19029 dtrace_hash_destroy(dtrace_byprov);
19030 dtrace_hash_destroy(dtrace_bymod);
19031 dtrace_hash_destroy(dtrace_byfunc);
19032 dtrace_hash_destroy(dtrace_byname);
19033 dtrace_strings = NULL;
19034 dtrace_byprov = NULL;
19035 dtrace_bymod = NULL;
19036 dtrace_byfunc = NULL;
19037 dtrace_byname = NULL;
19038
19039 kmem_cache_destroy(dtrace_state_cache);
19040 vmem_destroy(dtrace_arena);
19041
19042 if (dtrace_toxrange != NULL) {
19043 kmem_free(dtrace_toxrange,
19044 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
19045 dtrace_toxrange = NULL;
19046 dtrace_toxranges = 0;
19047 dtrace_toxranges_max = 0;
19048 }
19049
19050 ddi_remove_minor_node(dtrace_devi, NULL);
19051 dtrace_devi = NULL;
19052
19053 ddi_soft_state_fini(&dtrace_softstate);
19054
19055 ASSERT(dtrace_vtime_references == 0);
19056 ASSERT(dtrace_opens == 0);
19057 ASSERT(dtrace_retained == NULL);
19058
19059 lck_mtx_unlock(&dtrace_lock);
19060 lck_mtx_unlock(&dtrace_provider_lock);
19061
19062 #ifdef illumos
19063 /*
19064 * We don't destroy the task queue until after we have dropped our
19065 * locks (taskq_destroy() may block on running tasks). To prevent
19066 * attempting to do work after we have effectively detached but before
19067 * the task queue has been destroyed, all tasks dispatched via the
19068 * task queue must check that DTrace is still attached before
19069 * performing any operation.
19070 */
19071 taskq_destroy(dtrace_taskq);
19072 dtrace_taskq = NULL;
19073 #endif
19074
19075 return (DDI_SUCCESS);
19076 }
19077 #endif /* __APPLE__ */
19078
19079 d_open_t _dtrace_open, helper_open;
19080 d_close_t _dtrace_close, helper_close;
19081 d_ioctl_t _dtrace_ioctl, helper_ioctl;
19082
/*
 * cdevsw open entry point for the dtrace device.
 *
 * Forwards to dtrace_open() with the caller's credentials.  dtrace_open()
 * takes the device by pointer, so it is handed the address of this
 * function's own by-value copy of 'dev'.  The proc argument is not used.
 */
int
_dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
	return (dtrace_open(&dev, flags, devtype, CRED()));
}
19091
/*
 * cdevsw open entry point for the helper device.
 *
 * Opening the helper device needs no per-open work: every argument is
 * ignored and the call always succeeds with 0.
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return (0);
}
19098
/*
 * cdevsw close entry point for the dtrace device.
 *
 * Forwards to dtrace_close() with the caller's credentials and returns its
 * result.  The proc argument is not used.
 */
int
_dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
{
#pragma unused(p)
	int result = dtrace_close(dev, flags, devtype, CRED());

	return (result);
}
19105
/*
 * cdevsw close entry point for the helper device.
 *
 * There is nothing to release on close: every argument is ignored and the
 * call always succeeds with 0.
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return (0);
}
19112
19113 int
_dtrace_ioctl(dev_t dev,u_long cmd,caddr_t data,int fflag,struct proc * p)19114 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19115 {
19116 #pragma unused(p)
19117 int err, rv = 0;
19118 user_addr_t uaddrp;
19119
19120 if (proc_is64bit(p))
19121 uaddrp = *(user_addr_t *)data;
19122 else
19123 uaddrp = (user_addr_t) *(uint32_t *)data;
19124
19125 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
19126
19127 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19128 if (err != 0) {
19129 ASSERT( (err & 0xfffff000) == 0 );
19130 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19131 } else if (rv != 0) {
19132 ASSERT( (rv & 0xfff00000) == 0 );
19133 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19134 } else
19135 return 0;
19136 }
19137
19138 int
helper_ioctl(dev_t dev,u_long cmd,caddr_t data,int fflag,struct proc * p)19139 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19140 {
19141 #pragma unused(dev,fflag,p)
19142 int err, rv = 0;
19143
19144 err = dtrace_ioctl_helper(cmd, data, &rv);
19145 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19146 if (err != 0) {
19147 ASSERT( (err & 0xfffff000) == 0 );
19148 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19149 } else if (rv != 0) {
19150 ASSERT( (rv & 0xfff00000) == 0 );
19151 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19152 } else
19153 return 0;
19154 }
19155
19156 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
19157
19158 #define nulldevfp (void (*)(void))&nulldev
19159
19160 const static struct cdevsw helper_cdevsw =
19161 {
19162 .d_open = helper_open,
19163 .d_close = helper_close,
19164 .d_read = eno_rdwrt,
19165 .d_write = eno_rdwrt,
19166 .d_ioctl = helper_ioctl,
19167 .d_stop = eno_stop,
19168 .d_reset = eno_reset,
19169 .d_select = eno_select,
19170 .d_mmap = eno_mmap,
19171 .d_strategy = eno_strat,
19172 .d_reserved_1 = eno_getc,
19173 .d_reserved_2 = eno_putc,
19174 };
19175
19176 static int helper_majdevno = 0;
19177
19178 static int gDTraceInited = 0;
19179
19180 void
helper_init(void)19181 helper_init( void )
19182 {
19183 /*
19184 * Once the "helper" is initialized, it can take ioctl calls that use locks
19185 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19186 * before us.
19187 */
19188
19189 if (!gDTraceInited) {
19190 panic("helper_init before dtrace_init");
19191 }
19192
19193 if (0 >= helper_majdevno)
19194 {
19195 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19196
19197 if (helper_majdevno < 0) {
19198 printf("helper_init: failed to allocate a major number!\n");
19199 return;
19200 }
19201
19202 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19203 DTRACEMNR_HELPER )) {
19204 printf("dtrace_init: failed to devfs_make_node for helper!\n");
19205 return;
19206 }
19207 } else
19208 panic("helper_init: called twice!");
19209 }
19210
19211 #undef HELPER_MAJOR
19212
19213 static int
dtrace_clone_func(dev_t dev,int action)19214 dtrace_clone_func(dev_t dev, int action)
19215 {
19216 #pragma unused(dev)
19217
19218 if (action == DEVFS_CLONE_ALLOC) {
19219 return dtrace_state_reserve();
19220 }
19221 else if (action == DEVFS_CLONE_FREE) {
19222 return 0;
19223 }
19224 else return -1;
19225 }
19226
19227 void dtrace_ast(void);
19228
19229 void
dtrace_ast(void)19230 dtrace_ast(void)
19231 {
19232 int i;
19233 uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
19234 if (clients == 0)
19235 return;
19236 /**
19237 * We disable preemption here to be sure that we won't get
19238 * interrupted by a wakeup to a thread that is higher
19239 * priority than us, so that we do issue all wakeups
19240 */
19241 disable_preemption();
19242 for (i = 0; i < DTRACE_NCLIENTS; i++) {
19243 if (clients & (1 << i)) {
19244 dtrace_state_t *state = dtrace_state_get(i);
19245 if (state) {
19246 wakeup(state);
19247 }
19248
19249 }
19250 }
19251 enable_preemption();
19252 }
19253
19254
19255 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19256
19257 static const struct cdevsw dtrace_cdevsw =
19258 {
19259 .d_open = _dtrace_open,
19260 .d_close = _dtrace_close,
19261 .d_read = eno_rdwrt,
19262 .d_write = eno_rdwrt,
19263 .d_ioctl = _dtrace_ioctl,
19264 .d_stop = eno_stop,
19265 .d_reset = eno_reset,
19266 .d_select = eno_select,
19267 .d_mmap = eno_mmap,
19268 .d_strategy = eno_strat,
19269 .d_reserved_1 = eno_getc,
19270 .d_reserved_2 = eno_putc,
19271 };
19272
19273 LCK_ATTR_DECLARE(dtrace_lck_attr, 0, 0);
19274 LCK_GRP_DECLARE(dtrace_lck_grp, "dtrace");
19275
19276 static int gMajDevNo;
19277
dtrace_early_init(void)19278 void dtrace_early_init (void)
19279 {
19280 dtrace_restriction_policy_load();
19281
19282 /*
19283 * See dtrace_impl.h for a description of kernel symbol modes.
19284 * The default is to wait for symbols from userspace (lazy symbols).
19285 */
19286 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19287 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19288 }
19289 }
19290
19291 void
dtrace_init(void)19292 dtrace_init( void )
19293 {
19294 if (0 == gDTraceInited) {
19295 unsigned int i, ncpu;
19296 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19297
19298 /*
19299 * Disable destructive actions when dtrace is running
19300 * in a restricted environment
19301 */
19302 dtrace_destructive_disallow = dtrace_is_restricted() &&
19303 !dtrace_are_restrictions_relaxed();
19304
19305 /*
19306 * DTrace allocates buffers based on the maximum number
19307 * of enabled cpus. This call avoids any race when finding
19308 * that count.
19309 */
19310 ASSERT(dtrace_max_cpus == 0);
19311 ncpu = dtrace_max_cpus = ml_wait_max_cpus();
19312
19313 /*
19314 * Retrieve the size of the physical memory in order to define
19315 * the state buffer memory maximal size. If we cannot retrieve
19316 * this value, we'll consider that we have 1Gb of memory per CPU, that's
19317 * still better than raising a kernel panic.
19318 */
19319 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19320 &size, NULL, 0))
19321 {
19322 dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
19323 printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
19324 dtrace_buffer_memory_maxsize);
19325 }
19326
19327 /*
19328 * Finally, divide by three to prevent DTrace from eating too
19329 * much memory.
19330 */
19331 dtrace_buffer_memory_maxsize /= 3;
19332 ASSERT(dtrace_buffer_memory_maxsize > 0);
19333
19334 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19335
19336 if (gMajDevNo < 0) {
19337 printf("dtrace_init: failed to allocate a major number!\n");
19338 gDTraceInited = 0;
19339 return;
19340 }
19341
19342 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19343 dtrace_clone_func, DTRACEMNR_DTRACE )) {
19344 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19345 gDTraceInited = 0;
19346 return;
19347 }
19348
19349 /*
19350 * The cpu_core structure consists of per-CPU state available in any context.
19351 * On some architectures, this may mean that the page(s) containing the
19352 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19353 * is up to the platform to assure that this is performed properly. Note that
19354 * the structure is sized to avoid false sharing.
19355 */
19356
19357 dtrace_modctl_list = NULL;
19358
19359 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19360 for (i = 0; i < ncpu; ++i) {
19361 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, &dtrace_lck_grp, &dtrace_lck_attr);
19362 }
19363
19364 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19365 for (i = 0; i < ncpu; ++i) {
19366 cpu_list[i].cpu_id = (processorid_t)i;
19367 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19368 LIST_INIT(&cpu_list[i].cpu_cyc_list);
19369 lck_rw_init(&cpu_list[i].cpu_ft_lock, &dtrace_lck_grp, &dtrace_lck_attr);
19370 }
19371
19372 /*
19373 * Initialize the CPU offline/online hooks.
19374 */
19375 dtrace_install_cpu_hooks();
19376
19377 lck_mtx_lock(&cpu_lock);
19378 for (i = 0; i < ncpu; ++i)
19379 /* FIXME: track CPU configuration */
19380 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19381 lck_mtx_unlock(&cpu_lock);
19382
19383 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19384
19385 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
19386 offsetof(dtrace_string_t, dtst_str),
19387 offsetof(dtrace_string_t, dtst_next),
19388 offsetof(dtrace_string_t, dtst_prev));
19389
19390 /*
19391 * See dtrace_impl.h for a description of dof modes.
19392 * The default is lazy dof.
19393 *
19394 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19395 * makes no sense...
19396 */
19397 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19398 #if defined(XNU_TARGET_OS_OSX)
19399 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19400 #else
19401 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19402 #endif
19403 }
19404
19405 /*
19406 * Sanity check of dof mode value.
19407 */
19408 switch (dtrace_dof_mode) {
19409 case DTRACE_DOF_MODE_NEVER:
19410 case DTRACE_DOF_MODE_LAZY_ON:
19411 /* valid modes, but nothing else we need to do */
19412 break;
19413
19414 case DTRACE_DOF_MODE_LAZY_OFF:
19415 case DTRACE_DOF_MODE_NON_LAZY:
19416 /* Cannot wait for a dtrace_open to init fasttrap */
19417 fasttrap_init();
19418 break;
19419
19420 default:
19421 /* Invalid, clamp to non lazy */
19422 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19423 fasttrap_init();
19424 break;
19425 }
19426
19427 #if CONFIG_DTRACE
19428 if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19429 commpage_update_dof(true);
19430 #endif
19431
19432 gDTraceInited = 1;
19433
19434 } else
19435 panic("dtrace_init: called twice!");
19436 }
19437
19438 void
dtrace_postinit(void)19439 dtrace_postinit(void)
19440 {
19441 /*
19442 * Called from bsd_init after all provider's *_init() routines have been
19443 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19444 * to go.
19445 */
19446 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
19447
19448 /*
19449 * Add the mach_kernel to the module list for lazy processing
19450 */
19451 struct kmod_info fake_kernel_kmod;
19452 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19453
19454 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19455 fake_kernel_kmod.id = 1;
19456 fake_kernel_kmod.address = g_kernel_kmod_info.address;
19457 fake_kernel_kmod.size = g_kernel_kmod_info.size;
19458
19459 /* Ensure we don't try to touch symbols if they are gone. */
19460 boolean_t keepsyms = false;
19461 PE_parse_boot_argn("keepsyms", &keepsyms, sizeof(keepsyms));
19462
19463 if (dtrace_module_loaded(&fake_kernel_kmod, (keepsyms) ? 0 : KMOD_DTRACE_NO_KERNEL_SYMS) != 0) {
19464 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19465 }
19466
19467 (void)OSKextRegisterKextsWithDTrace();
19468 }
19469 #undef DTRACE_MAJOR
19470
19471 /*
19472 * Routines used to register interest in cpu's being added to or removed
19473 * from the system.
19474 */
19475 void
register_cpu_setup_func(cpu_setup_func_t * ignore1,void * ignore2)19476 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19477 {
19478 #pragma unused(ignore1,ignore2)
19479 }
19480
19481 void
unregister_cpu_setup_func(cpu_setup_func_t * ignore1,void * ignore2)19482 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19483 {
19484 #pragma unused(ignore1,ignore2)
19485 }
19486