1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Portions Copyright (c) 2013, 2016, Joyent, Inc. All rights reserved.
24 * Portions Copyright (c) 2013 by Delphix. All rights reserved.
25 */
26
27 /*
28 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
29 * Use is subject to license terms.
30 */
31
32 /*
33 * DTrace - Dynamic Tracing for Solaris
34 *
35 * This is the implementation of the Solaris Dynamic Tracing framework
36 * (DTrace). The user-visible interface to DTrace is described at length in
37 * the "Solaris Dynamic Tracing Guide". The interfaces between the libdtrace
38 * library, the in-kernel DTrace framework, and the DTrace providers are
39 * described in the block comments in the <sys/dtrace.h> header file. The
40 * internal architecture of DTrace is described in the block comments in the
41 * <sys/dtrace_impl.h> header file. The comments contained within the DTrace
42 * implementation very much assume mastery of all of these sources; if one has
43 * an unanswered question about the implementation, one should consult them
44 * first.
45 *
46 * The functions here are ordered roughly as follows:
47 *
48 * - Probe context functions
49 * - Probe hashing functions
50 * - Non-probe context utility functions
51 * - Matching functions
52 * - Provider-to-Framework API functions
53 * - Probe management functions
54 * - DIF object functions
55 * - Format functions
56 * - Predicate functions
57 * - ECB functions
58 * - Buffer functions
59 * - Enabling functions
60 * - DOF functions
61 * - Anonymous enabling functions
62 * - Process functions
63 * - Consumer state functions
64 * - Helper functions
65 * - Hook functions
66 * - Driver cookbook functions
67 *
68 * Each group of functions begins with a block comment labelled the "DTrace
69 * [Group] Functions", allowing one to find each block by searching forward
70 * on capital-f functions.
71 */
72 #include <sys/errno.h>
73 #include <sys/types.h>
74 #include <sys/stat.h>
75 #include <sys/conf.h>
76 #include <sys/random.h>
77 #include <sys/systm.h>
78 #include <sys/dtrace_impl.h>
79 #include <sys/param.h>
80 #include <sys/proc_internal.h>
81 #include <sys/ioctl.h>
82 #include <sys/fcntl.h>
83 #include <miscfs/devfs/devfs.h>
84 #include <sys/malloc.h>
85 #include <sys/kernel_types.h>
86 #include <sys/proc_internal.h>
87 #include <sys/uio_internal.h>
88 #include <sys/kauth.h>
89 #include <vm/pmap.h>
90 #include <sys/user.h>
91 #include <mach/exception_types.h>
92 #include <sys/signalvar.h>
93 #include <mach/task.h>
94 #include <kern/zalloc.h>
95 #include <kern/ast.h>
96 #include <kern/sched_prim.h>
97 #include <kern/task.h>
98 #include <kern/hvg_hypercall.h>
99 #include <netinet/in.h>
100 #include <libkern/sysctl.h>
101 #include <sys/kdebug.h>
102 #include <sys/sdt_impl.h>
103
104 #if MONOTONIC
105 #include <kern/monotonic.h>
106 #include <machine/monotonic.h>
107 #endif /* MONOTONIC */
108
109 #include "dtrace_xoroshiro128_plus.h"
110
111 #include <IOKit/IOPlatformExpert.h>
112
113 #include <kern/cpu_data.h>
114
115 extern addr64_t kvtophys(vm_offset_t va);
116
117 extern uint32_t pmap_find_phys(void *, uint64_t);
118 extern boolean_t pmap_valid_page(uint32_t);
119 extern void OSKextRegisterKextsWithDTrace(void);
120 extern kmod_info_t g_kernel_kmod_info;
121 extern void commpage_update_dof(boolean_t enabled);
122
123 /* Solaris proc_t is the struct. Darwin's proc_t is a pointer to it. */
124 #define proc_t struct proc /* Steer clear of the Darwin typedef for proc_t */
125
126 #define t_predcache t_dtrace_predcache /* Cosmetic. Helps readability of thread.h */
127
128 extern void dtrace_suspend(void);
129 extern void dtrace_resume(void);
130 extern void dtrace_early_init(void);
131 extern int dtrace_keep_kernel_symbols(void);
132 extern void dtrace_init(void);
133 extern void helper_init(void);
134 extern void fasttrap_init(void);
135
136 static int dtrace_lazy_dofs_duplicate(proc_t *, proc_t *);
137 extern void dtrace_lazy_dofs_destroy(proc_t *);
138 extern void dtrace_postinit(void);
139
140 extern void dtrace_proc_fork(proc_t*, proc_t*, int);
141 extern void dtrace_proc_exec(proc_t*);
142 extern void dtrace_proc_exit(proc_t*);
143
144 /*
145 * DTrace Tunable Variables
146 *
147 * The following variables may be dynamically tuned by using sysctl(8), the
148 * variables being stored in the kern.dtrace namespace. For example:
149 * sysctl kern.dtrace.dof_maxsize = 1048575 # 1M
150 *
151 * In general, the only variables that one should be tuning this way are those
152 * that affect system-wide DTrace behavior, and for which the default behavior
153 * is undesirable. Most of these variables are tunable on a per-consumer
154 * basis using DTrace options, and need not be tuned on a system-wide basis.
155 * When tuning these variables, avoid pathological values; while some attempt
156 * is made to verify the integrity of these variables, they are not considered
157 * part of the supported interface to DTrace, and they are therefore not
158 * checked comprehensively.
159 */
/* Ceiling, in bytes, on consumer state buffer memory (kern.dtrace.buffer_memory_maxsize). */
uint64_t	dtrace_buffer_memory_maxsize = 0;		/* initialized in dtrace_init */
/* State buffer memory currently in use, in bytes (kern.dtrace.buffer_memory_inuse, read-only). */
uint64_t	dtrace_buffer_memory_inuse = 0;
int		dtrace_destructive_disallow = 1;
dtrace_optval_t	dtrace_nonroot_maxsize = (16 * 1024 * 1024);	/* 16 MB */
size_t		dtrace_difo_maxsize = (256 * 1024);		/* 256 KB; kern.dtrace.difo_maxsize */
dtrace_optval_t	dtrace_dof_maxsize = (512 * 1024);		/* 512 KB; kern.dtrace.dof_maxsize */
dtrace_optval_t	dtrace_statvar_maxsize = (16 * 1024);		/* 16 KB; kern.dtrace.global_maxsize */
/* Largest value the kern.dtrace.global_maxsize handler accepts (160 KB). */
dtrace_optval_t	dtrace_statvar_maxsize_max = (16 * 10 * 1024);
size_t		dtrace_actions_max = (16 * 1024);
size_t		dtrace_retain_max = 1024;
dtrace_optval_t	dtrace_helper_actions_max = 32;
dtrace_optval_t	dtrace_helper_providers_max = 64;
dtrace_optval_t	dtrace_dstate_defsize = (1 * 1024 * 1024);	/* 1 MB */
size_t		dtrace_strsize_default = 256;
dtrace_optval_t	dtrace_strsize_min = 8;
dtrace_optval_t	dtrace_strsize_max = 65536;
/* The rate tunables below are periods expressed in nanoseconds. */
dtrace_optval_t	dtrace_cleanrate_default = 990099000;		/* ~1.01 hz (990.099 ms period) */
dtrace_optval_t	dtrace_cleanrate_min = 20000000;		/* 50 hz (20 ms period) */
dtrace_optval_t	dtrace_cleanrate_max = (uint64_t)60 * NANOSEC;	/* 1/minute */
dtrace_optval_t	dtrace_aggrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_statusrate_max = (hrtime_t)10 * NANOSEC;	/* 6/minute */
dtrace_optval_t	dtrace_switchrate_default = NANOSEC;		/* 1 hz */
dtrace_optval_t	dtrace_nspec_default = 1;
dtrace_optval_t	dtrace_specsize_default = 32 * 1024;		/* 32 KB */
dtrace_optval_t	dtrace_stackframes_default = 20;
dtrace_optval_t	dtrace_ustackframes_default = 20;
dtrace_optval_t	dtrace_jstackframes_default = 50;
dtrace_optval_t	dtrace_jstackstrsize_default = 512;
dtrace_optval_t	dtrace_buflimit_default = 75;
dtrace_optval_t	dtrace_buflimit_min = 1;
dtrace_optval_t	dtrace_buflimit_max = 99;
size_t		dtrace_nprobes_default = 4;
int		dtrace_msgdsize_max = 128;
hrtime_t	dtrace_chill_max = 500 * (NANOSEC / MILLISEC);	/* 500 ms */
hrtime_t	dtrace_chill_interval = NANOSEC;		/* 1000 ms */
int		dtrace_devdepth_max = 32;
/* Boolean (0 or 1), settable through kern.dtrace.err_verbose. */
int		dtrace_err_verbose;
hrtime_t	dtrace_deadman_interval = NANOSEC;		/* 1 second */
hrtime_t	dtrace_deadman_timeout = (hrtime_t)10 * NANOSEC;	/* 10 seconds */
hrtime_t	dtrace_deadman_user = (hrtime_t)30 * NANOSEC;	/* 30 seconds */
201
202 /*
203 * DTrace External Variables
204 *
205 * As dtrace(7D) is a kernel module, any DTrace variables are obviously
206 * available to DTrace consumers via the backtick (`) syntax. One of these,
207 * dtrace_zero, is made deliberately so: it is provided as a source of
208 * well-known, zero-filled memory. While this variable is not documented,
209 * it is used by some translators as an implementation detail.
210 */
211 const char dtrace_zero[256] = { 0 }; /* zero-filled memory */
212 unsigned int dtrace_max_cpus = 0; /* number of enabled cpus */
213 /*
214 * DTrace Internal Variables
215 */
216 static dev_info_t *dtrace_devi; /* device info */
217 static vmem_t *dtrace_arena; /* probe ID arena */
218 static dtrace_probe_t **dtrace_probes; /* array of all probes */
219 static int dtrace_nprobes; /* number of probes */
220 static dtrace_provider_t *dtrace_provider; /* provider list */
221 static dtrace_meta_t *dtrace_meta_pid; /* user-land meta provider */
222 static int dtrace_opens; /* number of opens */
223 static int dtrace_helpers; /* number of helpers */
224 static dtrace_hash_t *dtrace_strings;
225 static dtrace_hash_t *dtrace_byprov; /* probes hashed by provider */
226 static dtrace_hash_t *dtrace_bymod; /* probes hashed by module */
227 static dtrace_hash_t *dtrace_byfunc; /* probes hashed by function */
228 static dtrace_hash_t *dtrace_byname; /* probes hashed by name */
229 static dtrace_toxrange_t *dtrace_toxrange; /* toxic range array */
230 static int dtrace_toxranges; /* number of toxic ranges */
231 static int dtrace_toxranges_max; /* size of toxic range array */
232 static dtrace_anon_t dtrace_anon; /* anonymous enabling */
233 static uint64_t dtrace_vtime_references; /* number of vtimestamp refs */
234 static kthread_t *dtrace_panicked; /* panicking thread */
235 static dtrace_ecb_t *dtrace_ecb_create_cache; /* cached created ECB */
236 static dtrace_genid_t dtrace_probegen; /* current probe generation */
237 static dtrace_helpers_t *dtrace_deferred_pid; /* deferred helper list */
238 static dtrace_enabling_t *dtrace_retained; /* list of retained enablings */
239 static dtrace_genid_t dtrace_retained_gen; /* current retained enab gen */
240 static dtrace_dynvar_t dtrace_dynhash_sink; /* end of dynamic hash chains */
241
242 static int dtrace_dof_mode; /* See dtrace_impl.h for a description of Darwin's dof modes. */
243
/*
 * This doesn't quite fit as an internal variable, as it must be accessed in
 * fbt_provide and sdt_provide. It's clearly not a DTrace tunable variable either...
 */
248 int dtrace_kernel_symbol_mode; /* See dtrace_impl.h for a description of Darwin's kernel symbol modes. */
249 static uint32_t dtrace_wake_clients;
250 static uint8_t dtrace_kerneluuid[16]; /* the 128-bit uuid */
251
252 /*
253 * To save memory, some common memory allocations are given a
254 * unique zone. For example, dtrace_probe_t is 72 bytes in size,
255 * which means it would fall into the kalloc.128 bucket. With
256 * 20k elements allocated, the space saved is substantial.
257 */
258
259 static ZONE_DEFINE_TYPE(dtrace_probe_t_zone, "dtrace.dtrace_probe_t",
260 dtrace_probe_t, ZC_PGZ_USE_GUARDS);
261
262 static ZONE_DEFINE(dtrace_state_pcpu_zone, "dtrace.dtrace_dstate_percpu_t",
263 sizeof(dtrace_dstate_percpu_t), ZC_PERCPU);
264
265 static int dtrace_module_unloaded(struct kmod_info *kmod);
266
267 /*
268 * DTrace Locking
269 * DTrace is protected by three (relatively coarse-grained) locks:
270 *
271 * (1) dtrace_lock is required to manipulate essentially any DTrace state,
272 * including enabling state, probes, ECBs, consumer state, helper state,
273 * etc. Importantly, dtrace_lock is _not_ required when in probe context;
274 * probe context is lock-free -- synchronization is handled via the
275 * dtrace_sync() cross call mechanism.
276 *
277 * (2) dtrace_provider_lock is required when manipulating provider state, or
278 * when provider state must be held constant.
279 *
280 * (3) dtrace_meta_lock is required when manipulating meta provider state, or
281 * when meta provider state must be held constant.
282 *
283 * The lock ordering between these three locks is dtrace_meta_lock before
284 * dtrace_provider_lock before dtrace_lock. (In particular, there are
285 * several places where dtrace_provider_lock is held by the framework as it
286 * calls into the providers -- which then call back into the framework,
287 * grabbing dtrace_lock.)
288 *
289 * There are two other locks in the mix: mod_lock and cpu_lock. With respect
290 * to dtrace_provider_lock and dtrace_lock, cpu_lock continues its historical
291 * role as a coarse-grained lock; it is acquired before both of these locks.
292 * With respect to dtrace_meta_lock, its behavior is stranger: cpu_lock must
293 * be acquired _between_ dtrace_meta_lock and any other DTrace locks.
294 * mod_lock is similar with respect to dtrace_provider_lock in that it must be
295 * acquired _between_ dtrace_provider_lock and dtrace_lock.
296 */
297
298
299 /*
300 * APPLE NOTE:
301 *
302 * For porting purposes, all kmutex_t vars have been changed
303 * to lck_mtx_t, which require explicit initialization.
304 *
305 * kmutex_t becomes lck_mtx_t
306 * mutex_enter() becomes lck_mtx_lock()
307 * mutex_exit() becomes lck_mtx_unlock()
308 *
309 * Lock asserts are changed like this:
310 *
311 * ASSERT(MUTEX_HELD(&cpu_lock));
312 * becomes:
313 * LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
314 *
315 */
316 static LCK_MTX_DECLARE_ATTR(dtrace_lock,
317 &dtrace_lck_grp, &dtrace_lck_attr); /* probe state lock */
318 static LCK_MTX_DECLARE_ATTR(dtrace_provider_lock,
319 &dtrace_lck_grp, &dtrace_lck_attr); /* provider state lock */
320 static LCK_MTX_DECLARE_ATTR(dtrace_meta_lock,
321 &dtrace_lck_grp, &dtrace_lck_attr); /* meta-provider state lock */
322 static LCK_RW_DECLARE_ATTR(dtrace_dof_mode_lock,
323 &dtrace_lck_grp, &dtrace_lck_attr); /* dof mode lock */
324
325 /*
326 * DTrace Provider Variables
327 *
328 * These are the variables relating to DTrace as a provider (that is, the
329 * provider of the BEGIN, END, and ERROR probes).
330 */
331 static dtrace_pattr_t dtrace_provider_attr = {
332 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
333 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
334 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
335 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
336 { DTRACE_STABILITY_STABLE, DTRACE_STABILITY_STABLE, DTRACE_CLASS_COMMON },
337 };
338
339 static void
dtrace_provide_nullop(void * arg,const dtrace_probedesc_t * desc)340 dtrace_provide_nullop(void *arg, const dtrace_probedesc_t *desc)
341 {
342 #pragma unused(arg, desc)
343 }
344
/*
 * dtps_provide_module entry point of the dtrace provider's own operations
 * vector. Intentionally does nothing; both arguments are ignored.
 */
static void
dtrace_provide_module_nullop(void *arg, struct modctl *ctl)
{
#pragma unused(arg, ctl)
}
350
351 static int
dtrace_enable_nullop(void * arg,dtrace_id_t id,void * parg)352 dtrace_enable_nullop(void *arg, dtrace_id_t id, void *parg)
353 {
354 #pragma unused(arg, id, parg)
355 return (0);
356 }
357
358 static void
dtrace_disable_nullop(void * arg,dtrace_id_t id,void * parg)359 dtrace_disable_nullop(void *arg, dtrace_id_t id, void *parg)
360 {
361 #pragma unused(arg, id, parg)
362 }
363
364 static void
dtrace_suspend_nullop(void * arg,dtrace_id_t id,void * parg)365 dtrace_suspend_nullop(void *arg, dtrace_id_t id, void *parg)
366 {
367 #pragma unused(arg, id, parg)
368 }
369
370 static void
dtrace_resume_nullop(void * arg,dtrace_id_t id,void * parg)371 dtrace_resume_nullop(void *arg, dtrace_id_t id, void *parg)
372 {
373 #pragma unused(arg, id, parg)
374 }
375
376 static void
dtrace_destroy_nullop(void * arg,dtrace_id_t id,void * parg)377 dtrace_destroy_nullop(void *arg, dtrace_id_t id, void *parg)
378 {
379 #pragma unused(arg, id, parg)
380 }
381
382
383 static dtrace_pops_t dtrace_provider_ops = {
384 .dtps_provide = dtrace_provide_nullop,
385 .dtps_provide_module = dtrace_provide_module_nullop,
386 .dtps_enable = dtrace_enable_nullop,
387 .dtps_disable = dtrace_disable_nullop,
388 .dtps_suspend = dtrace_suspend_nullop,
389 .dtps_resume = dtrace_resume_nullop,
390 .dtps_getargdesc = NULL,
391 .dtps_getargval = NULL,
392 .dtps_usermode = NULL,
393 .dtps_destroy = dtrace_destroy_nullop,
394 };
395
396 static dtrace_id_t dtrace_probeid_begin; /* special BEGIN probe */
397 static dtrace_id_t dtrace_probeid_end; /* special END probe */
398 dtrace_id_t dtrace_probeid_error; /* special ERROR probe */
399
400 /*
401 * DTrace Helper Tracing Variables
402 */
403 uint32_t dtrace_helptrace_next = 0;
404 uint32_t dtrace_helptrace_nlocals;
405 char *dtrace_helptrace_buffer;
406 size_t dtrace_helptrace_bufsize = 512 * 1024;
407
408 #if DEBUG
409 int dtrace_helptrace_enabled = 1;
410 #else
411 int dtrace_helptrace_enabled = 0;
412 #endif
413
414 #if defined (__arm64__)
415 /*
416 * The ioctl for adding helper DOF is based on the
417 * size of a user_addr_t. We need to recognize both
418 * U32 and U64 as the same action.
419 */
420 #define DTRACEHIOC_ADDDOF_U32 _IOW('h', 4, user32_addr_t)
421 #define DTRACEHIOC_ADDDOF_U64 _IOW('h', 4, user64_addr_t)
422 #endif /* __arm64__ */
423
/*
 * DTrace Error Hashing
 *
 * On DEBUG kernels, DTrace will track the errors that it has seen in a hash
 * table. This is very useful for checking coverage of tests that are
 * expected to induce DIF or DOF processing errors, and may be useful for
 * debugging problems in the DIF code generator or in DOF generation. The
 * error hash may be examined with the ::dtrace_errhash MDB dcmd.
 */
433 #if DEBUG
434 static dtrace_errhash_t dtrace_errhash[DTRACE_ERRHASHSZ];
435 static const char *dtrace_errlast;
436 static kthread_t *dtrace_errthread;
437 static LCK_MTX_DECLARE_ATTR(dtrace_errlock, &dtrace_lck_grp, &dtrace_lck_attr);
438 #endif
439
440 /*
441 * DTrace Macros and Constants
442 *
443 * These are various macros that are useful in various spots in the
444 * implementation, along with a few random constants that have no meaning
445 * outside of the implementation. There is no real structure to this cpp
446 * mishmash -- but is there ever?
447 */
448
/*
 * Hash table helper macros. `hash' is a pointer to a structure providing the
 * dth_getstr callback and the dth_stroffs / dth_nextoffs / dth_prevoffs
 * offsets; `elm', `lhs' and `rhs' are hashed elements.
 *
 * Every macro argument is parenthesized so that arbitrary expressions
 * (e.g. `hp + 0' or `cond ? a : b') may be passed safely.
 */

/* Fetch the string of an element through the hash's dth_getstr callback. */
#define DTRACE_GETSTR(hash, elm)	\
	((hash)->dth_getstr((elm), (hash)->dth_stroffs))

/* Hash value of an element's string. */
#define DTRACE_HASHSTR(hash, elm)	\
	dtrace_hash_str(DTRACE_GETSTR(hash, elm))

/* Address of the pointer slot located dth_nextoffs bytes into the element. */
#define DTRACE_HASHNEXT(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_nextoffs))

/* Address of the pointer slot located dth_prevoffs bytes into the element. */
#define DTRACE_HASHPREV(hash, elm)	\
	((void **)((uintptr_t)(elm) + (hash)->dth_prevoffs))

/* Non-zero when the strings of both elements compare equal. */
#define DTRACE_HASHEQ(hash, lhs, rhs)	\
	(strcmp(DTRACE_GETSTR(hash, lhs),	\
	    DTRACE_GETSTR(hash, rhs)) == 0)
464
465 #define DTRACE_AGGHASHSIZE_SLEW 17
466
467 #define DTRACE_V4MAPPED_OFFSET (sizeof (uint32_t) * 3)
468
469 /*
470 * The key for a thread-local variable needs to be unique to a single
471 * thread over the lifetime of the system, and not overlap with any variable
472 * IDs. So we take thread's thread_id, a unique 64-bit number that is never
473 * reused after the thread exits, and add DIF_VARIABLE_MAX to it, which
474 * guarantees that it won’t overlap any variable IDs. We also want to treat
475 * running in interrupt context as independent of thread-context. So if
476 * interrupts are active, we set the 63rd bit, otherwise it’s cleared.
477 *
478 * This is necessary (but not sufficient) to assure that global associative
479 * arrays never collide with thread-local variables. To guarantee that they
480 * cannot collide, we must also define the order for keying dynamic variables.
481 *
482 * That order is:
483 *
484 * [ key0 ] ... [ keyn ] [ variable-key ] [ tls-key ]
485 *
486 * Because the variable-key and the tls-key are in orthogonal spaces, there is
487 * no way for a global variable key signature to match a thread-local key
488 * signature.
489 */
#if defined (__x86_64__) || defined(__arm__) || defined(__arm64__)
/*
 * Computes the thread-local-storage key for the current thread and assigns
 * it to `where' (which must be an lvalue): the thread id plus
 * DIF_VARIABLE_MAX with bit 63 cleared, then bit 63 set from
 * ml_at_interrupt_context().
 *
 * Note that this expands to a bare brace-enclosed block (not a
 * do { } while (0) statement) and declares locals named `intr' and `thr'.
 */
#define DTRACE_TLS_THRKEY(where) { \
	uint_t intr = ml_at_interrupt_context(); /* Note: just one measly bit */ \
	uint64_t thr = thread_tid(current_thread()); \
	ASSERT(intr < 2); \
	(where) = ((thr + DIF_VARIABLE_MAX) & (~((uint64_t)1 << 63))) | \
	    ((uint64_t)intr << 63); \
}
#else
#error Unknown architecture
#endif
501
502 #define DT_BSWAP_8(x) ((x) & 0xff)
503 #define DT_BSWAP_16(x) ((DT_BSWAP_8(x) << 8) | DT_BSWAP_8((x) >> 8))
504 #define DT_BSWAP_32(x) ((DT_BSWAP_16(x) << 16) | DT_BSWAP_16((x) >> 16))
505 #define DT_BSWAP_64(x) ((DT_BSWAP_32(x) << 32) | DT_BSWAP_32((x) >> 32))
506
507 #define DT_MASK_LO 0x00000000FFFFFFFFULL
508
/*
 * Store `what', converted to `type', at byte offset `offset' from `tomax'.
 * `offset' is parenthesized so that compound expressions (for example a
 * conditional expression) are cast as a whole. The trailing semicolon is
 * part of the expansion and is kept for compatibility with existing callers.
 */
#define DTRACE_STORE(type, tomax, offset, what) \
	*((type *)((uintptr_t)(tomax) + (uintptr_t)(offset))) = (type)(what);
511
512
/*
 * If `addr' is not aligned to MIN(size, 4) bytes, flag CPU_DTRACE_BADALIGN
 * through the `flags' pointer, record `addr' as the current CPU's illegal
 * value and return 0 from the *enclosing* function.
 *
 * All arguments are parenthesized so compound expressions may be passed.
 * The expansion is deliberately left as a plain `if' statement so that
 * existing call sites keep compiling unchanged.
 */
#define DTRACE_ALIGNCHECK(addr, size, flags)				\
	if ((addr) & (MIN((size), 4) - 1)) {				\
		*(flags) |= CPU_DTRACE_BADALIGN;			\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = (addr);	\
		return (0);						\
	}
519
520 #define DTRACE_RANGE_REMAIN(remp, addr, baseaddr, basesz) \
521 do { \
522 if ((remp) != NULL) { \
523 *(remp) = (uintptr_t)(baseaddr) + (basesz) - (addr); \
524 } \
525 } while (0)
526
527
528 /*
529 * Test whether a range of memory starting at testaddr of size testsz falls
530 * within the range of memory described by addr, sz. We take care to avoid
531 * problems with overflow and underflow of the unsigned quantities, and
532 * disallow all negative sizes. Ranges of size 0 are allowed.
533 */
534 #define DTRACE_INRANGE(testaddr, testsz, baseaddr, basesz) \
535 ((testaddr) - (baseaddr) < (basesz) && \
536 (testaddr) + (testsz) - (baseaddr) <= (basesz) && \
537 (testaddr) + (testsz) >= (testaddr))
538
539 /*
540 * Test whether alloc_sz bytes will fit in the scratch region. We isolate
541 * alloc_sz on the righthand side of the comparison in order to avoid overflow
542 * or underflow in the comparison with it. This is simpler than the INRANGE
543 * check above, because we know that the dtms_scratch_ptr is valid in the
544 * range. Allocations of size zero are allowed.
545 */
546 #define DTRACE_INSCRATCH(mstate, alloc_sz) \
547 ((mstate)->dtms_scratch_base + (mstate)->dtms_scratch_size - \
548 (mstate)->dtms_scratch_ptr >= (alloc_sz))
549
550 #define RECOVER_LABEL(bits) dtraceLoadRecover##bits:
551
#if defined (__x86_64__) || (defined (__arm__) || defined (__arm64__))
/*
 * DTRACE_LOADFUNC(bits) declares and defines dtrace_load<bits>(), the
 * probe-context-safe load of a <bits>-wide unsigned value from `addr'.
 *
 * The generated function returns 0 and sets a fault flag in the current
 * CPU's cpuc_dtrace_flags when:
 *   - `addr' is misaligned (CPU_DTRACE_BADALIGN, via DTRACE_ALIGNCHECK);
 *   - the access overlaps one of the dtrace_toxrange[] entries
 *     (CPU_DTRACE_BADADDR);
 *   - the page backing `addr' is not a valid page (CPU_DTRACE_BADADDR).
 * Otherwise the load is performed with CPU_DTRACE_NOFAULT set and a thread
 * recovery address pointing at the dtraceLoadRecover<bits> label.
 *
 * The invalid-page path falls through to the common epilogue (rval is
 * still 0 there) instead of returning early, so that the previous thread
 * recovery handler is always restored and CPU_DTRACE_NOFAULT is always
 * cleared once they have been installed.
 */
#define DTRACE_LOADFUNC(bits)						\
/*CSTYLED*/								\
uint##bits##_t dtrace_load##bits(uintptr_t addr);			\
									\
uint##bits##_t								\
dtrace_load##bits(uintptr_t addr)					\
{									\
	size_t size = bits / NBBY;					\
	/*CSTYLED*/							\
	uint##bits##_t rval = 0;					\
	int i;								\
	volatile uint16_t *flags = (volatile uint16_t *)		\
	    &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;			\
									\
	DTRACE_ALIGNCHECK(addr, size, flags);				\
									\
	for (i = 0; i < dtrace_toxranges; i++) {			\
		if (addr >= dtrace_toxrange[i].dtt_limit)		\
			continue;					\
									\
		if (addr + size <= dtrace_toxrange[i].dtt_base)		\
			continue;					\
									\
		/*							\
		 * This address falls within a toxic region; return 0.	\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
		return (0);						\
	}								\
									\
	{								\
	volatile vm_offset_t recover = (vm_offset_t)&&dtraceLoadRecover##bits;	\
	*flags |= CPU_DTRACE_NOFAULT;					\
	recover = dtrace_sign_and_set_thread_recover(current_thread(), recover);	\
	/*CSTYLED*/							\
	/*								\
	 * PR6394061 - avoid device memory that is unpredictably	\
	 * mapped and unmapped						\
	 */								\
	if (pmap_valid_page(pmap_find_phys(kernel_pmap, addr)))		\
		rval = *((volatile uint##bits##_t *)addr);		\
	else {								\
		/*							\
		 * Do not return from here: the recovery handler and	\
		 * CPU_DTRACE_NOFAULT installed above must be undone	\
		 * by the epilogue below. rval is still 0.		\
		 */							\
		*flags |= CPU_DTRACE_BADADDR;				\
		cpu_core[CPU->cpu_id].cpuc_dtrace_illval = addr;	\
	}								\
									\
	RECOVER_LABEL(bits);						\
	(void)dtrace_set_thread_recover(current_thread(), recover);	\
	*flags &= ~CPU_DTRACE_NOFAULT;					\
	}								\
									\
	return (rval);							\
}
#else /* all other architectures */
#error Unknown Architecture
#endif
611
612 #ifdef __LP64__
613 #define dtrace_loadptr dtrace_load64
614 #else
615 #define dtrace_loadptr dtrace_load32
616 #endif
617
618 #define DTRACE_DYNHASH_FREE 0
619 #define DTRACE_DYNHASH_SINK 1
620 #define DTRACE_DYNHASH_VALID 2
621
/*
 * Match result codes. The negative value is parenthesized so the macro
 * behaves as a single operand wherever it is expanded.
 */
#define DTRACE_MATCH_FAIL	(-1)
#define DTRACE_MATCH_NEXT	0
#define DTRACE_MATCH_DONE	1
625 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0')
626 #define DTRACE_STATE_ALIGN 64
627
628 #define DTRACE_FLAGS2FLT(flags) \
629 (((flags) & CPU_DTRACE_BADADDR) ? DTRACEFLT_BADADDR : \
630 ((flags) & CPU_DTRACE_ILLOP) ? DTRACEFLT_ILLOP : \
631 ((flags) & CPU_DTRACE_DIVZERO) ? DTRACEFLT_DIVZERO : \
632 ((flags) & CPU_DTRACE_KPRIV) ? DTRACEFLT_KPRIV : \
633 ((flags) & CPU_DTRACE_UPRIV) ? DTRACEFLT_UPRIV : \
634 ((flags) & CPU_DTRACE_TUPOFLOW) ? DTRACEFLT_TUPOFLOW : \
635 ((flags) & CPU_DTRACE_BADALIGN) ? DTRACEFLT_BADALIGN : \
636 ((flags) & CPU_DTRACE_NOSCRATCH) ? DTRACEFLT_NOSCRATCH : \
637 ((flags) & CPU_DTRACE_BADSTACK) ? DTRACEFLT_BADSTACK : \
638 DTRACEFLT_UNKNOWN)
639
640 #define DTRACEACT_ISSTRING(act) \
641 ((act)->dta_kind == DTRACEACT_DIFEXPR && \
642 (act)->dta_difo->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING)
643
644
645 static size_t dtrace_strlen(const char *, size_t);
646 static dtrace_probe_t *dtrace_probe_lookup_id(dtrace_id_t id);
647 static void dtrace_enabling_provide(dtrace_provider_t *);
648 static int dtrace_enabling_match(dtrace_enabling_t *, int *, dtrace_match_cond_t *cond);
649 static void dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond);
650 static void dtrace_enabling_matchall(void);
651 static dtrace_state_t *dtrace_anon_grab(void);
652 static uint64_t dtrace_helper(int, dtrace_mstate_t *,
653 dtrace_state_t *, uint64_t, uint64_t);
654 static dtrace_helpers_t *dtrace_helpers_create(proc_t *);
655 static void dtrace_buffer_drop(dtrace_buffer_t *);
656 static intptr_t dtrace_buffer_reserve(dtrace_buffer_t *, size_t, size_t,
657 dtrace_state_t *, dtrace_mstate_t *);
658 static int dtrace_state_option(dtrace_state_t *, dtrace_optid_t,
659 dtrace_optval_t);
660 static int dtrace_ecb_create_enable(dtrace_probe_t *, void *, void *);
661 static void dtrace_helper_provider_destroy(dtrace_helper_provider_t *);
662 static int dtrace_canload_remains(uint64_t, size_t, size_t *,
663 dtrace_mstate_t *, dtrace_vstate_t *);
664 static int dtrace_canstore_remains(uint64_t, size_t, size_t *,
665 dtrace_mstate_t *, dtrace_vstate_t *);
666
667
/*
 * DTrace sysctl handlers
 *
 * These declarations and functions are used for a deeper DTrace configuration.
 * Most of them do not apply on a per-consumer basis and may impact the other
 * DTrace consumers. Correctness may not be supported for all the variables,
 * so you should be careful about what values you are using.
 */
676
677 SYSCTL_DECL(_kern_dtrace);
678 SYSCTL_NODE(_kern, OID_AUTO, dtrace, CTLFLAG_RW | CTLFLAG_LOCKED, 0, "dtrace");
679
680 static int
681 sysctl_dtrace_err_verbose SYSCTL_HANDLER_ARGS
682 {
683 #pragma unused(oidp, arg2)
684 int changed, error;
685 int value = *(int *) arg1;
686
687 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
688 if (error || !changed)
689 return (error);
690
691 if (value != 0 && value != 1)
692 return (ERANGE);
693
694 lck_mtx_lock(&dtrace_lock);
695 dtrace_err_verbose = value;
696 lck_mtx_unlock(&dtrace_lock);
697
698 return (0);
699 }
700
/*
 * kern.dtrace.err_verbose
 *
 * Set DTrace verbosity when an error occurred (0 = disabled, 1 = enabled).
 * Errors are reported when a DIFO or a DOF has been rejected by the kernel.
 */
707 SYSCTL_PROC(_kern_dtrace, OID_AUTO, err_verbose,
708 CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_LOCKED,
709 &dtrace_err_verbose, 0,
710 sysctl_dtrace_err_verbose, "I", "dtrace error verbose");
711
712 static int
713 sysctl_dtrace_buffer_memory_maxsize SYSCTL_HANDLER_ARGS
714 {
715 #pragma unused(oidp, arg2, req)
716 int changed, error;
717 uint64_t value = *(uint64_t *) arg1;
718
719 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
720 if (error || !changed)
721 return (error);
722
723 if (value <= dtrace_buffer_memory_inuse)
724 return (ERANGE);
725
726 lck_mtx_lock(&dtrace_lock);
727 dtrace_buffer_memory_maxsize = value;
728 lck_mtx_unlock(&dtrace_lock);
729
730 return (0);
731 }
732
733 /*
734 * kern.dtrace.buffer_memory_maxsize
735 *
736 * Set DTrace maximal size in bytes used by all the consumers' state buffers. By default
737 * the limit is PHYS_MEM / 3 for *all* consumers. Attempting to set a null, a negative value
738 * or a value <= to dtrace_buffer_memory_inuse will result in a failure.
739 */
740 SYSCTL_PROC(_kern_dtrace, OID_AUTO, buffer_memory_maxsize,
741 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
742 &dtrace_buffer_memory_maxsize, 0,
743 sysctl_dtrace_buffer_memory_maxsize, "Q", "dtrace state buffer memory maxsize");
744
745 /*
746 * kern.dtrace.buffer_memory_inuse
747 *
748 * Current state buffer memory used, in bytes, by all the DTrace consumers.
749 * This value is read-only.
750 */
751 SYSCTL_QUAD(_kern_dtrace, OID_AUTO, buffer_memory_inuse, CTLFLAG_RD | CTLFLAG_LOCKED,
752 &dtrace_buffer_memory_inuse, "dtrace state buffer memory in-use");
753
754 static int
755 sysctl_dtrace_difo_maxsize SYSCTL_HANDLER_ARGS
756 {
757 #pragma unused(oidp, arg2, req)
758 int changed, error;
759 size_t value = *(size_t*) arg1;
760
761 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
762 if (error || !changed)
763 return (error);
764
765 if (value <= 0)
766 return (ERANGE);
767
768 lck_mtx_lock(&dtrace_lock);
769 dtrace_difo_maxsize = value;
770 lck_mtx_unlock(&dtrace_lock);
771
772 return (0);
773 }
774
775 /*
776 * kern.dtrace.difo_maxsize
777 *
778 * Set the DIFO max size in bytes, check the definition of dtrace_difo_maxsize
779 * to get the default value. Attempting to set a null or negative size will
780 * result in a failure.
781 */
782 SYSCTL_PROC(_kern_dtrace, OID_AUTO, difo_maxsize,
783 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
784 &dtrace_difo_maxsize, 0,
785 sysctl_dtrace_difo_maxsize, "Q", "dtrace difo maxsize");
786
787 static int
788 sysctl_dtrace_dof_maxsize SYSCTL_HANDLER_ARGS
789 {
790 #pragma unused(oidp, arg2, req)
791 int changed, error;
792 dtrace_optval_t value = *(dtrace_optval_t *) arg1;
793
794 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
795 if (error || !changed)
796 return (error);
797
798 if (value <= 0)
799 return (ERANGE);
800
801 if (value >= dtrace_copy_maxsize())
802 return (ERANGE);
803
804 lck_mtx_lock(&dtrace_lock);
805 dtrace_dof_maxsize = value;
806 lck_mtx_unlock(&dtrace_lock);
807
808 return (0);
809 }
810
/*
 * kern.dtrace.dof_maxsize
 *
 * Set the DOF max size in bytes, check the definition of dtrace_dof_maxsize to
 * get the default value. Attempting to set a null or negative size, or a size
 * greater than or equal to dtrace_copy_maxsize(), will result in a failure.
 */
818 SYSCTL_PROC(_kern_dtrace, OID_AUTO, dof_maxsize,
819 CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
820 &dtrace_dof_maxsize, 0,
821 sysctl_dtrace_dof_maxsize, "Q", "dtrace dof maxsize");
822
823 static int
824 sysctl_dtrace_statvar_maxsize SYSCTL_HANDLER_ARGS
825 {
826 #pragma unused(oidp, arg2, req)
827 int changed, error;
828 dtrace_optval_t value = *(dtrace_optval_t*) arg1;
829
830 error = sysctl_io_number(req, value, sizeof(value), &value, &changed);
831 if (error || !changed)
832 return (error);
833
834 if (value <= 0)
835 return (ERANGE);
836 if (value > dtrace_statvar_maxsize_max)
837 return (ERANGE);
838
839 lck_mtx_lock(&dtrace_lock);
840 dtrace_statvar_maxsize = value;
841 lck_mtx_unlock(&dtrace_lock);
842
843 return (0);
844 }
845
/*
 * kern.dtrace.global_maxsize
 *
 * Set the variable max size in bytes, check the definition of
 * dtrace_statvar_maxsize to get the default value. Attempting to set a zero
 * or negative size, or a size above dtrace_statvar_maxsize_max, will result
 * in a failure (ERANGE).
 */
SYSCTL_PROC(_kern_dtrace, OID_AUTO, global_maxsize,
	CTLTYPE_QUAD | CTLFLAG_RW | CTLFLAG_LOCKED,
	&dtrace_statvar_maxsize, 0,
	sysctl_dtrace_statvar_maxsize, "Q", "dtrace statvar maxsize");
857
858
/*
 * kern.dtrace.provide_private_probes
 *
 * Historically controlled whether providers must provide the private probes.
 * The private probes are now always provided; this read-only node is kept
 * only for compatibility.  It is registered with a NULL variable pointer and
 * a constant of 1 (presumably the value reported to readers -- confirm
 * against the SYSCTL_INT definition in <sys/sysctl.h>).
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, provide_private_probes,
	CTLFLAG_RD | CTLFLAG_LOCKED,
	(int *)NULL, 1, "provider must provide the private probes");
868
/*
 * kern.dtrace.dof_mode
 *
 * Returns the current DOF mode, as stored in dtrace_dof_mode.
 * This value is read-only (CTLFLAG_RD).
 */
SYSCTL_INT(_kern_dtrace, OID_AUTO, dof_mode, CTLFLAG_RD | CTLFLAG_LOCKED,
	&dtrace_dof_mode, 0, "dtrace dof mode");
877
878 /*
879 * DTrace Probe Context Functions
880 *
881 * These functions are called from probe context. Because probe context is
 * any context in which C may be called, arbitrary locks may be held,
883 * interrupts may be disabled, we may be in arbitrary dispatched state, etc.
884 * As a result, functions called from probe context may only call other DTrace
885 * support functions -- they may not interact at all with the system at large.
886 * (Note that the ASSERT macro is made probe-context safe by redefining it in
887 * terms of dtrace_assfail(), a probe-context safe function.) If arbitrary
888 * loads are to be performed from probe context, they _must_ be in terms of
889 * the safe dtrace_load*() variants.
890 *
891 * Some functions in this block are not actually called from probe context;
892 * for these functions, there will be a comment above the function reading
893 * "Note: not called from probe context."
894 */
895
/*
 * Assertion-failure handler used by the probe-context ASSERT macro.
 *
 * a - text of the failed assertion
 * f - name of the source file containing the assertion
 * l - line number of the assertion
 *
 * Panics with a message built from the three arguments.  The return
 * statement exists only so that the function has a value to yield.
 */
int
dtrace_assfail(const char *a, const char *f, int l)
{
	panic("dtrace: assertion failed: %s, file: %s, line: %d", a, f, l);

	/*
	 * We just need something here that even the most clever compiler
	 * cannot optimize away.
	 */
	return (a[(uintptr_t)f]);
}
907
/*
 * Atomically bump the error counter pointed to by 'counter' from probe
 * context.  The counter never reads as zero once an error has been counted:
 * if the increment would wrap to zero, 1 is stored instead.
 */
static void
dtrace_error(uint32_t *counter)
{
	/*
	 * Most probe-context counters are per-CPU.  The ones handled here
	 * are for error conditions arcane enough not to merit per-CPU
	 * storage; concurrent increments from several CPUs will hurt
	 * scalability, but these counters are not expected to be hot in a
	 * correctly constructed enabling.
	 */
	for (;;) {
		uint32_t seen = *counter;
		uint32_t bumped = seen + 1;

		/*
		 * On wrap-around, store 1 rather than 0 so that a non-zero
		 * counter always means "errors were seen".  (The counter is
		 * 32 bits wide because a 64-bit compare-and-swap is not
		 * guaranteed to be available.)  The likelihood of this is
		 * only slightly greater than that of predicate cache ID
		 * overflow, as discussed in dtrace_predicate_create().
		 */
		if (bumped == 0)
			bumped = 1;

		/* Retry if another CPU changed the counter underneath us. */
		if (dtrace_cas32(counter, seen, bumped) == seen)
			break;
	}
}
946
/*
 * Use the DTRACE_LOADFUNC macro to define functions for each of loading a
 * uint8_t, a uint16_t, a uint32_t and a uint64_t.  The argument is the width
 * in bits; the 8-bit instance, dtrace_load8(), is the load primitive used by
 * the string, compare and copy helpers later in this file.
 */
DTRACE_LOADFUNC(8)
DTRACE_LOADFUNC(16)
DTRACE_LOADFUNC(32)
DTRACE_LOADFUNC(64)
955
956 static int
dtrace_inscratch(uintptr_t dest,size_t size,dtrace_mstate_t * mstate)957 dtrace_inscratch(uintptr_t dest, size_t size, dtrace_mstate_t *mstate)
958 {
959 if (dest < mstate->dtms_scratch_base)
960 return (0);
961
962 if (dest + size < dest)
963 return (0);
964
965 if (dest + size > mstate->dtms_scratch_ptr)
966 return (0);
967
968 return (1);
969 }
970
971 static int
dtrace_canstore_statvar(uint64_t addr,size_t sz,size_t * remain,dtrace_statvar_t ** svars,int nsvars)972 dtrace_canstore_statvar(uint64_t addr, size_t sz, size_t *remain,
973 dtrace_statvar_t **svars, int nsvars)
974 {
975 int i;
976
977 size_t maxglobalsize, maxlocalsize;
978
979 maxglobalsize = dtrace_statvar_maxsize + sizeof (uint64_t);
980 maxlocalsize = (maxglobalsize) * NCPU;
981
982 if (nsvars == 0)
983 return (0);
984
985 for (i = 0; i < nsvars; i++) {
986 dtrace_statvar_t *svar = svars[i];
987 uint8_t scope;
988 size_t size;
989
990 if (svar == NULL || (size = svar->dtsv_size) == 0)
991 continue;
992
993 scope = svar->dtsv_var.dtdv_scope;
994
995 /**
996 * We verify that our size is valid in the spirit of providing
997 * defense in depth: we want to prevent attackers from using
998 * DTrace to escalate an orthogonal kernel heap corruption bug
999 * into the ability to store to arbitrary locations in memory.
1000 */
1001 VERIFY((scope == DIFV_SCOPE_GLOBAL && size <= maxglobalsize) ||
1002 (scope == DIFV_SCOPE_LOCAL && size <= maxlocalsize));
1003
1004 if (DTRACE_INRANGE(addr, sz, svar->dtsv_data, svar->dtsv_size)) {
1005 DTRACE_RANGE_REMAIN(remain, addr, svar->dtsv_data,
1006 svar->dtsv_size);
1007 return (1);
1008 }
1009 }
1010
1011 return (0);
1012 }
1013
/*
 * Check to see if the address is within a memory region to which a store may
 * be issued.  This includes the DTrace scratch areas, and any DTrace variable
 * region.  The caller of dtrace_canstore() is responsible for performing any
 * alignment checks that are needed before stores are actually executed.
 *
 * This is dtrace_canstore_remains() with no interest in the remaining size
 * (a NULL 'remain' is passed); the return value is forwarded unchanged.
 */
static int
dtrace_canstore(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
	dtrace_vstate_t *vstate)
{
	return (dtrace_canstore_remains(addr, sz, NULL, mstate, vstate));
}
1026 /*
1027 * Implementation of dtrace_canstore which communicates the upper bound of the
1028 * allowed memory region.
1029 */
1030 static int
dtrace_canstore_remains(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1031 dtrace_canstore_remains(uint64_t addr, size_t sz, size_t *remain,
1032 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1033 {
1034 /*
1035 * First, check to see if the address is in scratch space...
1036 */
1037 if (DTRACE_INRANGE(addr, sz, mstate->dtms_scratch_base,
1038 mstate->dtms_scratch_size)) {
1039 DTRACE_RANGE_REMAIN(remain, addr, mstate->dtms_scratch_base,
1040 mstate->dtms_scratch_size);
1041 return (1);
1042 }
1043 /*
1044 * Now check to see if it's a dynamic variable. This check will pick
1045 * up both thread-local variables and any global dynamically-allocated
1046 * variables.
1047 */
1048 if (DTRACE_INRANGE(addr, sz, (uintptr_t)vstate->dtvs_dynvars.dtds_base,
1049 vstate->dtvs_dynvars.dtds_size)) {
1050 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
1051 uintptr_t base = (uintptr_t)dstate->dtds_base +
1052 (dstate->dtds_hashsize * sizeof (dtrace_dynhash_t));
1053 uintptr_t chunkoffs;
1054 dtrace_dynvar_t *dvar;
1055
1056 /*
1057 * Before we assume that we can store here, we need to make
1058 * sure that it isn't in our metadata -- storing to our
1059 * dynamic variable metadata would corrupt our state. For
1060 * the range to not include any dynamic variable metadata,
1061 * it must:
1062 *
1063 * (1) Start above the hash table that is at the base of
1064 * the dynamic variable space
1065 *
1066 * (2) Have a starting chunk offset that is beyond the
1067 * dtrace_dynvar_t that is at the base of every chunk
1068 *
1069 * (3) Not span a chunk boundary
1070 *
1071 * (4) Not be in the tuple space of a dynamic variable
1072 *
1073 */
1074 if (addr < base)
1075 return (0);
1076
1077 chunkoffs = (addr - base) % dstate->dtds_chunksize;
1078
1079 if (chunkoffs < sizeof (dtrace_dynvar_t))
1080 return (0);
1081
1082 if (chunkoffs + sz > dstate->dtds_chunksize)
1083 return (0);
1084
1085 dvar = (dtrace_dynvar_t *)((uintptr_t)addr - chunkoffs);
1086
1087 if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE)
1088 return (0);
1089
1090 if (chunkoffs < sizeof (dtrace_dynvar_t) +
1091 ((dvar->dtdv_tuple.dtt_nkeys - 1) * sizeof (dtrace_key_t)))
1092 return (0);
1093
1094 return (1);
1095 }
1096
1097 /*
1098 * Finally, check the static local and global variables. These checks
1099 * take the longest, so we perform them last.
1100 */
1101 if (dtrace_canstore_statvar(addr, sz, remain,
1102 vstate->dtvs_locals, vstate->dtvs_nlocals))
1103 return (1);
1104
1105 if (dtrace_canstore_statvar(addr, sz, remain,
1106 vstate->dtvs_globals, vstate->dtvs_nglobals))
1107 return (1);
1108
1109 return (0);
1110 }
1111
1112
/*
 * Convenience routine to check to see if the address is within a memory
 * region in which a load may be issued given the user's privilege level;
 * if not, it sets the appropriate error flags and loads 'addr' into the
 * illegal value slot.
 *
 * DTrace subroutines (DIF_SUBR_*) should use this helper to implement
 * appropriate memory access protection.
 *
 * This is dtrace_canload_remains() with no interest in the remaining size
 * (a NULL 'remain' is passed); the return value is forwarded unchanged.
 */
int
dtrace_canload(uint64_t addr, size_t sz, dtrace_mstate_t *mstate,
	dtrace_vstate_t *vstate)
{
	return (dtrace_canload_remains(addr, sz, NULL, mstate, vstate));
}
1128
1129 /*
1130 * Implementation of dtrace_canload which communicates the upper bound of the
1131 * allowed memory region.
1132 */
1133 static int
dtrace_canload_remains(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1134 dtrace_canload_remains(uint64_t addr, size_t sz, size_t *remain,
1135 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1136 {
1137 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
1138
1139 /*
1140 * If we hold the privilege to read from kernel memory, then
1141 * everything is readable.
1142 */
1143 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1144 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1145 return (1);
1146 }
1147
1148 /*
1149 * You can obviously read that which you can store.
1150 */
1151 if (dtrace_canstore_remains(addr, sz, remain, mstate, vstate))
1152 return (1);
1153
1154 /*
1155 * We're allowed to read from our own string table.
1156 */
1157 if (DTRACE_INRANGE(addr, sz, (uintptr_t)mstate->dtms_difo->dtdo_strtab,
1158 mstate->dtms_difo->dtdo_strlen)) {
1159 DTRACE_RANGE_REMAIN(remain, addr,
1160 mstate->dtms_difo->dtdo_strtab,
1161 mstate->dtms_difo->dtdo_strlen);
1162 return (1);
1163 }
1164
1165 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
1166 *illval = addr;
1167 return (0);
1168 }
1169
1170 /*
1171 * Convenience routine to check to see if a given string is within a memory
1172 * region in which a load may be issued given the user's privilege level;
1173 * this exists so that we don't need to issue unnecessary dtrace_strlen()
1174 * calls in the event that the user has all privileges.
1175 */
1176 static int
dtrace_strcanload(uint64_t addr,size_t sz,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1177 dtrace_strcanload(uint64_t addr, size_t sz, size_t *remain,
1178 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1179 {
1180 size_t rsize = 0;
1181
1182 /*
1183 * If we hold the privilege to read from kernel memory, then
1184 * everything is readable.
1185 */
1186 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1187 DTRACE_RANGE_REMAIN(remain, addr, addr, sz);
1188 return (1);
1189 }
1190
1191 /*
1192 * Even if the caller is uninterested in querying the remaining valid
1193 * range, it is required to ensure that the access is allowed.
1194 */
1195 if (remain == NULL) {
1196 remain = &rsize;
1197 }
1198 if (dtrace_canload_remains(addr, 0, remain, mstate, vstate)) {
1199 size_t strsz;
1200 /*
1201 * Perform the strlen after determining the length of the
1202 * memory region which is accessible. This prevents timing
1203 * information from being used to find NULs in memory which is
1204 * not accessible to the caller.
1205 */
1206 strsz = 1 + dtrace_strlen((char *)(uintptr_t)addr,
1207 MIN(sz, *remain));
1208 if (strsz <= *remain) {
1209 return (1);
1210 }
1211 }
1212
1213 return (0);
1214 }
1215
1216 /*
1217 * Convenience routine to check to see if a given variable is within a memory
1218 * region in which a load may be issued given the user's privilege level.
1219 */
1220 static int
dtrace_vcanload(void * src,dtrace_diftype_t * type,size_t * remain,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate)1221 dtrace_vcanload(void *src, dtrace_diftype_t *type, size_t *remain,
1222 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
1223 {
1224 size_t sz;
1225 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1226
1227 /*
1228 * Calculate the max size before performing any checks since even
1229 * DTRACE_ACCESS_KERNEL-credentialed callers expect that this function
1230 * return the max length via 'remain'.
1231 */
1232 if (type->dtdt_kind == DIF_TYPE_STRING) {
1233 dtrace_state_t *state = vstate->dtvs_state;
1234
1235 if (state != NULL) {
1236 sz = state->dts_options[DTRACEOPT_STRSIZE];
1237 } else {
1238 /*
1239 * In helper context, we have a NULL state; fall back
1240 * to using the system-wide default for the string size
1241 * in this case.
1242 */
1243 sz = dtrace_strsize_default;
1244 }
1245 } else {
1246 sz = type->dtdt_size;
1247 }
1248
1249 /*
1250 * If we hold the privilege to read from kernel memory, then
1251 * everything is readable.
1252 */
1253 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0) {
1254 DTRACE_RANGE_REMAIN(remain, (uintptr_t)src, src, sz);
1255 return (1);
1256 }
1257
1258 if (type->dtdt_kind == DIF_TYPE_STRING) {
1259 return (dtrace_strcanload((uintptr_t)src, sz, remain, mstate,
1260 vstate));
1261 }
1262 return (dtrace_canload_remains((uintptr_t)src, sz, remain, mstate,
1263 vstate));
1264 }
1265
/*
 * ASCII-only character classification helpers used by dtrace_strtoll().
 *
 * Every macro here refers to its argument more than once, so an argument
 * with side effects (or an expensive one) is evaluated repeatedly.
 * dtrace_strtoll() passes an assignment containing a dtrace_load8() call to
 * isxdigit(), which therefore may perform that load several times.
 */
#define	isdigit(ch)	((ch) >= '0' && (ch) <= '9')
#define	islower(ch)	((ch) >= 'a' && (ch) <= 'z')
#define	isspace(ch)	(((ch) == ' ') || ((ch) == '\r') || ((ch) == '\n') || \
			((ch) == '\t') || ((ch) == '\f'))
#define	isxdigit(ch)	(isdigit(ch) || ((ch) >= 'a' && (ch) <= 'f') || \
			((ch) >= 'A' && (ch) <= 'F'))
/* Letter or decimal digit. */
#define	lisalnum(x)	\
	(isdigit(x) || ((x) >= 'a' && (x) <= 'z') || ((x) >= 'A' && (x) <= 'Z'))

/*
 * Numeric value of an alphanumeric character: '0'-'9' map to 0-9, and
 * letters of either case map to 10 upward ('a'/'A' is 10).  The result is
 * only meaningful when lisalnum(x) holds.
 */
#define	DIGIT(x)	\
	(isdigit(x) ? (x) - '0' : islower(x) ? (x) + 10 - 'a' : (x) + 10 - 'A')
1277
/*
 * Convert a string to a signed integer using safe loads.
 *
 * input - address of the text; every byte is fetched with dtrace_load8()
 * base  - numeric base; characters are accepted while their DIGIT() value
 *         is below it
 * limit - number of bytes, counted from 'input', that the digit loop may
 *         consume
 *
 * Returns the accumulated value, negated if a leading '-' was seen.  No
 * overflow detection is performed on the int64_t accumulator.
 *
 * NOTE(review): only the digit loop compares the position against
 * input + limit.  The whitespace skip, the load after a sign, and the
 * look-ahead loads for a "0x" prefix are not bounded by 'limit'.
 */
static int64_t
dtrace_strtoll(char *input, int base, size_t limit)
{
	uintptr_t pos = (uintptr_t)input;
	int64_t val = 0;
	int x;
	boolean_t neg = B_FALSE;
	char c, cc, ccc;
	uintptr_t end = pos + limit;

	/*
	 * Consume any whitespace preceding digits.
	 */
	while ((c = dtrace_load8(pos)) == ' ' || c == '\t')
		pos++;

	/*
	 * Handle an explicit sign if one is present.
	 */
	if (c == '-' || c == '+') {
		if (c == '-')
			neg = B_TRUE;
		c = dtrace_load8(++pos);
	}

	/*
	 * Check for an explicit hexadecimal prefix ("0x" or "0X") and skip it
	 * if present.  The prefix is only skipped when a hexadecimal digit
	 * follows it; otherwise the leading '0' is parsed as a digit.
	 */
	if (base == 16 && c == '0' && ((cc = dtrace_load8(pos + 1)) == 'x' ||
	    cc == 'X') && isxdigit(ccc = dtrace_load8(pos + 2))) {
		pos += 2;
		c = ccc;
	}

	/*
	 * Read in contiguous digits until the first non-digit character.
	 */
	for (; pos < end && c != '\0' && lisalnum(c) && (x = DIGIT(c)) < base;
	    c = dtrace_load8(++pos))
		val = val * base + x;

	return (neg ? -val : val);
}
1325
1326
1327 /*
1328 * Compare two strings using safe loads.
1329 */
1330 static int
dtrace_strncmp(const char * s1,const char * s2,size_t limit)1331 dtrace_strncmp(const char *s1, const char *s2, size_t limit)
1332 {
1333 uint8_t c1, c2;
1334 volatile uint16_t *flags;
1335
1336 if (s1 == s2 || limit == 0)
1337 return (0);
1338
1339 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1340
1341 do {
1342 if (s1 == NULL) {
1343 c1 = '\0';
1344 } else {
1345 c1 = dtrace_load8((uintptr_t)s1++);
1346 }
1347
1348 if (s2 == NULL) {
1349 c2 = '\0';
1350 } else {
1351 c2 = dtrace_load8((uintptr_t)s2++);
1352 }
1353
1354 if (c1 != c2)
1355 return (c1 - c2);
1356 } while (--limit && c1 != '\0' && !(*flags & CPU_DTRACE_FAULT));
1357
1358 return (0);
1359 }
1360
/*
 * Compute strlen(s) for a string using safe memory accesses.  The additional
 * lim parameter is used to specify a maximum length to ensure completion.
 *
 * s   - address of the string; each byte is fetched with dtrace_load8()
 * lim - maximum number of bytes to examine
 *
 * Returns the number of bytes preceding the first NUL, or lim if no NUL was
 * found within the first lim bytes.
 */
static size_t
dtrace_strlen(const char *s, size_t lim)
{
	/*
	 * The counter has the same type as the limit and the return value;
	 * a narrower counter would wrap before reaching a limit larger than
	 * its range and the loop would no longer be bounded by lim.
	 */
	size_t len;

	for (len = 0; len != lim; len++) {
		if (dtrace_load8((uintptr_t)s++) == '\0')
			break;
	}

	return (len);
}
1377
1378 /*
1379 * Check if an address falls within a toxic region.
1380 */
1381 static int
dtrace_istoxic(uintptr_t kaddr,size_t size)1382 dtrace_istoxic(uintptr_t kaddr, size_t size)
1383 {
1384 uintptr_t taddr, tsize;
1385 int i;
1386
1387 for (i = 0; i < dtrace_toxranges; i++) {
1388 taddr = dtrace_toxrange[i].dtt_base;
1389 tsize = dtrace_toxrange[i].dtt_limit - taddr;
1390
1391 if (kaddr - taddr < tsize) {
1392 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1393 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = kaddr;
1394 return (1);
1395 }
1396
1397 if (taddr - kaddr < size) {
1398 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
1399 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = taddr;
1400 return (1);
1401 }
1402 }
1403
1404 return (0);
1405 }
1406
/*
 * Copy 'len' bytes from src to dst.  The source is untrusted memory named
 * by the DIF program and is read with safe loads; the destination is
 * DTrace-managed memory and is written directly.  Overlapping regions are
 * handled as with standard bcopy: the copy runs forward when dst is at or
 * below src, and backward otherwise.
 */
static void
dtrace_bcopy(const void *src, void *dst, size_t len)
{
	uint8_t *out = dst;
	const uint8_t *in = src;

	if (len == 0)
		return;

	if (out <= in) {
		/* Forward copy. */
		for (; len != 0; len--)
			*out++ = dtrace_load8((uintptr_t)in++);
		return;
	}

	/* Backward copy, starting one past the last byte of each region. */
	in += len;
	out += len;

	for (; len != 0; len--)
		*--out = dtrace_load8((uintptr_t)--in);
}
1434
/*
 * Copy bytes from src to dst until either 'len' bytes have been copied or a
 * NUL byte has been copied, whichever comes first.  The source is untrusted
 * memory named by the DIF program and is read with safe loads; the
 * destination is DTrace-managed memory and is written directly.  Unlike
 * dtrace_bcopy(), overlapping regions are not handled.
 */
static void
dtrace_strcpy(const void *src, void *dst, size_t len)
{
	uint8_t *out = dst;
	const uint8_t *in = src;

	for (; len != 0; len--) {
		uint8_t c = dtrace_load8((uintptr_t)in++);

		*out++ = c;

		/* The terminator itself is copied before stopping. */
		if (c == '\0')
			break;
	}
}
1454
1455 /*
1456 * Copy src to dst, deriving the size and type from the specified (BYREF)
1457 * variable type. The src is assumed to be unsafe memory specified by the DIF
1458 * program. The dst is assumed to be DTrace variable memory that is of the
1459 * specified type; we assume that we can store to directly.
1460 */
1461 static void
dtrace_vcopy(void * src,void * dst,dtrace_diftype_t * type,size_t limit)1462 dtrace_vcopy(void *src, void *dst, dtrace_diftype_t *type, size_t limit)
1463 {
1464 ASSERT(type->dtdt_flags & DIF_TF_BYREF);
1465
1466 if (type->dtdt_kind == DIF_TYPE_STRING) {
1467 dtrace_strcpy(src, dst, MIN(type->dtdt_size, limit));
1468 } else {
1469 dtrace_bcopy(src, dst, MIN(type->dtdt_size, limit));
1470 }
1471 }
1472
1473 /*
1474 * Compare s1 to s2 using safe memory accesses. The s1 data is assumed to be
1475 * unsafe memory specified by the DIF program. The s2 data is assumed to be
1476 * safe memory that we can access directly because it is managed by DTrace.
1477 */
1478 static int
dtrace_bcmp(const void * s1,const void * s2,size_t len)1479 dtrace_bcmp(const void *s1, const void *s2, size_t len)
1480 {
1481 volatile uint16_t *flags;
1482
1483 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
1484
1485 if (s1 == s2)
1486 return (0);
1487
1488 if (s1 == NULL || s2 == NULL)
1489 return (1);
1490
1491 if (s1 != s2 && len != 0) {
1492 const uint8_t *ps1 = s1;
1493 const uint8_t *ps2 = s2;
1494
1495 do {
1496 if (dtrace_load8((uintptr_t)ps1++) != *ps2++)
1497 return (1);
1498 } while (--len != 0 && !(*flags & CPU_DTRACE_FAULT));
1499 }
1500 return (0);
1501 }
1502
1503 /*
1504 * Zero the specified region using a simple byte-by-byte loop. Note that this
1505 * is for safe DTrace-managed memory only.
1506 */
1507 static void
dtrace_bzero(void * dst,size_t len)1508 dtrace_bzero(void *dst, size_t len)
1509 {
1510 uchar_t *cp;
1511
1512 for (cp = dst; len != 0; len--)
1513 *cp++ = 0;
1514 }
1515
/*
 * Add two 128-bit values, each stored as two 64-bit words with the low word
 * at index 0, and store the result in 'sum'.  Both inputs are read in full
 * before 'sum' is written, so 'sum' may alias either addend.
 */
static void
dtrace_add_128(uint64_t *addend1, uint64_t *addend2, uint64_t *sum)
{
	const uint64_t lo = addend1[0] + addend2[0];
	/* The low word wrapped if it ended up below either input. */
	const uint64_t carry = (lo < addend1[0] || lo < addend2[0]) ? 1 : 0;
	const uint64_t hi = addend1[1] + addend2[1] + carry;

	sum[0] = lo;
	sum[1] = hi;
}
1528
/*
 * Shift the 128-bit value in a by b.  If b is positive, shift left.
 * If b is negative, shift right.
 *
 * a - the value, stored as two 64-bit words with the low word at a[0] and
 *     the high word at a[1]; modified in place
 * b - signed shift count; a magnitude of 128 or more clears the value
 *
 * Right shifts are logical (zero-filling).
 */
static void
dtrace_shift_128(uint64_t *a, int b)
{
	if (b == 0)
		return;

	/*
	 * Every bit is shifted out.  Handling this first also keeps the
	 * negation below from overflowing and keeps every per-word shift
	 * count under 64.
	 */
	if (b <= -128 || b >= 128) {
		a[0] = 0;
		a[1] = 0;
		return;
	}

	if (b < 0) {
		b = -b;
		if (b >= 64) {
			a[0] = a[1] >> (b - 64);
			a[1] = 0;
		} else {
			/*
			 * The low b bits of the high word move into the top
			 * of the low word.  The left shift by (64 - b)
			 * already discards the bits that stay behind, so no
			 * mask is needed.
			 */
			a[0] = (a[0] >> b) | (a[1] << (64 - b));
			a[1] >>= b;
		}
	} else {
		if (b >= 64) {
			a[1] = a[0] << (b - 64);
			a[0] = 0;
		} else {
			/* The top b bits of the low word carry into the high word. */
			a[1] = (a[1] << b) | (a[0] >> (64 - b));
			a[0] <<= b;
		}
	}
}
1565
1566 /*
1567 * The basic idea is to break the 2 64-bit values into 4 32-bit values,
1568 * use native multiplication on those, and then re-combine into the
1569 * resulting 128-bit value.
1570 *
1571 * (hi1 << 32 + lo1) * (hi2 << 32 + lo2) =
1572 * hi1 * hi2 << 64 +
1573 * hi1 * lo2 << 32 +
1574 * hi2 * lo1 << 32 +
1575 * lo1 * lo2
1576 */
1577 static void
dtrace_multiply_128(uint64_t factor1,uint64_t factor2,uint64_t * product)1578 dtrace_multiply_128(uint64_t factor1, uint64_t factor2, uint64_t *product)
1579 {
1580 uint64_t hi1, hi2, lo1, lo2;
1581 uint64_t tmp[2];
1582
1583 hi1 = factor1 >> 32;
1584 hi2 = factor2 >> 32;
1585
1586 lo1 = factor1 & DT_MASK_LO;
1587 lo2 = factor2 & DT_MASK_LO;
1588
1589 product[0] = lo1 * lo2;
1590 product[1] = hi1 * hi2;
1591
1592 tmp[0] = hi1 * lo2;
1593 tmp[1] = 0;
1594 dtrace_shift_128(tmp, 32);
1595 dtrace_add_128(product, tmp, product);
1596
1597 tmp[0] = hi2 * lo1;
1598 tmp[1] = 0;
1599 dtrace_shift_128(tmp, 32);
1600 dtrace_add_128(product, tmp, product);
1601 }
1602
1603 /*
1604 * This privilege check should be used by actions and subroutines to
1605 * verify that the user credentials of the process that enabled the
1606 * invoking ECB match the target credentials
1607 */
1608 static int
dtrace_priv_proc_common_user(dtrace_state_t * state)1609 dtrace_priv_proc_common_user(dtrace_state_t *state)
1610 {
1611 cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
1612
1613 /*
1614 * We should always have a non-NULL state cred here, since if cred
1615 * is null (anonymous tracing), we fast-path bypass this routine.
1616 */
1617 ASSERT(s_cr != NULL);
1618
1619 if ((cr = dtrace_CRED()) != NULL &&
1620 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_uid &&
1621 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_ruid &&
1622 posix_cred_get(s_cr)->cr_uid == posix_cred_get(cr)->cr_suid &&
1623 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_gid &&
1624 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_rgid &&
1625 posix_cred_get(s_cr)->cr_gid == posix_cred_get(cr)->cr_sgid)
1626 return (1);
1627
1628 return (0);
1629 }
1630
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the zone of the process that enabled the invoking ECB
 * matches the target credentials.
 *
 * On Darwin there are no zones, so after asserting that the state carries a
 * credential this always returns 1.
 */
static int
dtrace_priv_proc_common_zone(dtrace_state_t *state)
{
	cred_t *cr, *s_cr = state->dts_cred.dcr_cred;
#pragma unused(cr, s_cr, state) /* __APPLE__ */

	/*
	 * We should always have a non-NULL state cred here, since if cred
	 * is null (anonymous tracing), we fast-path bypass this routine.
	 */
	ASSERT(s_cr != NULL);

	return 1; /* APPLE NOTE: Darwin doesn't do zones. */
}
1650
/*
 * This privilege check should be used by actions and subroutines to
 * verify that the process has not setuid or changed credentials.
 *
 * Darwin has no "No Core Dump" flag to consult, so this always returns 1.
 */
static int
dtrace_priv_proc_common_nocd(void)
{
	return 1; /* Darwin omits "No Core Dump" flag. */
}
1660
1661 static int
dtrace_priv_proc_destructive(dtrace_state_t * state)1662 dtrace_priv_proc_destructive(dtrace_state_t *state)
1663 {
1664 int action = state->dts_cred.dcr_action;
1665
1666 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1667 goto bad;
1668
1669 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1670 goto bad;
1671
1672 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE) == 0) &&
1673 dtrace_priv_proc_common_zone(state) == 0)
1674 goto bad;
1675
1676 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER) == 0) &&
1677 dtrace_priv_proc_common_user(state) == 0)
1678 goto bad;
1679
1680 if (((action & DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG) == 0) &&
1681 dtrace_priv_proc_common_nocd() == 0)
1682 goto bad;
1683
1684 return (1);
1685
1686 bad:
1687 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1688
1689 return (0);
1690 }
1691
1692 static int
dtrace_priv_proc_control(dtrace_state_t * state)1693 dtrace_priv_proc_control(dtrace_state_t *state)
1694 {
1695 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1696 goto bad;
1697
1698 if (dtrace_is_restricted() && !dtrace_can_attach_to_proc(current_proc()))
1699 goto bad;
1700
1701 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC_CONTROL)
1702 return (1);
1703
1704 if (dtrace_priv_proc_common_zone(state) &&
1705 dtrace_priv_proc_common_user(state) &&
1706 dtrace_priv_proc_common_nocd())
1707 return (1);
1708
1709 bad:
1710 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1711
1712 return (0);
1713 }
1714
1715 static int
dtrace_priv_proc(dtrace_state_t * state)1716 dtrace_priv_proc(dtrace_state_t *state)
1717 {
1718 if (ISSET(current_proc()->p_lflag, P_LNOATTACH))
1719 goto bad;
1720
1721 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed() && !dtrace_can_attach_to_proc(current_proc()))
1722 goto bad;
1723
1724 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1725 return (1);
1726
1727 bad:
1728 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1729
1730 return (0);
1731 }
1732
1733 /*
1734 * The P_LNOATTACH check is an Apple specific check.
1735 * We need a version of dtrace_priv_proc() that omits
1736 * that check for PID and EXECNAME accesses
1737 */
1738 static int
dtrace_priv_proc_relaxed(dtrace_state_t * state)1739 dtrace_priv_proc_relaxed(dtrace_state_t *state)
1740 {
1741
1742 if (state->dts_cred.dcr_action & DTRACE_CRA_PROC)
1743 return (1);
1744
1745 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_UPRIV;
1746
1747 return (0);
1748 }
1749
1750 static int
dtrace_priv_kernel(dtrace_state_t * state)1751 dtrace_priv_kernel(dtrace_state_t *state)
1752 {
1753 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed())
1754 goto bad;
1755
1756 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL)
1757 return (1);
1758
1759 bad:
1760 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1761
1762 return (0);
1763 }
1764
1765 static int
dtrace_priv_kernel_destructive(dtrace_state_t * state)1766 dtrace_priv_kernel_destructive(dtrace_state_t *state)
1767 {
1768 if (dtrace_is_restricted())
1769 goto bad;
1770
1771 if (state->dts_cred.dcr_action & DTRACE_CRA_KERNEL_DESTRUCTIVE)
1772 return (1);
1773
1774 bad:
1775 cpu_core[CPU->cpu_id].cpuc_dtrace_flags |= CPU_DTRACE_KPRIV;
1776
1777 return (0);
1778 }
1779
/*
 * Note:  not called from probe context.  This function is called
 * asynchronously (and at a regular interval) from outside of probe context to
 * clean the dirty dynamic variable lists on all CPUs.  Dynamic variable
 * cleaning is explained in detail in <sys/dtrace_impl.h>.
 *
 * dstate - the dynamic variable state whose per-CPU lists are to be cleaned
 *
 * The work proceeds in three phases separated by dtrace_sync() calls:
 * each eligible CPU's dirty list is moved to its rinsing list, then each
 * rinsing list is moved to the clean list, and finally the state is marked
 * DTRACE_DSTATE_CLEAN.  If no CPU had an eligible dirty list, the function
 * returns after the first phase without changing the state.
 */
static void
dtrace_dynvar_clean(dtrace_dstate_t *dstate)
{
	dtrace_dynvar_t *dirty;
	int work = 0;

	/* Phase 1: set aside the dirty list of every eligible CPU. */
	zpercpu_foreach(dcpu, dstate->dtds_percpu) {
		ASSERT(dcpu->dtdsc_rinsing == NULL);

		/*
		 * If the dirty list is NULL, there is no dirty work to do.
		 */
		if (dcpu->dtdsc_dirty == NULL)
			continue;

		/*
		 * If the clean list is non-NULL, then we're not going to do
		 * any work for this CPU -- it means that there has not been
		 * a dtrace_dynvar() allocation on this CPU (or from this CPU)
		 * since the last time we cleaned house.
		 */
		if (dcpu->dtdsc_clean != NULL)
			continue;

		work = 1;

		/*
		 * Atomically move the dirty list aside.
		 */
		do {
			dirty = dcpu->dtdsc_dirty;

			/*
			 * Before we zap the dirty list, set the rinsing list.
			 * (This allows for a potential assertion in
			 * dtrace_dynvar():  if a free dynamic variable appears
			 * on a hash chain, either the dirty list or the
			 * rinsing list for some CPU must be non-NULL.)
			 */
			dcpu->dtdsc_rinsing = dirty;
			dtrace_membar_producer();
		} while (dtrace_casptr(&dcpu->dtdsc_dirty,
		    dirty, NULL) != dirty);
	}

	if (!work) {
		/*
		 * We have no work to do; we can simply return.
		 */
		return;
	}

	dtrace_sync();

	/* Phase 2: promote every rinsing list to the clean list. */
	zpercpu_foreach(dcpu, dstate->dtds_percpu) {
		if (dcpu->dtdsc_rinsing == NULL)
			continue;

		/*
		 * We are now guaranteed that no hash chain contains a pointer
		 * into this dirty list; we can make it clean.
		 */
		ASSERT(dcpu->dtdsc_clean == NULL);
		dcpu->dtdsc_clean = dcpu->dtdsc_rinsing;
		dcpu->dtdsc_rinsing = NULL;
	}

	/*
	 * Before we actually set the state to be DTRACE_DSTATE_CLEAN, make
	 * sure that all CPUs have seen all of the dtdsc_clean pointers.
	 * This prevents a race whereby a CPU incorrectly decides that
	 * the state should be something other than DTRACE_DSTATE_CLEAN
	 * after dtrace_dynvar_clean() has completed.
	 */
	dtrace_sync();

	/* Phase 3: publish the new state. */
	dstate->dtds_state = DTRACE_DSTATE_CLEAN;
}
1864
/*
 * Depending on the value of the op parameter, this function looks-up,
 * allocates or deallocates an arbitrarily-keyed dynamic variable.  If an
 * allocation is requested, this function will return a pointer to a
 * dtrace_dynvar_t corresponding to the allocated variable -- or NULL if no
 * variable can be allocated.  If NULL is returned because an allocation
 * failed, the appropriate drop counter will be incremented.
 *
 * Parameters:
 *
 *   dstate	Dynamic variable state.  Supplies the hash table (dtds_hash,
 *		dtds_hashsize), the chunk size (dtds_chunksize), the global
 *		state word (dtds_state) and the per-CPU free, clean, dirty
 *		and rinsing lists (dtds_percpu).
 *
 *   nkeys	Number of entries in key[]; must be non-zero.
 *
 *   key	The tuple.  An entry whose dttk_size is zero is hashed and
 *		compared by value (dttk_value); otherwise dttk_value is
 *		treated as the address of dttk_size bytes of key data that
 *		are hashed, compared and (on allocation) copied.
 *
 *   dsize	Size of the data to be stored in the variable.  It is only
 *		used here to check that the variable fits into one chunk.
 *
 *   op		DTRACE_DYNVAR_ALLOC, DTRACE_DYNVAR_NOALLOC or
 *		DTRACE_DYNVAR_DEALLOC.
 *
 *   mstate,
 *   vstate	Handed to dtrace_canload() to validate by-reference key data
 *		before it is read.
 *
 * Return value:
 *
 *   For ALLOC and NOALLOC, a pointer to the matching variable if one is
 *   found on the hash chain.  For ALLOC with no match, a pointer to a newly
 *   allocated variable that has been inserted at the head of the chain.
 *   NULL in every other case:  a NOALLOC miss, any DEALLOC (whether or not
 *   a variable was found), CPU_DTRACE_FAULT being set after hashing, or a
 *   failure to obtain a chunk.
 */
static dtrace_dynvar_t *
dtrace_dynvar(dtrace_dstate_t *dstate, uint_t nkeys,
    dtrace_key_t *key, size_t dsize, dtrace_dynvar_op_t op,
    dtrace_mstate_t *mstate, dtrace_vstate_t *vstate)
{
	uint64_t hashval = DTRACE_DYNHASH_VALID;
	dtrace_dynhash_t *hash = dstate->dtds_hash;
	dtrace_dynvar_t *free, *new_free, *next, *dvar, *start, *prev = NULL;
	processorid_t me = CPU->cpu_id, cpu = me;
	dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, me);
	size_t bucket, ksize;
	size_t chunksize = dstate->dtds_chunksize;
	uintptr_t kdata, lock, nstate;
	uint_t i;

	ASSERT(nkeys != 0);

	/*
	 * Hash the key.  As with aggregations, we use Jenkins' "One-at-a-time"
	 * algorithm.  For the by-value portions, we perform the algorithm in
	 * 16-bit chunks (as opposed to 8-bit chunks).  This speeds things up a
	 * bit, and seems to have only a minute effect on distribution.  For
	 * the by-reference data, we perform "One-at-a-time" iterating (safely)
	 * over each referenced byte.  It's painful to do this, but it's much
	 * better than pathological hash distribution.  The efficacy of the
	 * hashing algorithm (and a comparison with other algorithms) may be
	 * found by running the ::dtrace_dynstat MDB dcmd.
	 */
	for (i = 0; i < nkeys; i++) {
		if (key[i].dttk_size == 0) {
			uint64_t val = key[i].dttk_value;

			hashval += (val >> 48) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 32) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += (val >> 16) & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);

			hashval += val & 0xffff;
			hashval += (hashval << 10);
			hashval ^= (hashval >> 6);
		} else {
			/*
			 * This is incredibly painful, but it beats the hell
			 * out of the alternative.
			 */
			uint64_t j, size = key[i].dttk_size;
			uintptr_t base = (uintptr_t)key[i].dttk_value;

			/*
			 * If the key data may not be loaded, stop hashing.
			 * (Presumably dtrace_canload() flags the fault that
			 * is tested for immediately after this loop --
			 * confirm against its definition.)
			 */
			if (!dtrace_canload(base, size, mstate, vstate))
				break;

			for (j = 0; j < size; j++) {
				hashval += dtrace_load8(base + j);
				hashval += (hashval << 10);
				hashval ^= (hashval >> 6);
			}
		}
	}

	/*
	 * If a fault was flagged while reading the key, give up before
	 * touching the hash table.
	 */
	if (DTRACE_CPUFLAG_ISSET(CPU_DTRACE_FAULT))
		return (NULL);

	/*
	 * Final mixing step of the "One-at-a-time" hash.
	 */
	hashval += (hashval << 3);
	hashval ^= (hashval >> 11);
	hashval += (hashval << 15);

	/*
	 * There is a remote chance (ideally, 1 in 2^31) that our hashval
	 * comes out to be one of our two sentinel hash values.  If this
	 * actually happens, we set the hashval to be a value known to be a
	 * non-sentinel value.
	 */
	if (hashval == DTRACE_DYNHASH_FREE || hashval == DTRACE_DYNHASH_SINK)
		hashval = DTRACE_DYNHASH_VALID;

	/*
	 * Yes, it's painful to do a divide here.  If the cycle count becomes
	 * important here, tricks can be pulled to reduce it.  (However, it's
	 * critical that hash collisions be kept to an absolute minimum;
	 * they're much more painful than a divide.)  It's better to have a
	 * solution that generates few collisions and still keeps things
	 * relatively simple.
	 */
	bucket = hashval % dstate->dtds_hashsize;

	if (op == DTRACE_DYNVAR_DEALLOC) {
		volatile uintptr_t *lockp = &hash[bucket].dtdh_lock;

		/*
		 * Deallocations serialize on the bucket's lock word:  spin
		 * until its low bit is clear, then atomically increment it so
		 * that the low bit is set for as long as we hold the bucket.
		 * The word is incremented again (clearing the low bit) on
		 * both dealloc exit paths below.
		 */
		for (;;) {
			while ((lock = *lockp) & 1)
				continue;

			if (dtrace_casptr((void *)(uintptr_t)lockp,
			    (void *)lock, (void *)(lock + 1)) == (void *)lock)
				break;
		}

		dtrace_membar_producer();
	}

top:
	/*
	 * (Re)start the chain traversal.  The lock word is sampled before the
	 * chain head is read so that a lookup that misses can later compare
	 * it against the current value to detect a change.
	 */
	prev = NULL;
	lock = hash[bucket].dtdh_lock;

	dtrace_membar_consumer();

	start = hash[bucket].dtdh_chain;
	ASSERT(start != NULL && (start->dtdv_hashval == DTRACE_DYNHASH_SINK ||
	    start->dtdv_hashval != DTRACE_DYNHASH_FREE ||
	    op != DTRACE_DYNVAR_DEALLOC));

	for (dvar = start; dvar != NULL; dvar = dvar->dtdv_next) {
		dtrace_tuple_t *dtuple = &dvar->dtdv_tuple;
		dtrace_key_t *dkey = &dtuple->dtt_key[0];

		if (dvar->dtdv_hashval != hashval) {
			if (dvar->dtdv_hashval == DTRACE_DYNHASH_SINK) {
				/*
				 * We've reached the sink, and therefore the
				 * end of the hash chain; we can kick out of
				 * the loop knowing that we have seen a valid
				 * snapshot of state.
				 */
				ASSERT(dvar->dtdv_next == NULL);
				ASSERT(dvar == &dtrace_dynhash_sink);
				break;
			}

			if (dvar->dtdv_hashval == DTRACE_DYNHASH_FREE) {
				/*
				 * We've gone off the rails:  somewhere along
				 * the line, one of the members of this hash
				 * chain was deleted.  Note that we could also
				 * detect this by simply letting this loop run
				 * to completion, as we would eventually hit
				 * the end of the dirty list.  However, we
				 * want to avoid running the length of the
				 * dirty list unnecessarily (it might be quite
				 * long), so we catch this as early as
				 * possible by detecting the hash marker.  In
				 * this case, we simply set dvar to NULL and
				 * break; the conditional after the loop will
				 * send us back to top.
				 */
				dvar = NULL;
				break;
			}

			goto next;
		}

		/*
		 * The hash values agree; now compare the tuples themselves.
		 */
		if (dtuple->dtt_nkeys != nkeys)
			goto next;

		for (i = 0; i < nkeys; i++, dkey++) {
			if (dkey->dttk_size != key[i].dttk_size)
				goto next; /* size or type mismatch */

			if (dkey->dttk_size != 0) {
				if (dtrace_bcmp(
				    (void *)(uintptr_t)key[i].dttk_value,
				    (void *)(uintptr_t)dkey->dttk_value,
				    dkey->dttk_size))
					goto next;
			} else {
				if (dkey->dttk_value != key[i].dttk_value)
					goto next;
			}
		}

		/*
		 * We have a match.  Lookups and allocations simply return it;
		 * only a deallocation has more work to do.
		 */
		if (op != DTRACE_DYNVAR_DEALLOC)
			return (dvar);

		ASSERT(dvar->dtdv_next == NULL ||
		    dvar->dtdv_next->dtdv_hashval != DTRACE_DYNHASH_FREE);

		/*
		 * Unlink the variable from the chain.  An interior element is
		 * unlinked with a plain store through its predecessor; the
		 * head of the chain must be swung atomically.
		 */
		if (prev != NULL) {
			ASSERT(hash[bucket].dtdh_chain != dvar);
			ASSERT(start != dvar);
			ASSERT(prev->dtdv_next == dvar);
			prev->dtdv_next = dvar->dtdv_next;
		} else {
			if (dtrace_casptr(&hash[bucket].dtdh_chain,
			    start, dvar->dtdv_next) != start) {
				/*
				 * We have failed to atomically swing the
				 * hash table head pointer, presumably because
				 * of a conflicting allocation on another CPU.
				 * We need to reread the hash chain and try
				 * again.
				 */
				goto top;
			}
		}

		dtrace_membar_producer();

		/*
		 * Now set the hash value to indicate that it's free.
		 */
		ASSERT(hash[bucket].dtdh_chain != dvar);
		dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

		dtrace_membar_producer();

		/*
		 * Set the next pointer to point at the dirty list, and
		 * atomically swing the dirty pointer to the newly freed dvar.
		 */
		do {
			next = dcpu->dtdsc_dirty;
			dvar->dtdv_next = next;
		} while (dtrace_casptr(&dcpu->dtdsc_dirty, next, dvar) != next);

		/*
		 * Finally, unlock this hash bucket.
		 */
		ASSERT(hash[bucket].dtdh_lock == lock);
		ASSERT(lock & 1);
		hash[bucket].dtdh_lock++;

		return (NULL);
next:
		/*
		 * Not a match:  remember this element as the predecessor of
		 * the next one and keep walking.
		 */
		prev = dvar;
		continue;
	}

	if (dvar == NULL) {
		/*
		 * If dvar is NULL, it is because we went off the rails:
		 * one of the elements that we traversed in the hash chain
		 * was deleted while we were traversing it.  In this case,
		 * we assert that we aren't doing a dealloc (deallocs lock
		 * the hash bucket to prevent themselves from racing with
		 * one another), and retry the hash chain traversal.
		 */
		ASSERT(op != DTRACE_DYNVAR_DEALLOC);
		goto top;
	}

	if (op != DTRACE_DYNVAR_ALLOC) {
		/*
		 * If we are not to allocate a new variable, we want to
		 * return NULL now.  Before we return, check that the value
		 * of the lock word hasn't changed.  If it has, we may have
		 * seen an inconsistent snapshot.
		 */
		if (op == DTRACE_DYNVAR_NOALLOC) {
			if (hash[bucket].dtdh_lock != lock)
				goto top;
		} else {
			/*
			 * A deallocation that found nothing:  just release
			 * the bucket lock taken above.
			 */
			ASSERT(op == DTRACE_DYNVAR_DEALLOC);
			ASSERT(hash[bucket].dtdh_lock == lock);
			ASSERT(lock & 1);
			hash[bucket].dtdh_lock++;
		}

		return (NULL);
	}

	/*
	 * We need to allocate a new dynamic variable.  The size we need is the
	 * size of dtrace_dynvar plus the size of nkeys dtrace_key_t's plus the
	 * size of any auxiliary key data (rounded up to 8-byte alignment) plus
	 * the size of any referred-to data (dsize).  We then round the final
	 * size up to the chunksize for allocation.
	 */
	for (ksize = 0, i = 0; i < nkeys; i++)
		ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));

	/*
	 * This should be pretty much impossible, but could happen if, say,
	 * strange DIF specified the tuple.  Ideally, this should be an
	 * assertion and not an error condition -- but that requires that the
	 * chunksize calculation in dtrace_difo_chunksize() be absolutely
	 * bullet-proof.  (That is, it must not be able to be fooled by
	 * malicious DIF.)  Given the lack of backwards branches in DIF,
	 * solving this would presumably not amount to solving the Halting
	 * Problem -- but it still seems awfully hard.
	 */
	if (sizeof (dtrace_dynvar_t) + sizeof (dtrace_key_t) * (nkeys - 1) +
	    ksize + dsize > chunksize) {
		dcpu->dtdsc_drops++;
		return (NULL);
	}

	/*
	 * nstate accumulates the state that dtds_state will be moved to (from
	 * DTRACE_DSTATE_CLEAN) should every CPU turn out to have neither a
	 * free nor a clean chunk.
	 */
	nstate = DTRACE_DSTATE_EMPTY;

	do {
retry:
		free = dcpu->dtdsc_free;

		if (free == NULL) {
			dtrace_dynvar_t *clean = dcpu->dtdsc_clean;
			void *rval;

			if (clean == NULL) {
				/*
				 * We're out of dynamic variable space on
				 * this CPU.  Unless we have tried all CPUs,
				 * we'll try to allocate from a different
				 * CPU.
				 */
				switch (dstate->dtds_state) {
				case DTRACE_DSTATE_CLEAN: {
					void *sp = &dstate->dtds_state;

					/*
					 * Advance to the next CPU, wrapping
					 * at NCPU.
					 */
					if (++cpu >= (int)NCPU)
						cpu = 0;

					/*
					 * Record what the CPU we are leaving
					 * still holds; rinsing takes
					 * precedence over dirty.
					 */
					if (dcpu->dtdsc_dirty != NULL &&
					    nstate == DTRACE_DSTATE_EMPTY)
						nstate = DTRACE_DSTATE_DIRTY;

					if (dcpu->dtdsc_rinsing != NULL)
						nstate = DTRACE_DSTATE_RINSING;

					dcpu = zpercpu_get_cpu(dstate->dtds_percpu, cpu);

					if (cpu != me)
						goto retry;

					/*
					 * We have come full circle without
					 * finding a chunk:  publish the
					 * accumulated state.
					 */
					(void) dtrace_cas32(sp,
					    DTRACE_DSTATE_CLEAN, nstate);

					/*
					 * To increment the correct bean
					 * counter, take another lap.
					 */
					goto retry;
				}

				case DTRACE_DSTATE_DIRTY:
					dcpu->dtdsc_dirty_drops++;
					break;

				case DTRACE_DSTATE_RINSING:
					dcpu->dtdsc_rinsing_drops++;
					break;

				case DTRACE_DSTATE_EMPTY:
					dcpu->dtdsc_drops++;
					break;
				}

				DTRACE_CPUFLAG_SET(CPU_DTRACE_DROP);
				return (NULL);
			}

			/*
			 * The clean list appears to be non-empty.  We want to
			 * move the clean list to the free list; we start by
			 * moving the clean pointer aside.
			 */
			if (dtrace_casptr(&dcpu->dtdsc_clean,
			    clean, NULL) != clean) {
				/*
				 * We are in one of two situations:
				 *
				 *  (a)	The clean list was switched to the
				 *	free list by another CPU.
				 *
				 *  (b)	The clean list was added to by the
				 *	cleansing cyclic.
				 *
				 * In either of these situations, we can
				 * just reattempt the free list allocation.
				 */
				goto retry;
			}

			ASSERT(clean->dtdv_hashval == DTRACE_DYNHASH_FREE);

			/*
			 * Now we'll move the clean list to the free list.
			 * It's impossible for this to fail:  the only way
			 * the free list can be updated is through this
			 * code path, and only one CPU can own the clean list.
			 * Thus, it would only be possible for this to fail if
			 * this code were racing with dtrace_dynvar_clean().
			 * (That is, if dtrace_dynvar_clean() updated the clean
			 * list, and we ended up racing to update the free
			 * list.)  This race is prevented by the dtrace_sync()
			 * in dtrace_dynvar_clean() -- which flushes the
			 * owners of the clean lists out before resetting
			 * the clean lists.
			 */
			rval = dtrace_casptr(&dcpu->dtdsc_free, NULL, clean);
			ASSERT(rval == NULL);
			goto retry;
		}

		/*
		 * Pop the head of the free list; the loop condition retries
		 * if the head changed underneath us.
		 */
		dvar = free;
		new_free = dvar->dtdv_next;
	} while (dtrace_casptr(&dcpu->dtdsc_free, free, new_free) != free);

	/*
	 * We have now allocated a new chunk.  We copy the tuple keys into the
	 * tuple array and copy any referenced key data into the data space
	 * following the tuple array.  As we do this, we relocate dttk_value
	 * in the final tuple to point to the key data address in the chunk.
	 */
	kdata = (uintptr_t)&dvar->dtdv_tuple.dtt_key[nkeys];
	dvar->dtdv_data = (void *)(kdata + ksize);
	dvar->dtdv_tuple.dtt_nkeys = nkeys;

	for (i = 0; i < nkeys; i++) {
		dtrace_key_t *dkey = &dvar->dtdv_tuple.dtt_key[i];
		size_t kesize = key[i].dttk_size;

		if (kesize != 0) {
			dtrace_bcopy(
			    (const void *)(uintptr_t)key[i].dttk_value,
			    (void *)kdata, kesize);
			dkey->dttk_value = kdata;
			kdata += P2ROUNDUP(kesize, sizeof (uint64_t));
		} else {
			dkey->dttk_value = key[i].dttk_value;
		}

		dkey->dttk_size = kesize;
	}

	/*
	 * Fill in the hash value and link the new variable in front of the
	 * chain head we observed, then try to publish it as the new head.
	 */
	ASSERT(dvar->dtdv_hashval == DTRACE_DYNHASH_FREE);
	dvar->dtdv_hashval = hashval;
	dvar->dtdv_next = start;

	if (dtrace_casptr(&hash[bucket].dtdh_chain, start, dvar) == start)
		return (dvar);

	/*
	 * The cas has failed.  Either another CPU is adding an element to
	 * this hash chain, or another CPU is deleting an element from this
	 * hash chain.  The simplest way to deal with both of these cases
	 * (though not necessarily the most efficient) is to free our
	 * allocated block and tail-call ourselves.  Note that the free is
	 * to the dirty list and _not_ to the free list.  This is to prevent
	 * races with allocators, above.
	 */
	dvar->dtdv_hashval = DTRACE_DYNHASH_FREE;

	dtrace_membar_producer();

	do {
		free = dcpu->dtdsc_dirty;
		dvar->dtdv_next = free;
	} while (dtrace_casptr(&dcpu->dtdsc_dirty, free, dvar) != free);

	return (dtrace_dynvar(dstate, nkeys, key, dsize, op, mstate, vstate));
}
2330
/*
 * Aggregating function for min():  replace the stored value with the new
 * value when the new value is smaller.  Both are compared as signed 64-bit
 * quantities.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_min(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	const int64_t current = (int64_t)*oval;
	const int64_t candidate = (int64_t)nval;

	if (current > candidate) {
		*oval = nval;
	}
}
2339
/*
 * Aggregating function for max():  replace the stored value with the new
 * value when the new value is larger.  Both are compared as signed 64-bit
 * quantities.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_max(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	const int64_t current = (int64_t)*oval;
	const int64_t candidate = (int64_t)nval;

	if (current < candidate) {
		*oval = nval;
	}
}
2348
2349 static void
dtrace_aggregate_quantize(uint64_t * quanta,uint64_t nval,uint64_t incr)2350 dtrace_aggregate_quantize(uint64_t *quanta, uint64_t nval, uint64_t incr)
2351 {
2352 int i, zero = DTRACE_QUANTIZE_ZEROBUCKET;
2353 int64_t val = (int64_t)nval;
2354
2355 if (val < 0) {
2356 for (i = 0; i < zero; i++) {
2357 if (val <= DTRACE_QUANTIZE_BUCKETVAL(i)) {
2358 quanta[i] += incr;
2359 return;
2360 }
2361 }
2362 } else {
2363 for (i = zero + 1; i < DTRACE_QUANTIZE_NBUCKETS; i++) {
2364 if (val < DTRACE_QUANTIZE_BUCKETVAL(i)) {
2365 quanta[i - 1] += incr;
2366 return;
2367 }
2368 }
2369
2370 quanta[DTRACE_QUANTIZE_NBUCKETS - 1] += incr;
2371 return;
2372 }
2373
2374 ASSERT(0);
2375 }
2376
/*
 * Aggregating function for lquantize():  add incr to the linear bucket that
 * nval falls into.
 *
 * The first word of lquanta[] encodes the base, step and number of levels
 * (decoded with the DTRACE_LQUANTIZE_* macros).  The words that follow are
 * the buckets:  index 0 counts underflows (values below base), indices
 * 1 .. levels count the linear levels, and index levels + 1 counts
 * overflows.  nval is truncated to a signed 32-bit value before it is
 * classified.
 */
static void
dtrace_aggregate_lquantize(uint64_t *lquanta, uint64_t nval, uint64_t incr)
{
	uint64_t arg = *lquanta++;
	int32_t base = DTRACE_LQUANTIZE_BASE(arg);
	uint16_t step = DTRACE_LQUANTIZE_STEP(arg);
	uint16_t levels = DTRACE_LQUANTIZE_LEVELS(arg);
	int32_t val = (int32_t)nval;
	int64_t level;

	ASSERT(step != 0);
	ASSERT(levels != 0);

	if (val < base) {
		/*
		 * This is an underflow.
		 */
		lquanta[0] += incr;
		return;
	}

	/*
	 * The distance from base is computed in 64 bits:  with a negative
	 * base and a large val, (val - base) does not fit in an int32_t, and
	 * a wrapped (negative) level would pass the range check below and
	 * index outside of the bucket array.
	 */
	level = ((int64_t)val - (int64_t)base) / step;

	if (level < levels) {
		lquanta[level + 1] += incr;
		return;
	}

	/*
	 * This is an overflow.
	 */
	lquanta[levels + 1] += incr;
}
2409
/*
 * Compute the index of the log/linear bucket that value falls into.
 *
 *   factor	the base of the logarithmic progression
 *   low	lowest order of magnitude covered
 *   high	highest order of magnitude covered
 *   nsteps	number of linear steps per order of magnitude
 *   value	the value to classify
 *
 * Bucket 0 holds every value below factor^low.  Each order of magnitude
 * from low through high is then divided into linear steps, and any value
 * at or beyond factor^(high + 1) is assigned to the final bucket.
 *
 * NOTE(review): factor is assumed to be at least 2 and nsteps non-zero
 * (presumably validated when the aggregation is created -- confirm against
 * the caller); neither is checked here.
 */
static int
dtrace_aggregate_llquantize_bucket(int16_t factor, int16_t low, int16_t high,
    int16_t nsteps, int64_t value)
{
	int64_t this = 1, last, next;
	int base = 1, order;

	for (order = 0; order < low; ++order)
		this *= factor;

	/*
	 * If our value is less than our factor taken to the power of the
	 * low order of magnitude, it goes into the zeroth bucket.
	 */
	if (value < this)
		return (0);

	last = this;

	for (this *= factor; order <= high; ++order) {
		int nbuckets = this > nsteps ? nsteps : this;

		/*
		 * We should not generally get log/linear quantizations
		 * with a high magnitude that allows 64-bits to
		 * overflow, but we nonetheless protect against this.
		 * Signed overflow is undefined, so rather than multiplying
		 * and inspecting the (possibly wrapped) result, we check
		 * whether the multiplication would overflow before
		 * performing it, and clamp our value into the current
		 * order of magnitude if it would.
		 */
		if (factor > 1 && this > INT64_MAX / factor) {
			next = this;
			value = this - 1;
		} else {
			next = this * factor;

			if (next < this)
				value = this - 1;
		}

		/*
		 * If our value lies within this order of magnitude,
		 * determine its position by taking the offset within
		 * the order of magnitude, dividing by the bucket
		 * width, and adding to our (accumulated) base.
		 */
		if (value < this)
			return (base + (value - last) / (this / nbuckets));

		base += nbuckets - (nbuckets / factor);
		last = this;
		this = next;
	}

	/*
	 * Our value is greater than or equal to our factor taken to the
	 * power of one plus the high magnitude -- return the top bucket.
	 */
	return (base);
}
2465
/*
 * Aggregating function for llquantize():  the first word of llquanta[]
 * encodes the factor, low and high magnitudes and the number of steps
 * (decoded with the DTRACE_LLQUANTIZE_* macros); the words that follow are
 * the buckets.  The bucket for nval is chosen by
 * dtrace_aggregate_llquantize_bucket() and incremented by incr.
 */
static void
dtrace_aggregate_llquantize(uint64_t *llquanta, uint64_t nval, uint64_t incr)
{
	const uint64_t arg = llquanta[0];
	uint64_t *buckets = llquanta + 1;
	uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(arg);
	uint16_t low = DTRACE_LLQUANTIZE_LOW(arg);
	uint16_t high = DTRACE_LLQUANTIZE_HIGH(arg);
	uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(arg);
	int bucket;

	bucket = dtrace_aggregate_llquantize_bucket(factor, low, high,
	    nsteps, nval);

	buckets[bucket] += incr;
}
2477
/*
 * Aggregating function for avg():  data[0] counts the values seen and
 * data[1] accumulates their sum.  The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_avg(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	uint64_t *count = &data[0];
	uint64_t *total = &data[1];

	*count += 1;
	*total += nval;
}
2486
/*
 * Aggregating function for stddev():  data[0] counts the values seen,
 * data[1] accumulates their sum, and the two words starting at data[2]
 * accumulate the sum of their squares as a 128-bit quantity.  The arg
 * parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_stddev(uint64_t *data, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	uint64_t magnitude = nval;
	uint64_t tmp[2];

	data[0]++;
	data[1] += nval;

	/*
	 * What we want to say here is:
	 *
	 *	data[2] += nval * nval;
	 *
	 * But given that nval is 64-bit, we could easily overflow, so
	 * we do this as 128-bit arithmetic.
	 *
	 * We square the magnitude of nval taken as a signed value.  The
	 * negation is done on the unsigned representation:  negating the
	 * most negative int64_t as a signed quantity would overflow, whereas
	 * the unsigned negation is well-defined and yields the correct
	 * magnitude (2^63) in that case as well.
	 */
	if ((int64_t)nval < 0)
		magnitude = 0 - nval;

	dtrace_multiply_128(magnitude, magnitude, tmp);
	dtrace_add_128(data + 2, tmp, data + 2);
}
2512
/*
 * Aggregating function for count():  bump the stored counter by one.
 * Neither nval nor arg is used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_count(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(nval, arg) /* __APPLE__ */
	(*oval)++;
}
2520
/*
 * Aggregating function for sum():  add the new value to the stored total.
 * The arg parameter is not used.
 */
/*ARGSUSED*/
static void
dtrace_aggregate_sum(uint64_t *oval, uint64_t nval, uint64_t arg)
{
#pragma unused(arg) /* __APPLE__ */
	const uint64_t total = *oval + nval;

	*oval = total;
}
2528
2529 /*
2530 * Aggregate given the tuple in the principal data buffer, and the aggregating
2531 * action denoted by the specified dtrace_aggregation_t. The aggregation
2532 * buffer is specified as the buf parameter. This routine does not return
2533 * failure; if there is no space in the aggregation buffer, the data will be
2534 * dropped, and a corresponding counter incremented.
2535 */
2536 __attribute__((noinline))
2537 static void
dtrace_aggregate(dtrace_aggregation_t * agg,dtrace_buffer_t * dbuf,intptr_t offset,dtrace_buffer_t * buf,uint64_t expr,uint64_t arg)2538 dtrace_aggregate(dtrace_aggregation_t *agg, dtrace_buffer_t *dbuf,
2539 intptr_t offset, dtrace_buffer_t *buf, uint64_t expr, uint64_t arg)
2540 {
2541 #pragma unused(arg)
2542 dtrace_recdesc_t *rec = &agg->dtag_action.dta_rec;
2543 uint32_t i, ndx, size, fsize;
2544 uint32_t align = sizeof (uint64_t) - 1;
2545 dtrace_aggbuffer_t *agb;
2546 dtrace_aggkey_t *key;
2547 uint32_t hashval = 0, limit, isstr;
2548 caddr_t tomax, data, kdata;
2549 dtrace_actkind_t action;
2550 dtrace_action_t *act;
2551 uintptr_t offs;
2552
2553 if (buf == NULL)
2554 return;
2555
2556 if (!agg->dtag_hasarg) {
2557 /*
2558 * Currently, only quantize() and lquantize() take additional
2559 * arguments, and they have the same semantics: an increment
2560 * value that defaults to 1 when not present. If additional
2561 * aggregating actions take arguments, the setting of the
2562 * default argument value will presumably have to become more
2563 * sophisticated...
2564 */
2565 arg = 1;
2566 }
2567
2568 action = agg->dtag_action.dta_kind - DTRACEACT_AGGREGATION;
2569 size = rec->dtrd_offset - agg->dtag_base;
2570 fsize = size + rec->dtrd_size;
2571
2572 ASSERT(dbuf->dtb_tomax != NULL);
2573 data = dbuf->dtb_tomax + offset + agg->dtag_base;
2574
2575 if ((tomax = buf->dtb_tomax) == NULL) {
2576 dtrace_buffer_drop(buf);
2577 return;
2578 }
2579
2580 /*
2581 * The metastructure is always at the bottom of the buffer.
2582 */
2583 agb = (dtrace_aggbuffer_t *)(tomax + buf->dtb_size -
2584 sizeof (dtrace_aggbuffer_t));
2585
2586 if (buf->dtb_offset == 0) {
2587 /*
2588 * We just kludge up approximately 1/8th of the size to be
2589 * buckets. If this guess ends up being routinely
2590 * off-the-mark, we may need to dynamically readjust this
2591 * based on past performance.
2592 */
2593 uintptr_t hashsize = (buf->dtb_size >> 3) / sizeof (uintptr_t);
2594
2595 if ((uintptr_t)agb - hashsize * sizeof (dtrace_aggkey_t *) <
2596 (uintptr_t)tomax || hashsize == 0) {
2597 /*
2598 * We've been given a ludicrously small buffer;
2599 * increment our drop count and leave.
2600 */
2601 dtrace_buffer_drop(buf);
2602 return;
2603 }
2604
2605 /*
2606 * And now, a pathetic attempt to try to get a an odd (or
2607 * perchance, a prime) hash size for better hash distribution.
2608 */
2609 if (hashsize > (DTRACE_AGGHASHSIZE_SLEW << 3))
2610 hashsize -= DTRACE_AGGHASHSIZE_SLEW;
2611
2612 agb->dtagb_hashsize = hashsize;
2613 agb->dtagb_hash = (dtrace_aggkey_t **)((uintptr_t)agb -
2614 agb->dtagb_hashsize * sizeof (dtrace_aggkey_t *));
2615 agb->dtagb_free = (uintptr_t)agb->dtagb_hash;
2616
2617 for (i = 0; i < agb->dtagb_hashsize; i++)
2618 agb->dtagb_hash[i] = NULL;
2619 }
2620
2621 ASSERT(agg->dtag_first != NULL);
2622 ASSERT(agg->dtag_first->dta_intuple);
2623
2624 /*
2625 * Calculate the hash value based on the key. Note that we _don't_
2626 * include the aggid in the hashing (but we will store it as part of
2627 * the key). The hashing algorithm is Bob Jenkins' "One-at-a-time"
2628 * algorithm: a simple, quick algorithm that has no known funnels, and
2629 * gets good distribution in practice. The efficacy of the hashing
2630 * algorithm (and a comparison with other algorithms) may be found by
2631 * running the ::dtrace_aggstat MDB dcmd.
2632 */
2633 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2634 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2635 limit = i + act->dta_rec.dtrd_size;
2636 ASSERT(limit <= size);
2637 isstr = DTRACEACT_ISSTRING(act);
2638
2639 for (; i < limit; i++) {
2640 hashval += data[i];
2641 hashval += (hashval << 10);
2642 hashval ^= (hashval >> 6);
2643
2644 if (isstr && data[i] == '\0')
2645 break;
2646 }
2647 }
2648
2649 hashval += (hashval << 3);
2650 hashval ^= (hashval >> 11);
2651 hashval += (hashval << 15);
2652
2653 /*
2654 * Yes, the divide here is expensive -- but it's generally the least
2655 * of the performance issues given the amount of data that we iterate
2656 * over to compute hash values, compare data, etc.
2657 */
2658 ndx = hashval % agb->dtagb_hashsize;
2659
2660 for (key = agb->dtagb_hash[ndx]; key != NULL; key = key->dtak_next) {
2661 ASSERT((caddr_t)key >= tomax);
2662 ASSERT((caddr_t)key < tomax + buf->dtb_size);
2663
2664 if (hashval != key->dtak_hashval || key->dtak_size != size)
2665 continue;
2666
2667 kdata = key->dtak_data;
2668 ASSERT(kdata >= tomax && kdata < tomax + buf->dtb_size);
2669
2670 for (act = agg->dtag_first; act->dta_intuple;
2671 act = act->dta_next) {
2672 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2673 limit = i + act->dta_rec.dtrd_size;
2674 ASSERT(limit <= size);
2675 isstr = DTRACEACT_ISSTRING(act);
2676
2677 for (; i < limit; i++) {
2678 if (kdata[i] != data[i])
2679 goto next;
2680
2681 if (isstr && data[i] == '\0')
2682 break;
2683 }
2684 }
2685
2686 if (action != key->dtak_action) {
2687 /*
2688 * We are aggregating on the same value in the same
2689 * aggregation with two different aggregating actions.
2690 * (This should have been picked up in the compiler,
2691 * so we may be dealing with errant or devious DIF.)
2692 * This is an error condition; we indicate as much,
2693 * and return.
2694 */
2695 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
2696 return;
2697 }
2698
2699 /*
2700 * This is a hit: we need to apply the aggregator to
2701 * the value at this key.
2702 */
2703 agg->dtag_aggregate((uint64_t *)(kdata + size), expr, arg);
2704 return;
2705 next:
2706 continue;
2707 }
2708
2709 /*
2710 * We didn't find it. We need to allocate some zero-filled space,
2711 * link it into the hash table appropriately, and apply the aggregator
2712 * to the (zero-filled) value.
2713 */
2714 offs = buf->dtb_offset;
2715 while (offs & (align - 1))
2716 offs += sizeof (uint32_t);
2717
2718 /*
2719 * If we don't have enough room to both allocate a new key _and_
2720 * its associated data, increment the drop count and return.
2721 */
2722 if ((uintptr_t)tomax + offs + fsize >
2723 agb->dtagb_free - sizeof (dtrace_aggkey_t)) {
2724 dtrace_buffer_drop(buf);
2725 return;
2726 }
2727
2728 /*CONSTCOND*/
2729 ASSERT(!(sizeof (dtrace_aggkey_t) & (sizeof (uintptr_t) - 1)));
2730 key = (dtrace_aggkey_t *)(agb->dtagb_free - sizeof (dtrace_aggkey_t));
2731 agb->dtagb_free -= sizeof (dtrace_aggkey_t);
2732
2733 key->dtak_data = kdata = tomax + offs;
2734 buf->dtb_offset = offs + fsize;
2735
2736 /*
2737 * Now copy the data across.
2738 */
2739 *((dtrace_aggid_t *)kdata) = agg->dtag_id;
2740
2741 for (i = sizeof (dtrace_aggid_t); i < size; i++)
2742 kdata[i] = data[i];
2743
2744 /*
2745 * Because strings are not zeroed out by default, we need to iterate
2746 * looking for actions that store strings, and we need to explicitly
2747 * pad these strings out with zeroes.
2748 */
2749 for (act = agg->dtag_first; act->dta_intuple; act = act->dta_next) {
2750 int nul;
2751
2752 if (!DTRACEACT_ISSTRING(act))
2753 continue;
2754
2755 i = act->dta_rec.dtrd_offset - agg->dtag_base;
2756 limit = i + act->dta_rec.dtrd_size;
2757 ASSERT(limit <= size);
2758
2759 for (nul = 0; i < limit; i++) {
2760 if (nul) {
2761 kdata[i] = '\0';
2762 continue;
2763 }
2764
2765 if (data[i] != '\0')
2766 continue;
2767
2768 nul = 1;
2769 }
2770 }
2771
2772 for (i = size; i < fsize; i++)
2773 kdata[i] = 0;
2774
2775 key->dtak_hashval = hashval;
2776 key->dtak_size = size;
2777 key->dtak_action = action;
2778 key->dtak_next = agb->dtagb_hash[ndx];
2779 agb->dtagb_hash[ndx] = key;
2780
2781 /*
2782 * Finally, apply the aggregator.
2783 */
2784 *((uint64_t *)(key->dtak_data + size)) = agg->dtag_initial;
2785 agg->dtag_aggregate((uint64_t *)(key->dtak_data + size), expr, arg);
2786 }
2787
2788 /*
2789 * Given consumer state, this routine finds a speculation in the INACTIVE
2790 * state and transitions it into the ACTIVE state. If there is no speculation
2791 * in the INACTIVE state, 0 is returned. In this case, no error counter is
2792 * incremented -- it is up to the caller to take appropriate action.
2793 */
2794 static int
dtrace_speculation(dtrace_state_t * state)2795 dtrace_speculation(dtrace_state_t *state)
2796 {
2797 int i = 0;
2798 dtrace_speculation_state_t current;
2799 uint32_t *stat = &state->dts_speculations_unavail, count;
2800
2801 while (i < state->dts_nspeculations) {
2802 dtrace_speculation_t *spec = &state->dts_speculations[i];
2803
2804 current = spec->dtsp_state;
2805
2806 if (current != DTRACESPEC_INACTIVE) {
2807 if (current == DTRACESPEC_COMMITTINGMANY ||
2808 current == DTRACESPEC_COMMITTING ||
2809 current == DTRACESPEC_DISCARDING)
2810 stat = &state->dts_speculations_busy;
2811 i++;
2812 continue;
2813 }
2814
2815 if (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2816 current, DTRACESPEC_ACTIVE) == current)
2817 return (i + 1);
2818 }
2819
2820 /*
2821 * We couldn't find a speculation. If we found as much as a single
2822 * busy speculation buffer, we'll attribute this failure as "busy"
2823 * instead of "unavail".
2824 */
2825 do {
2826 count = *stat;
2827 } while (dtrace_cas32(stat, count, count + 1) != count);
2828
2829 return (0);
2830 }
2831
2832 /*
2833 * This routine commits an active speculation. If the specified speculation
2834 * is not in a valid state to perform a commit(), this routine will silently do
2835 * nothing. The state of the specified speculation is transitioned according
2836 * to the state transition diagram outlined in <sys/dtrace_impl.h>
2837 */
2838 static void
dtrace_speculation_commit(dtrace_state_t * state,processorid_t cpu,dtrace_specid_t which)2839 dtrace_speculation_commit(dtrace_state_t *state, processorid_t cpu,
2840 dtrace_specid_t which)
2841 {
2842 dtrace_speculation_t *spec;
2843 dtrace_buffer_t *src, *dest;
2844 uintptr_t daddr, saddr, dlimit, slimit;
2845 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
2846 intptr_t offs;
2847 uint64_t timestamp;
2848
2849 if (which == 0)
2850 return;
2851
2852 if (which > (dtrace_specid_t)state->dts_nspeculations) {
2853 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
2854 return;
2855 }
2856
2857 spec = &state->dts_speculations[which - 1];
2858 src = &spec->dtsp_buffer[cpu];
2859 dest = &state->dts_buffer[cpu];
2860
2861 do {
2862 current = spec->dtsp_state;
2863
2864 if (current == DTRACESPEC_COMMITTINGMANY)
2865 break;
2866
2867 switch (current) {
2868 case DTRACESPEC_INACTIVE:
2869 case DTRACESPEC_DISCARDING:
2870 return;
2871
2872 case DTRACESPEC_COMMITTING:
2873 /*
2874 * This is only possible if we are (a) commit()'ing
2875 * without having done a prior speculate() on this CPU
2876 * and (b) racing with another commit() on a different
2877 * CPU. There's nothing to do -- we just assert that
2878 * our offset is 0.
2879 */
2880 ASSERT(src->dtb_offset == 0);
2881 return;
2882
2883 case DTRACESPEC_ACTIVE:
2884 new = DTRACESPEC_COMMITTING;
2885 break;
2886
2887 case DTRACESPEC_ACTIVEONE:
2888 /*
2889 * This speculation is active on one CPU. If our
2890 * buffer offset is non-zero, we know that the one CPU
2891 * must be us. Otherwise, we are committing on a
2892 * different CPU from the speculate(), and we must
2893 * rely on being asynchronously cleaned.
2894 */
2895 if (src->dtb_offset != 0) {
2896 new = DTRACESPEC_COMMITTING;
2897 break;
2898 }
2899 OS_FALLTHROUGH;
2900
2901 case DTRACESPEC_ACTIVEMANY:
2902 new = DTRACESPEC_COMMITTINGMANY;
2903 break;
2904
2905 default:
2906 ASSERT(0);
2907 }
2908 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
2909 current, new) != current);
2910
2911 /*
2912 * We have set the state to indicate that we are committing this
2913 * speculation. Now reserve the necessary space in the destination
2914 * buffer.
2915 */
2916 if ((offs = dtrace_buffer_reserve(dest, src->dtb_offset,
2917 sizeof (uint64_t), state, NULL)) < 0) {
2918 dtrace_buffer_drop(dest);
2919 goto out;
2920 }
2921
2922 /*
2923 * We have sufficient space to copy the speculative buffer into the
2924 * primary buffer. First, modify the speculative buffer, filling
2925 * in the timestamp of all entries with the current time. The data
2926 * must have the commit() time rather than the time it was traced,
2927 * so that all entries in the primary buffer are in timestamp order.
2928 */
2929 timestamp = dtrace_gethrtime();
2930 saddr = (uintptr_t)src->dtb_tomax;
2931 slimit = saddr + src->dtb_offset;
2932 while (saddr < slimit) {
2933 size_t size;
2934 dtrace_rechdr_t *dtrh = (dtrace_rechdr_t *)saddr;
2935
2936 if (dtrh->dtrh_epid == DTRACE_EPIDNONE) {
2937 saddr += sizeof (dtrace_epid_t);
2938 continue;
2939 }
2940
2941 ASSERT(dtrh->dtrh_epid <= ((dtrace_epid_t) state->dts_necbs));
2942 size = state->dts_ecbs[dtrh->dtrh_epid - 1]->dte_size;
2943
2944 ASSERT(saddr + size <= slimit);
2945 ASSERT(size >= sizeof(dtrace_rechdr_t));
2946 ASSERT(DTRACE_RECORD_LOAD_TIMESTAMP(dtrh) == UINT64_MAX);
2947
2948 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, timestamp);
2949
2950 saddr += size;
2951 }
2952
	/*
	 * Copy the buffer across.  (Note that this is a
	 * highly suboptimal bcopy(); in the unlikely event that this becomes
	 * a serious performance issue, a high-performance DTrace-specific
	 * bcopy() should obviously be invented.)
	 */
2959 daddr = (uintptr_t)dest->dtb_tomax + offs;
2960 dlimit = daddr + src->dtb_offset;
2961 saddr = (uintptr_t)src->dtb_tomax;
2962
2963 /*
2964 * First, the aligned portion.
2965 */
2966 while (dlimit - daddr >= sizeof (uint64_t)) {
2967 *((uint64_t *)daddr) = *((uint64_t *)saddr);
2968
2969 daddr += sizeof (uint64_t);
2970 saddr += sizeof (uint64_t);
2971 }
2972
2973 /*
2974 * Now any left-over bit...
2975 */
2976 while (dlimit - daddr)
2977 *((uint8_t *)daddr++) = *((uint8_t *)saddr++);
2978
2979 /*
2980 * Finally, commit the reserved space in the destination buffer.
2981 */
2982 dest->dtb_offset = offs + src->dtb_offset;
2983
2984 out:
2985 /*
2986 * If we're lucky enough to be the only active CPU on this speculation
2987 * buffer, we can just set the state back to DTRACESPEC_INACTIVE.
2988 */
2989 if (current == DTRACESPEC_ACTIVE ||
2990 (current == DTRACESPEC_ACTIVEONE && new == DTRACESPEC_COMMITTING)) {
2991 uint32_t rval = dtrace_cas32((uint32_t *)&spec->dtsp_state,
2992 DTRACESPEC_COMMITTING, DTRACESPEC_INACTIVE);
2993 #pragma unused(rval) /* __APPLE__ */
2994
2995 ASSERT(rval == DTRACESPEC_COMMITTING);
2996 }
2997
2998 src->dtb_offset = 0;
2999 src->dtb_xamot_drops += src->dtb_drops;
3000 src->dtb_drops = 0;
3001 }
3002
3003 /*
3004 * This routine discards an active speculation. If the specified speculation
3005 * is not in a valid state to perform a discard(), this routine will silently
3006 * do nothing. The state of the specified speculation is transitioned
3007 * according to the state transition diagram outlined in <sys/dtrace_impl.h>
3008 */
3009 __attribute__((noinline))
3010 static void
dtrace_speculation_discard(dtrace_state_t * state,processorid_t cpu,dtrace_specid_t which)3011 dtrace_speculation_discard(dtrace_state_t *state, processorid_t cpu,
3012 dtrace_specid_t which)
3013 {
3014 dtrace_speculation_t *spec;
3015 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3016 dtrace_buffer_t *buf;
3017
3018 if (which == 0)
3019 return;
3020
3021 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3022 cpu_core[cpu].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3023 return;
3024 }
3025
3026 spec = &state->dts_speculations[which - 1];
3027 buf = &spec->dtsp_buffer[cpu];
3028
3029 do {
3030 current = spec->dtsp_state;
3031
3032 switch (current) {
3033 case DTRACESPEC_INACTIVE:
3034 case DTRACESPEC_COMMITTINGMANY:
3035 case DTRACESPEC_COMMITTING:
3036 case DTRACESPEC_DISCARDING:
3037 return;
3038
3039 case DTRACESPEC_ACTIVE:
3040 case DTRACESPEC_ACTIVEMANY:
3041 new = DTRACESPEC_DISCARDING;
3042 break;
3043
3044 case DTRACESPEC_ACTIVEONE:
3045 if (buf->dtb_offset != 0) {
3046 new = DTRACESPEC_INACTIVE;
3047 } else {
3048 new = DTRACESPEC_DISCARDING;
3049 }
3050 break;
3051
3052 default:
3053 ASSERT(0);
3054 }
3055 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3056 current, new) != current);
3057
3058 buf->dtb_offset = 0;
3059 buf->dtb_drops = 0;
3060 }
3061
3062 /*
3063 * Note: not called from probe context. This function is called
3064 * asynchronously from cross call context to clean any speculations that are
3065 * in the COMMITTINGMANY or DISCARDING states. These speculations may not be
3066 * transitioned back to the INACTIVE state until all CPUs have cleaned the
3067 * speculation.
3068 */
3069 static void
dtrace_speculation_clean_here(dtrace_state_t * state)3070 dtrace_speculation_clean_here(dtrace_state_t *state)
3071 {
3072 dtrace_icookie_t cookie;
3073 processorid_t cpu = CPU->cpu_id;
3074 dtrace_buffer_t *dest = &state->dts_buffer[cpu];
3075 dtrace_specid_t i;
3076
3077 cookie = dtrace_interrupt_disable();
3078
3079 if (dest->dtb_tomax == NULL) {
3080 dtrace_interrupt_enable(cookie);
3081 return;
3082 }
3083
3084 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3085 dtrace_speculation_t *spec = &state->dts_speculations[i];
3086 dtrace_buffer_t *src = &spec->dtsp_buffer[cpu];
3087
3088 if (src->dtb_tomax == NULL)
3089 continue;
3090
3091 if (spec->dtsp_state == DTRACESPEC_DISCARDING) {
3092 src->dtb_offset = 0;
3093 continue;
3094 }
3095
3096 if (spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3097 continue;
3098
3099 if (src->dtb_offset == 0)
3100 continue;
3101
3102 dtrace_speculation_commit(state, cpu, i + 1);
3103 }
3104
3105 dtrace_interrupt_enable(cookie);
3106 }
3107
3108 /*
3109 * Note: not called from probe context. This function is called
3110 * asynchronously (and at a regular interval) to clean any speculations that
3111 * are in the COMMITTINGMANY or DISCARDING states. If it discovers that there
3112 * is work to be done, it cross calls all CPUs to perform that work;
3113 * COMMITMANY and DISCARDING speculations may not be transitioned back to the
3114 * INACTIVE state until they have been cleaned by all CPUs.
3115 */
3116 static void
dtrace_speculation_clean(dtrace_state_t * state)3117 dtrace_speculation_clean(dtrace_state_t *state)
3118 {
3119 int work = 0;
3120 uint32_t rv;
3121 dtrace_specid_t i;
3122
3123 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3124 dtrace_speculation_t *spec = &state->dts_speculations[i];
3125
3126 ASSERT(!spec->dtsp_cleaning);
3127
3128 if (spec->dtsp_state != DTRACESPEC_DISCARDING &&
3129 spec->dtsp_state != DTRACESPEC_COMMITTINGMANY)
3130 continue;
3131
3132 work++;
3133 spec->dtsp_cleaning = 1;
3134 }
3135
3136 if (!work)
3137 return;
3138
3139 dtrace_xcall(DTRACE_CPUALL,
3140 (dtrace_xcall_t)dtrace_speculation_clean_here, state);
3141
3142 /*
3143 * We now know that all CPUs have committed or discarded their
3144 * speculation buffers, as appropriate. We can now set the state
3145 * to inactive.
3146 */
3147 for (i = 0; i < (dtrace_specid_t)state->dts_nspeculations; i++) {
3148 dtrace_speculation_t *spec = &state->dts_speculations[i];
3149 dtrace_speculation_state_t current, new;
3150
3151 if (!spec->dtsp_cleaning)
3152 continue;
3153
3154 current = spec->dtsp_state;
3155 ASSERT(current == DTRACESPEC_DISCARDING ||
3156 current == DTRACESPEC_COMMITTINGMANY);
3157
3158 new = DTRACESPEC_INACTIVE;
3159
3160 rv = dtrace_cas32((uint32_t *)&spec->dtsp_state, current, new);
3161 ASSERT(rv == current);
3162 spec->dtsp_cleaning = 0;
3163 }
3164 }
3165
3166 /*
3167 * Called as part of a speculate() to get the speculative buffer associated
3168 * with a given speculation. Returns NULL if the specified speculation is not
3169 * in an ACTIVE state. If the speculation is in the ACTIVEONE state -- and
3170 * the active CPU is not the specified CPU -- the speculation will be
3171 * atomically transitioned into the ACTIVEMANY state.
3172 */
3173 __attribute__((noinline))
3174 static dtrace_buffer_t *
dtrace_speculation_buffer(dtrace_state_t * state,processorid_t cpuid,dtrace_specid_t which)3175 dtrace_speculation_buffer(dtrace_state_t *state, processorid_t cpuid,
3176 dtrace_specid_t which)
3177 {
3178 dtrace_speculation_t *spec;
3179 dtrace_speculation_state_t current, new = DTRACESPEC_INACTIVE;
3180 dtrace_buffer_t *buf;
3181
3182 if (which == 0)
3183 return (NULL);
3184
3185 if (which > (dtrace_specid_t)state->dts_nspeculations) {
3186 cpu_core[cpuid].cpuc_dtrace_flags |= CPU_DTRACE_ILLOP;
3187 return (NULL);
3188 }
3189
3190 spec = &state->dts_speculations[which - 1];
3191 buf = &spec->dtsp_buffer[cpuid];
3192
3193 do {
3194 current = spec->dtsp_state;
3195
3196 switch (current) {
3197 case DTRACESPEC_INACTIVE:
3198 case DTRACESPEC_COMMITTINGMANY:
3199 case DTRACESPEC_DISCARDING:
3200 return (NULL);
3201
3202 case DTRACESPEC_COMMITTING:
3203 ASSERT(buf->dtb_offset == 0);
3204 return (NULL);
3205
3206 case DTRACESPEC_ACTIVEONE:
3207 /*
3208 * This speculation is currently active on one CPU.
3209 * Check the offset in the buffer; if it's non-zero,
3210 * that CPU must be us (and we leave the state alone).
3211 * If it's zero, assume that we're starting on a new
3212 * CPU -- and change the state to indicate that the
3213 * speculation is active on more than one CPU.
3214 */
3215 if (buf->dtb_offset != 0)
3216 return (buf);
3217
3218 new = DTRACESPEC_ACTIVEMANY;
3219 break;
3220
3221 case DTRACESPEC_ACTIVEMANY:
3222 return (buf);
3223
3224 case DTRACESPEC_ACTIVE:
3225 new = DTRACESPEC_ACTIVEONE;
3226 break;
3227
3228 default:
3229 ASSERT(0);
3230 }
3231 } while (dtrace_cas32((uint32_t *)&spec->dtsp_state,
3232 current, new) != current);
3233
3234 ASSERT(new == DTRACESPEC_ACTIVEONE || new == DTRACESPEC_ACTIVEMANY);
3235 return (buf);
3236 }
3237
3238 /*
3239 * Return a string. In the event that the user lacks the privilege to access
3240 * arbitrary kernel memory, we copy the string out to scratch memory so that we
3241 * don't fail access checking.
3242 *
3243 * dtrace_dif_variable() uses this routine as a helper for various
3244 * builtin values such as 'execname' and 'probefunc.'
3245 */
3246 static
3247 uintptr_t
dtrace_dif_varstr(uintptr_t addr,dtrace_state_t * state,dtrace_mstate_t * mstate)3248 dtrace_dif_varstr(uintptr_t addr, dtrace_state_t *state,
3249 dtrace_mstate_t *mstate)
3250 {
3251 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
3252 uintptr_t ret;
3253 size_t strsz;
3254
3255 /*
3256 * The easy case: this probe is allowed to read all of memory, so
3257 * we can just return this as a vanilla pointer.
3258 */
3259 if ((mstate->dtms_access & DTRACE_ACCESS_KERNEL) != 0)
3260 return (addr);
3261
3262 /*
3263 * This is the tougher case: we copy the string in question from
3264 * kernel memory into scratch memory and return it that way: this
3265 * ensures that we won't trip up when access checking tests the
3266 * BYREF return value.
3267 */
3268 strsz = dtrace_strlen((char *)addr, size) + 1;
3269
3270 if (mstate->dtms_scratch_ptr + strsz >
3271 mstate->dtms_scratch_base + mstate->dtms_scratch_size) {
3272 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3273 return (0);
3274 }
3275
3276 dtrace_strcpy((const void *)addr, (void *)mstate->dtms_scratch_ptr,
3277 strsz);
3278 ret = mstate->dtms_scratch_ptr;
3279 mstate->dtms_scratch_ptr += strsz;
3280 return (ret);
3281 }
3282
3283 /*
3284 * This function implements the DIF emulator's variable lookups. The emulator
3285 * passes a reserved variable identifier and optional built-in array index.
3286 */
3287 static uint64_t
dtrace_dif_variable(dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t v,uint64_t ndx)3288 dtrace_dif_variable(dtrace_mstate_t *mstate, dtrace_state_t *state, uint64_t v,
3289 uint64_t ndx)
3290 {
3291 /*
3292 * If we're accessing one of the uncached arguments, we'll turn this
3293 * into a reference in the args array.
3294 */
3295 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9) {
3296 ndx = v - DIF_VAR_ARG0;
3297 v = DIF_VAR_ARGS;
3298 }
3299
3300 switch (v) {
3301 case DIF_VAR_ARGS:
3302 ASSERT(mstate->dtms_present & DTRACE_MSTATE_ARGS);
3303 if (ndx >= sizeof (mstate->dtms_arg) /
3304 sizeof (mstate->dtms_arg[0])) {
3305 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3306 dtrace_vstate_t *vstate = &state->dts_vstate;
3307 dtrace_provider_t *pv;
3308 uint64_t val;
3309
3310 pv = mstate->dtms_probe->dtpr_provider;
3311 if (pv->dtpv_pops.dtps_getargval != NULL)
3312 val = pv->dtpv_pops.dtps_getargval(pv->dtpv_arg,
3313 mstate->dtms_probe->dtpr_id,
3314 mstate->dtms_probe->dtpr_arg, ndx, aframes);
3315 /* Special case access of arg5 as passed to dtrace_probe_error() (which see.) */
3316 else if (mstate->dtms_probe->dtpr_id == dtrace_probeid_error && ndx == 5) {
3317 return ((dtrace_state_t *)(uintptr_t)(mstate->dtms_arg[0]))->dts_arg_error_illval;
3318 }
3319
3320 else
3321 val = dtrace_getarg(ndx, aframes, mstate, vstate);
3322
3323 /*
3324 * This is regrettably required to keep the compiler
3325 * from tail-optimizing the call to dtrace_getarg().
3326 * The condition always evaluates to true, but the
3327 * compiler has no way of figuring that out a priori.
3328 * (None of this would be necessary if the compiler
3329 * could be relied upon to _always_ tail-optimize
3330 * the call to dtrace_getarg() -- but it can't.)
3331 */
3332 if (mstate->dtms_probe != NULL)
3333 return (val);
3334
3335 ASSERT(0);
3336 }
3337
3338 return (mstate->dtms_arg[ndx]);
3339
3340 case DIF_VAR_UREGS: {
3341 thread_t thread;
3342
3343 if (!dtrace_priv_proc(state))
3344 return (0);
3345
3346 if ((thread = current_thread()) == NULL) {
3347 DTRACE_CPUFLAG_SET(CPU_DTRACE_BADADDR);
3348 cpu_core[CPU->cpu_id].cpuc_dtrace_illval = 0;
3349 return (0);
3350 }
3351
3352 return (dtrace_getreg(find_user_regs(thread), ndx));
3353 }
3354
3355 case DIF_VAR_VMREGS: {
3356 uint64_t rval;
3357
3358 if (!dtrace_priv_kernel(state))
3359 return (0);
3360
3361 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3362
3363 rval = dtrace_getvmreg(ndx);
3364
3365 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3366
3367 return (rval);
3368 }
3369
3370 case DIF_VAR_CURTHREAD:
3371 if (!dtrace_priv_kernel(state))
3372 return (0);
3373
3374 return ((uint64_t)(uintptr_t)current_thread());
3375
3376 case DIF_VAR_TIMESTAMP:
3377 if (!(mstate->dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
3378 mstate->dtms_timestamp = dtrace_gethrtime();
3379 mstate->dtms_present |= DTRACE_MSTATE_TIMESTAMP;
3380 }
3381 return (mstate->dtms_timestamp);
3382
3383 case DIF_VAR_VTIMESTAMP:
3384 ASSERT(dtrace_vtime_references != 0);
3385 return (dtrace_get_thread_vtime(current_thread()));
3386
3387 case DIF_VAR_WALLTIMESTAMP:
3388 if (!(mstate->dtms_present & DTRACE_MSTATE_WALLTIMESTAMP)) {
3389 mstate->dtms_walltimestamp = dtrace_gethrestime();
3390 mstate->dtms_present |= DTRACE_MSTATE_WALLTIMESTAMP;
3391 }
3392 return (mstate->dtms_walltimestamp);
3393
3394 case DIF_VAR_MACHTIMESTAMP:
3395 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHTIMESTAMP)) {
3396 mstate->dtms_machtimestamp = mach_absolute_time();
3397 mstate->dtms_present |= DTRACE_MSTATE_MACHTIMESTAMP;
3398 }
3399 return (mstate->dtms_machtimestamp);
3400
3401 case DIF_VAR_MACHCTIMESTAMP:
3402 if (!(mstate->dtms_present & DTRACE_MSTATE_MACHCTIMESTAMP)) {
3403 mstate->dtms_machctimestamp = mach_continuous_time();
3404 mstate->dtms_present |= DTRACE_MSTATE_MACHCTIMESTAMP;
3405 }
3406 return (mstate->dtms_machctimestamp);
3407
3408
3409 case DIF_VAR_CPU:
3410 return ((uint64_t) dtrace_get_thread_last_cpu_id(current_thread()));
3411
3412 case DIF_VAR_IPL:
3413 if (!dtrace_priv_kernel(state))
3414 return (0);
3415 if (!(mstate->dtms_present & DTRACE_MSTATE_IPL)) {
3416 mstate->dtms_ipl = dtrace_getipl();
3417 mstate->dtms_present |= DTRACE_MSTATE_IPL;
3418 }
3419 return (mstate->dtms_ipl);
3420
3421 case DIF_VAR_EPID:
3422 ASSERT(mstate->dtms_present & DTRACE_MSTATE_EPID);
3423 return (mstate->dtms_epid);
3424
3425 case DIF_VAR_ID:
3426 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3427 return (mstate->dtms_probe->dtpr_id);
3428
3429 case DIF_VAR_STACKDEPTH:
3430 if (!dtrace_priv_kernel(state))
3431 return (0);
3432 if (!(mstate->dtms_present & DTRACE_MSTATE_STACKDEPTH)) {
3433 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3434
3435 mstate->dtms_stackdepth = dtrace_getstackdepth(aframes);
3436 mstate->dtms_present |= DTRACE_MSTATE_STACKDEPTH;
3437 }
3438 return (mstate->dtms_stackdepth);
3439
3440 case DIF_VAR_USTACKDEPTH:
3441 if (!dtrace_priv_proc(state))
3442 return (0);
3443 if (!(mstate->dtms_present & DTRACE_MSTATE_USTACKDEPTH)) {
3444 /*
3445 * See comment in DIF_VAR_PID.
3446 */
3447 if (DTRACE_ANCHORED(mstate->dtms_probe) &&
3448 CPU_ON_INTR(CPU)) {
3449 mstate->dtms_ustackdepth = 0;
3450 } else {
3451 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3452 mstate->dtms_ustackdepth =
3453 dtrace_getustackdepth();
3454 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3455 }
3456 mstate->dtms_present |= DTRACE_MSTATE_USTACKDEPTH;
3457 }
3458 return (mstate->dtms_ustackdepth);
3459
3460 case DIF_VAR_CALLER:
3461 if (!dtrace_priv_kernel(state))
3462 return (0);
3463 if (!(mstate->dtms_present & DTRACE_MSTATE_CALLER)) {
3464 int aframes = mstate->dtms_probe->dtpr_aframes + 2;
3465
3466 if (!DTRACE_ANCHORED(mstate->dtms_probe)) {
3467 /*
3468 * If this is an unanchored probe, we are
3469 * required to go through the slow path:
3470 * dtrace_caller() only guarantees correct
3471 * results for anchored probes.
3472 */
3473 pc_t caller[2];
3474
3475 dtrace_getpcstack(caller, 2, aframes,
3476 (uint32_t *)(uintptr_t)mstate->dtms_arg[0]);
3477 mstate->dtms_caller = caller[1];
3478 } else if ((mstate->dtms_caller =
3479 dtrace_caller(aframes)) == (uintptr_t)-1) {
3480 /*
3481 * We have failed to do this the quick way;
3482 * we must resort to the slower approach of
3483 * calling dtrace_getpcstack().
3484 */
3485 pc_t caller;
3486
3487 dtrace_getpcstack(&caller, 1, aframes, NULL);
3488 mstate->dtms_caller = caller;
3489 }
3490
3491 mstate->dtms_present |= DTRACE_MSTATE_CALLER;
3492 }
3493 return (mstate->dtms_caller);
3494
3495 case DIF_VAR_UCALLER:
3496 if (!dtrace_priv_proc(state))
3497 return (0);
3498
3499 if (!(mstate->dtms_present & DTRACE_MSTATE_UCALLER)) {
3500 uint64_t ustack[3];
3501
3502 /*
3503 * dtrace_getupcstack() fills in the first uint64_t
3504 * with the current PID. The second uint64_t will
3505 * be the program counter at user-level. The third
3506 * uint64_t will contain the caller, which is what
3507 * we're after.
3508 */
3509 ustack[2] = 0;
3510 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
3511 dtrace_getupcstack(ustack, 3);
3512 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
3513 mstate->dtms_ucaller = ustack[2];
3514 mstate->dtms_present |= DTRACE_MSTATE_UCALLER;
3515 }
3516
3517 return (mstate->dtms_ucaller);
3518
3519 case DIF_VAR_PROBEPROV:
3520 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3521 return (dtrace_dif_varstr(
3522 (uintptr_t)mstate->dtms_probe->dtpr_provider->dtpv_name,
3523 state, mstate));
3524
3525 case DIF_VAR_PROBEMOD:
3526 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3527 return (dtrace_dif_varstr(
3528 (uintptr_t)mstate->dtms_probe->dtpr_mod,
3529 state, mstate));
3530
3531 case DIF_VAR_PROBEFUNC:
3532 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3533 return (dtrace_dif_varstr(
3534 (uintptr_t)mstate->dtms_probe->dtpr_func,
3535 state, mstate));
3536
3537 case DIF_VAR_PROBENAME:
3538 ASSERT(mstate->dtms_present & DTRACE_MSTATE_PROBE);
3539 return (dtrace_dif_varstr(
3540 (uintptr_t)mstate->dtms_probe->dtpr_name,
3541 state, mstate));
3542
3543 case DIF_VAR_PID:
3544 if (!dtrace_priv_proc_relaxed(state))
3545 return (0);
3546
3547 /*
3548 * Note that we are assuming that an unanchored probe is
3549 * always due to a high-level interrupt. (And we're assuming
3550 * that there is only a single high level interrupt.)
3551 */
3552 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3553 /* Anchored probe that fires while on an interrupt accrues to process 0 */
3554 return 0;
3555
3556 return ((uint64_t)dtrace_proc_selfpid());
3557
3558 case DIF_VAR_PPID:
3559 if (!dtrace_priv_proc_relaxed(state))
3560 return (0);
3561
3562 /*
3563 * See comment in DIF_VAR_PID.
3564 */
3565 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3566 return (0);
3567
3568 return ((uint64_t)dtrace_proc_selfppid());
3569
3570 case DIF_VAR_TID:
3571 /* We do not need to check for null current_thread() */
3572 return thread_tid(current_thread()); /* globally unique */
3573
3574 case DIF_VAR_PTHREAD_SELF:
3575 if (!dtrace_priv_proc(state))
3576 return (0);
3577
3578 /* Not currently supported, but we should be able to delta the dispatchqaddr and dispatchqoffset to get pthread_self */
3579 return 0;
3580
3581 case DIF_VAR_DISPATCHQADDR:
3582 if (!dtrace_priv_proc(state))
3583 return (0);
3584
3585 /* We do not need to check for null current_thread() */
3586 return thread_dispatchqaddr(current_thread());
3587
3588 case DIF_VAR_EXECNAME:
3589 {
3590 char *xname = (char *)mstate->dtms_scratch_ptr;
3591 char *pname = proc_best_name(curproc);
3592 size_t scratch_size = sizeof(proc_name_t);
3593
3594 /* The scratch allocation's lifetime is that of the clause. */
3595 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3596 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3597 return 0;
3598 }
3599
3600 if (!dtrace_priv_proc_relaxed(state))
3601 return (0);
3602
3603 mstate->dtms_scratch_ptr += scratch_size;
3604 strlcpy(xname, pname, scratch_size);
3605
3606 return ((uint64_t)(uintptr_t)xname);
3607 }
3608
3609
3610 case DIF_VAR_ZONENAME:
3611 {
3612 /* scratch_size is equal to length('global') + 1 for the null-terminator. */
3613 char *zname = (char *)mstate->dtms_scratch_ptr;
3614 size_t scratch_size = 6 + 1;
3615
3616 if (!dtrace_priv_proc(state))
3617 return (0);
3618
3619 /* The scratch allocation's lifetime is that of the clause. */
3620 if (!DTRACE_INSCRATCH(mstate, scratch_size)) {
3621 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
3622 return 0;
3623 }
3624
3625 mstate->dtms_scratch_ptr += scratch_size;
3626
3627 /* The kernel does not provide zonename, it will always return 'global'. */
3628 strlcpy(zname, "global", scratch_size);
3629
3630 return ((uint64_t)(uintptr_t)zname);
3631 }
3632
3633 #if MONOTONIC
3634 case DIF_VAR_CPUINSTRS:
3635 return mt_cur_cpu_instrs();
3636
3637 case DIF_VAR_CPUCYCLES:
3638 return mt_cur_cpu_cycles();
3639
3640 case DIF_VAR_VINSTRS:
3641 return mt_cur_thread_instrs();
3642
3643 case DIF_VAR_VCYCLES:
3644 return mt_cur_thread_cycles();
3645 #else /* MONOTONIC */
3646 case DIF_VAR_CPUINSTRS: /* FALLTHROUGH */
3647 case DIF_VAR_CPUCYCLES: /* FALLTHROUGH */
3648 case DIF_VAR_VINSTRS: /* FALLTHROUGH */
3649 case DIF_VAR_VCYCLES: /* FALLTHROUGH */
3650 return 0;
3651 #endif /* !MONOTONIC */
3652
3653 case DIF_VAR_UID:
3654 if (!dtrace_priv_proc_relaxed(state))
3655 return (0);
3656
3657 /*
3658 * See comment in DIF_VAR_PID.
3659 */
3660 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3661 return (0);
3662
3663 return ((uint64_t) dtrace_proc_selfruid());
3664
3665 case DIF_VAR_GID:
3666 if (!dtrace_priv_proc(state))
3667 return (0);
3668
3669 /*
3670 * See comment in DIF_VAR_PID.
3671 */
3672 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3673 return (0);
3674
3675 if (dtrace_CRED() != NULL)
3676 /* Credential does not require lazy initialization. */
3677 return ((uint64_t)kauth_getgid());
3678 else {
3679 /* proc_lock would be taken under kauth_cred_proc_ref() in kauth_cred_get(). */
3680 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3681 return -1ULL;
3682 }
3683
3684 case DIF_VAR_ERRNO: {
3685 uthread_t uthread = current_uthread();
3686 if (!dtrace_priv_proc(state))
3687 return (0);
3688
3689 /*
3690 * See comment in DIF_VAR_PID.
3691 */
3692 if (DTRACE_ANCHORED(mstate->dtms_probe) && CPU_ON_INTR(CPU))
3693 return (0);
3694
3695 if (uthread)
3696 return (uint64_t)uthread->t_dtrace_errno;
3697 else {
3698 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3699 return -1ULL;
3700 }
3701 }
3702
3703 default:
3704 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
3705 return (0);
3706 }
3707 }
3708
/*
 * States of the JSON extraction state machine implemented by dtrace_json().
 * The numbering starts at 1 and increases by one per state.
 */
typedef enum dtrace_json_state {
	/* Skipping whitespace ahead of the top-level Object or Array. */
	DTRACE_JSON_REST = 1,
	/* Looking for the next key String in an Object. */
	DTRACE_JSON_OBJECT = 2,
	/* Inside a String literal (key, value, or part of an Object). */
	DTRACE_JSON_STRING = 3,
	/* Immediately after a backslash inside a String. */
	DTRACE_JSON_STRING_ESCAPE = 4,
	/* Reading the four hex digits of a unicode escape. */
	DTRACE_JSON_STRING_ESCAPE_UNICODE = 5,
	/* Looking for the colon between a key String and its value. */
	DTRACE_JSON_COLON = 6,
	/* Looking for the comma that follows a value. */
	DTRACE_JSON_COMMA = 7,
	/* Detecting the type of the next value. */
	DTRACE_JSON_VALUE = 8,
	/* Inside a "true", "false" or "null" literal. */
	DTRACE_JSON_IDENTIFIER = 9,
	/* Inside the leading part of a Number literal. */
	DTRACE_JSON_NUMBER = 10,
	/* Inside the fractional part of a Number literal. */
	DTRACE_JSON_NUMBER_FRAC = 11,
	/* Inside the exponent part of a Number literal. */
	DTRACE_JSON_NUMBER_EXP = 12,
	/* Collecting, or skipping over, an entire Object or Array. */
	DTRACE_JSON_COLLECT_OBJECT = 13
} dtrace_json_state_t;
3724
3725 /*
3726 * This function possesses just enough knowledge about JSON to extract a single
3727 * value from a JSON string and store it in the scratch buffer. It is able
3728 * to extract nested object values, and members of arrays by index.
3729 *
3730 * elemlist is a list of JSON keys, stored as packed NUL-terminated strings, to
3731 * be looked up as we descend into the object tree. e.g.
3732 *
3733 * foo[0].bar.baz[32] --> "foo" NUL "0" NUL "bar" NUL "baz" NUL "32" NUL
3734 * with nelems = 5.
3735 *
3736 * The run time of this function must be bounded above by strsize to limit the
3737 * amount of work done in probe context. As such, it is implemented as a
3738 * simple state machine, reading one character at a time using safe loads
3739 * until we find the requested element, hit a parsing error or run off the
3740 * end of the object or string.
3741 *
3742 * As there is no way for a subroutine to return an error without interrupting
3743 * clause execution, we simply return NULL in the event of a missing key or any
3744 * other error condition. Each NULL return in this function is commented with
3745 * the error condition it represents -- parsing or otherwise.
3746 *
3747 * The set of states for the state machine closely matches the JSON
3748 * specification (http://json.org/). Briefly:
3749 *
3750 * DTRACE_JSON_REST:
3751 * Skip whitespace until we find either a top-level Object, moving
3752 * to DTRACE_JSON_OBJECT; or an Array, moving to DTRACE_JSON_VALUE.
3753 *
3754 * DTRACE_JSON_OBJECT:
3755 * Locate the next key String in an Object. Sets a flag to denote
3756 * the next String as a key string and moves to DTRACE_JSON_STRING.
3757 *
3758 * DTRACE_JSON_COLON:
3759 * Skip whitespace until we find the colon that separates key Strings
3760 * from their values. Once found, move to DTRACE_JSON_VALUE.
3761 *
3762 * DTRACE_JSON_VALUE:
3763 * Detects the type of the next value (String, Number, Identifier, Object
3764 * or Array) and routes to the states that process that type. Here we also
3765 * deal with the element selector list if we are requested to traverse down
3766 * into the object tree.
3767 *
3768 * DTRACE_JSON_COMMA:
3769 * Skip whitespace until we find the comma that separates key-value pairs
3770 * in Objects (returning to DTRACE_JSON_OBJECT) or values in Arrays
3771 * (similarly DTRACE_JSON_VALUE). All following literal value processing
3772 * states return to this state at the end of their value, unless otherwise
3773 * noted.
3774 *
3775 * DTRACE_JSON_NUMBER, DTRACE_JSON_NUMBER_FRAC, DTRACE_JSON_NUMBER_EXP:
3776 * Processes a Number literal from the JSON, including any exponent
3777 * component that may be present. Numbers are returned as strings, which
3778 * may be passed to strtoll() if an integer is required.
3779 *
3780 * DTRACE_JSON_IDENTIFIER:
3781 * Processes a "true", "false" or "null" literal in the JSON.
3782 *
3783 * DTRACE_JSON_STRING, DTRACE_JSON_STRING_ESCAPE,
3784 * DTRACE_JSON_STRING_ESCAPE_UNICODE:
3785 * Processes a String literal from the JSON, whether the String denotes
3786 * a key, a value or part of a larger Object. Handles all escape sequences
3787 * present in the specification, including four-digit unicode characters,
3788 * but merely includes the escape sequence without converting it to the
3789 * actual escaped character. If the String is flagged as a key, we
3790 * move to DTRACE_JSON_COLON rather than DTRACE_JSON_COMMA.
3791 *
3792 * DTRACE_JSON_COLLECT_OBJECT:
3793 * This state collects an entire Object (or Array), correctly handling
3794 * embedded strings. If the full element selector list matches this nested
3795 * object, we return the Object in full as a string. If not, we use this
3796 * state to skip to the next value at this level and continue processing.
3797 */
3798 static char *
dtrace_json(uint64_t size,uintptr_t json,char * elemlist,int nelems,char * dest)3799 dtrace_json(uint64_t size, uintptr_t json, char *elemlist, int nelems,
3800 char *dest)
3801 {
3802 dtrace_json_state_t state = DTRACE_JSON_REST;
3803 int64_t array_elem = INT64_MIN;
3804 int64_t array_pos = 0;
3805 uint8_t escape_unicount = 0;
3806 boolean_t string_is_key = B_FALSE;
3807 boolean_t collect_object = B_FALSE;
3808 boolean_t found_key = B_FALSE;
3809 boolean_t in_array = B_FALSE;
3810 uint32_t braces = 0, brackets = 0;
3811 char *elem = elemlist;
3812 char *dd = dest;
3813 uintptr_t cur;
3814
3815 for (cur = json; cur < json + size; cur++) {
3816 char cc = dtrace_load8(cur);
3817 if (cc == '\0')
3818 return (NULL);
3819
3820 switch (state) {
3821 case DTRACE_JSON_REST:
3822 if (isspace(cc))
3823 break;
3824
3825 if (cc == '{') {
3826 state = DTRACE_JSON_OBJECT;
3827 break;
3828 }
3829
3830 if (cc == '[') {
3831 in_array = B_TRUE;
3832 array_pos = 0;
3833 array_elem = dtrace_strtoll(elem, 10, size);
3834 found_key = array_elem == 0 ? B_TRUE : B_FALSE;
3835 state = DTRACE_JSON_VALUE;
3836 break;
3837 }
3838
3839 /*
3840 * ERROR: expected to find a top-level object or array.
3841 */
3842 return (NULL);
3843 case DTRACE_JSON_OBJECT:
3844 if (isspace(cc))
3845 break;
3846
3847 if (cc == '"') {
3848 state = DTRACE_JSON_STRING;
3849 string_is_key = B_TRUE;
3850 break;
3851 }
3852
3853 /*
3854 * ERROR: either the object did not start with a key
3855 * string, or we've run off the end of the object
3856 * without finding the requested key.
3857 */
3858 return (NULL);
3859 case DTRACE_JSON_STRING:
3860 if (cc == '\\') {
3861 *dd++ = '\\';
3862 state = DTRACE_JSON_STRING_ESCAPE;
3863 break;
3864 }
3865
3866 if (cc == '"') {
3867 if (collect_object) {
3868 /*
3869 * We don't reset the dest here, as
3870 * the string is part of a larger
3871 * object being collected.
3872 */
3873 *dd++ = cc;
3874 collect_object = B_FALSE;
3875 state = DTRACE_JSON_COLLECT_OBJECT;
3876 break;
3877 }
3878 *dd = '\0';
3879 dd = dest; /* reset string buffer */
3880 if (string_is_key) {
3881 if (dtrace_strncmp(dest, elem,
3882 size) == 0)
3883 found_key = B_TRUE;
3884 } else if (found_key) {
3885 if (nelems > 1) {
3886 /*
3887 * We expected an object, not
3888 * this string.
3889 */
3890 return (NULL);
3891 }
3892 return (dest);
3893 }
3894 state = string_is_key ? DTRACE_JSON_COLON :
3895 DTRACE_JSON_COMMA;
3896 string_is_key = B_FALSE;
3897 break;
3898 }
3899
3900 *dd++ = cc;
3901 break;
3902 case DTRACE_JSON_STRING_ESCAPE:
3903 *dd++ = cc;
3904 if (cc == 'u') {
3905 escape_unicount = 0;
3906 state = DTRACE_JSON_STRING_ESCAPE_UNICODE;
3907 } else {
3908 state = DTRACE_JSON_STRING;
3909 }
3910 break;
3911 case DTRACE_JSON_STRING_ESCAPE_UNICODE:
3912 if (!isxdigit(cc)) {
				/*
				 * ERROR: invalid unicode escape, expected
				 * four valid hexadecimal digits.
				 */
3917 return (NULL);
3918 }
3919
3920 *dd++ = cc;
3921 if (++escape_unicount == 4)
3922 state = DTRACE_JSON_STRING;
3923 break;
3924 case DTRACE_JSON_COLON:
3925 if (isspace(cc))
3926 break;
3927
3928 if (cc == ':') {
3929 state = DTRACE_JSON_VALUE;
3930 break;
3931 }
3932
3933 /*
3934 * ERROR: expected a colon.
3935 */
3936 return (NULL);
3937 case DTRACE_JSON_COMMA:
3938 if (isspace(cc))
3939 break;
3940
3941 if (cc == ',') {
3942 if (in_array) {
3943 state = DTRACE_JSON_VALUE;
3944 if (++array_pos == array_elem)
3945 found_key = B_TRUE;
3946 } else {
3947 state = DTRACE_JSON_OBJECT;
3948 }
3949 break;
3950 }
3951
3952 /*
3953 * ERROR: either we hit an unexpected character, or
3954 * we reached the end of the object or array without
3955 * finding the requested key.
3956 */
3957 return (NULL);
3958 case DTRACE_JSON_IDENTIFIER:
3959 if (islower(cc)) {
3960 *dd++ = cc;
3961 break;
3962 }
3963
3964 *dd = '\0';
3965 dd = dest; /* reset string buffer */
3966
3967 if (dtrace_strncmp(dest, "true", 5) == 0 ||
3968 dtrace_strncmp(dest, "false", 6) == 0 ||
3969 dtrace_strncmp(dest, "null", 5) == 0) {
3970 if (found_key) {
3971 if (nelems > 1) {
3972 /*
3973 * ERROR: We expected an object,
3974 * not this identifier.
3975 */
3976 return (NULL);
3977 }
3978 return (dest);
3979 } else {
3980 cur--;
3981 state = DTRACE_JSON_COMMA;
3982 break;
3983 }
3984 }
3985
3986 /*
3987 * ERROR: we did not recognise the identifier as one
3988 * of those in the JSON specification.
3989 */
3990 return (NULL);
3991 case DTRACE_JSON_NUMBER:
3992 if (cc == '.') {
3993 *dd++ = cc;
3994 state = DTRACE_JSON_NUMBER_FRAC;
3995 break;
3996 }
3997
3998 if (cc == 'x' || cc == 'X') {
3999 /*
4000 * ERROR: specification explicitly excludes
4001 * hexadecimal or octal numbers.
4002 */
4003 return (NULL);
4004 }
4005
4006 OS_FALLTHROUGH;
4007 case DTRACE_JSON_NUMBER_FRAC:
4008 if (cc == 'e' || cc == 'E') {
4009 *dd++ = cc;
4010 state = DTRACE_JSON_NUMBER_EXP;
4011 break;
4012 }
4013
4014 if (cc == '+' || cc == '-') {
4015 /*
4016 * ERROR: expect sign as part of exponent only.
4017 */
4018 return (NULL);
4019 }
4020 OS_FALLTHROUGH;
4021 case DTRACE_JSON_NUMBER_EXP:
4022 if (isdigit(cc) || cc == '+' || cc == '-') {
4023 *dd++ = cc;
4024 break;
4025 }
4026
4027 *dd = '\0';
4028 dd = dest; /* reset string buffer */
4029 if (found_key) {
4030 if (nelems > 1) {
4031 /*
4032 * ERROR: We expected an object, not
4033 * this number.
4034 */
4035 return (NULL);
4036 }
4037 return (dest);
4038 }
4039
4040 cur--;
4041 state = DTRACE_JSON_COMMA;
4042 break;
4043 case DTRACE_JSON_VALUE:
4044 if (isspace(cc))
4045 break;
4046
4047 if (cc == '{' || cc == '[') {
4048 if (nelems > 1 && found_key) {
4049 in_array = cc == '[' ? B_TRUE : B_FALSE;
4050 /*
4051 * If our element selector directs us
4052 * to descend into this nested object,
4053 * then move to the next selector
4054 * element in the list and restart the
4055 * state machine.
4056 */
4057 while (*elem != '\0')
4058 elem++;
4059 elem++; /* skip the inter-element NUL */
4060 nelems--;
4061 dd = dest;
4062 if (in_array) {
4063 state = DTRACE_JSON_VALUE;
4064 array_pos = 0;
4065 array_elem = dtrace_strtoll(
4066 elem, 10, size);
4067 found_key = array_elem == 0 ?
4068 B_TRUE : B_FALSE;
4069 } else {
4070 found_key = B_FALSE;
4071 state = DTRACE_JSON_OBJECT;
4072 }
4073 break;
4074 }
4075
4076 /*
4077 * Otherwise, we wish to either skip this
4078 * nested object or return it in full.
4079 */
4080 if (cc == '[')
4081 brackets = 1;
4082 else
4083 braces = 1;
4084 *dd++ = cc;
4085 state = DTRACE_JSON_COLLECT_OBJECT;
4086 break;
4087 }
4088
4089 if (cc == '"') {
4090 state = DTRACE_JSON_STRING;
4091 break;
4092 }
4093
4094 if (islower(cc)) {
4095 /*
4096 * Here we deal with true, false and null.
4097 */
4098 *dd++ = cc;
4099 state = DTRACE_JSON_IDENTIFIER;
4100 break;
4101 }
4102
4103 if (cc == '-' || isdigit(cc)) {
4104 *dd++ = cc;
4105 state = DTRACE_JSON_NUMBER;
4106 break;
4107 }
4108
4109 /*
4110 * ERROR: unexpected character at start of value.
4111 */
4112 return (NULL);
4113 case DTRACE_JSON_COLLECT_OBJECT:
4114 if (cc == '\0')
4115 /*
4116 * ERROR: unexpected end of input.
4117 */
4118 return (NULL);
4119
4120 *dd++ = cc;
4121 if (cc == '"') {
4122 collect_object = B_TRUE;
4123 state = DTRACE_JSON_STRING;
4124 break;
4125 }
4126
4127 if (cc == ']') {
4128 if (brackets-- == 0) {
4129 /*
4130 * ERROR: unbalanced brackets.
4131 */
4132 return (NULL);
4133 }
4134 } else if (cc == '}') {
4135 if (braces-- == 0) {
4136 /*
4137 * ERROR: unbalanced braces.
4138 */
4139 return (NULL);
4140 }
4141 } else if (cc == '{') {
4142 braces++;
4143 } else if (cc == '[') {
4144 brackets++;
4145 }
4146
4147 if (brackets == 0 && braces == 0) {
4148 if (found_key) {
4149 *dd = '\0';
4150 return (dest);
4151 }
4152 dd = dest; /* reset string buffer */
4153 state = DTRACE_JSON_COMMA;
4154 }
4155 break;
4156 }
4157 }
4158 return (NULL);
4159 }
4160
4161 /*
4162 * Emulate the execution of DTrace ID subroutines invoked by the call opcode.
4163 * Notice that we don't bother validating the number of arguments or
4164 * their types in the tuple stack. Such validation isn't needed: our load
4165 * safety makes all argument interpretation safe -- the worst that can
4166 * happen is that a bogus program can obtain bogus results.
4167 */
4168 static void
dtrace_dif_subr(uint_t subr,uint_t rd,uint64_t * regs,dtrace_key_t * tupregs,int nargs,dtrace_mstate_t * mstate,dtrace_state_t * state)4169 dtrace_dif_subr(uint_t subr, uint_t rd, uint64_t *regs,
4170 dtrace_key_t *tupregs, int nargs,
4171 dtrace_mstate_t *mstate, dtrace_state_t *state)
4172 {
4173 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
4174 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
4175 dtrace_vstate_t *vstate = &state->dts_vstate;
4176
4177 #if !defined(__APPLE__)
4178 union {
4179 mutex_impl_t mi;
4180 uint64_t mx;
4181 } m;
4182
4183 union {
4184 krwlock_t ri;
4185 uintptr_t rw;
4186 } r;
4187 #else
4188 /* FIXME: awaits lock/mutex work */
4189 #endif /* __APPLE__ */
4190
4191 switch (subr) {
4192 case DIF_SUBR_RAND:
4193 regs[rd] = dtrace_xoroshiro128_plus_next(
4194 state->dts_rstate[CPU->cpu_id]);
4195 break;
4196
4197 #if !defined(__APPLE__)
4198 case DIF_SUBR_MUTEX_OWNED:
4199 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4200 mstate, vstate)) {
4201 regs[rd] = 0;
4202 break;
4203 }
4204
4205 m.mx = dtrace_load64(tupregs[0].dttk_value);
4206 if (MUTEX_TYPE_ADAPTIVE(&m.mi))
4207 regs[rd] = MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER;
4208 else
4209 regs[rd] = LOCK_HELD(&m.mi.m_spin.m_spinlock);
4210 break;
4211
4212 case DIF_SUBR_MUTEX_OWNER:
4213 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4214 mstate, vstate)) {
4215 regs[rd] = 0;
4216 break;
4217 }
4218
4219 m.mx = dtrace_load64(tupregs[0].dttk_value);
4220 if (MUTEX_TYPE_ADAPTIVE(&m.mi) &&
4221 MUTEX_OWNER(&m.mi) != MUTEX_NO_OWNER)
4222 regs[rd] = (uintptr_t)MUTEX_OWNER(&m.mi);
4223 else
4224 regs[rd] = 0;
4225 break;
4226
4227 case DIF_SUBR_MUTEX_TYPE_ADAPTIVE:
4228 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4229 mstate, vstate)) {
4230 regs[rd] = 0;
4231 break;
4232 }
4233
4234 m.mx = dtrace_load64(tupregs[0].dttk_value);
4235 regs[rd] = MUTEX_TYPE_ADAPTIVE(&m.mi);
4236 break;
4237
4238 case DIF_SUBR_MUTEX_TYPE_SPIN:
4239 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (kmutex_t),
4240 mstate, vstate)) {
4241 regs[rd] = 0;
4242 break;
4243 }
4244
4245 m.mx = dtrace_load64(tupregs[0].dttk_value);
4246 regs[rd] = MUTEX_TYPE_SPIN(&m.mi);
4247 break;
4248
4249 case DIF_SUBR_RW_READ_HELD: {
4250 uintptr_t tmp;
4251
4252 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (uintptr_t),
4253 mstate, vstate)) {
4254 regs[rd] = 0;
4255 break;
4256 }
4257
4258 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4259 regs[rd] = _RW_READ_HELD(&r.ri, tmp);
4260 break;
4261 }
4262
4263 case DIF_SUBR_RW_WRITE_HELD:
4264 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4265 mstate, vstate)) {
4266 regs[rd] = 0;
4267 break;
4268 }
4269
4270 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4271 regs[rd] = _RW_WRITE_HELD(&r.ri);
4272 break;
4273
4274 case DIF_SUBR_RW_ISWRITER:
4275 if (!dtrace_canload(tupregs[0].dttk_value, sizeof (krwlock_t),
4276 mstate, vstate)) {
4277 regs[rd] = 0;
4278 break;
4279 }
4280
4281 r.rw = dtrace_loadptr(tupregs[0].dttk_value);
4282 regs[rd] = _RW_ISWRITER(&r.ri);
4283 break;
4284 #else
4285 /* FIXME: awaits lock/mutex work */
4286 #endif /* __APPLE__ */
4287
4288 case DIF_SUBR_BCOPY: {
4289 /*
4290 * We need to be sure that the destination is in the scratch
4291 * region -- no other region is allowed.
4292 */
4293 uintptr_t src = tupregs[0].dttk_value;
4294 uintptr_t dest = tupregs[1].dttk_value;
4295 size_t size = tupregs[2].dttk_value;
4296
4297 if (!dtrace_inscratch(dest, size, mstate)) {
4298 *flags |= CPU_DTRACE_BADADDR;
4299 *illval = regs[rd];
4300 break;
4301 }
4302
4303 if (!dtrace_canload(src, size, mstate, vstate)) {
4304 regs[rd] = 0;
4305 break;
4306 }
4307
4308 dtrace_bcopy((void *)src, (void *)dest, size);
4309 break;
4310 }
4311
4312 case DIF_SUBR_ALLOCA:
4313 case DIF_SUBR_COPYIN: {
4314 uintptr_t dest = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
4315 uint64_t size =
4316 tupregs[subr == DIF_SUBR_ALLOCA ? 0 : 1].dttk_value;
4317 size_t scratch_size = (dest - mstate->dtms_scratch_ptr) + size;
4318
4319 /*
4320 * Check whether the user can access kernel memory
4321 */
4322 if (dtrace_priv_kernel(state) == 0) {
4323 DTRACE_CPUFLAG_SET(CPU_DTRACE_KPRIV);
4324 regs[rd] = 0;
4325 break;
4326 }
4327 /*
4328 * This action doesn't require any credential checks since
4329 * probes will not activate in user contexts to which the
4330 * enabling user does not have permissions.
4331 */
4332
4333 /*
4334 * Rounding up the user allocation size could have overflowed
4335 * a large, bogus allocation (like -1ULL) to 0.
4336 */
4337 if (scratch_size < size ||
4338 !DTRACE_INSCRATCH(mstate, scratch_size)) {
4339 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4340 regs[rd] = 0;
4341 break;
4342 }
4343
4344 if (subr == DIF_SUBR_COPYIN) {
4345 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4346 if (dtrace_priv_proc(state))
4347 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4348 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4349 }
4350
4351 mstate->dtms_scratch_ptr += scratch_size;
4352 regs[rd] = dest;
4353 break;
4354 }
4355
4356 case DIF_SUBR_COPYINTO: {
4357 uint64_t size = tupregs[1].dttk_value;
4358 uintptr_t dest = tupregs[2].dttk_value;
4359
4360 /*
4361 * This action doesn't require any credential checks since
4362 * probes will not activate in user contexts to which the
4363 * enabling user does not have permissions.
4364 */
4365 if (!dtrace_inscratch(dest, size, mstate)) {
4366 *flags |= CPU_DTRACE_BADADDR;
4367 *illval = regs[rd];
4368 break;
4369 }
4370
4371 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4372 if (dtrace_priv_proc(state))
4373 dtrace_copyin(tupregs[0].dttk_value, dest, size, flags);
4374 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4375 break;
4376 }
4377
4378 case DIF_SUBR_COPYINSTR: {
4379 uintptr_t dest = mstate->dtms_scratch_ptr;
4380 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4381
4382 if (nargs > 1 && tupregs[1].dttk_value < size)
4383 size = tupregs[1].dttk_value + 1;
4384
4385 /*
4386 * This action doesn't require any credential checks since
4387 * probes will not activate in user contexts to which the
4388 * enabling user does not have permissions.
4389 */
4390 if (!DTRACE_INSCRATCH(mstate, size)) {
4391 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4392 regs[rd] = 0;
4393 break;
4394 }
4395
4396 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4397 if (dtrace_priv_proc(state))
4398 dtrace_copyinstr(tupregs[0].dttk_value, dest, size, flags);
4399 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4400
4401 ((char *)dest)[size - 1] = '\0';
4402 mstate->dtms_scratch_ptr += size;
4403 regs[rd] = dest;
4404 break;
4405 }
4406
4407 case DIF_SUBR_MSGSIZE:
4408 case DIF_SUBR_MSGDSIZE: {
4409 /* Darwin does not implement SysV streams messages */
4410 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4411 regs[rd] = 0;
4412 break;
4413 }
4414
4415 case DIF_SUBR_PROGENYOF: {
4416 pid_t pid = tupregs[0].dttk_value;
4417 struct proc *p = current_proc();
4418 int rval = 0, lim = nprocs;
4419
4420 while(p && (lim-- > 0)) {
4421 pid_t ppid;
4422
4423 ppid = (pid_t)dtrace_load32((uintptr_t)&(p->p_pid));
4424 if (*flags & CPU_DTRACE_FAULT)
4425 break;
4426
4427 if (ppid == pid) {
4428 rval = 1;
4429 break;
4430 }
4431
4432 if (ppid == 0)
4433 break; /* Can't climb process tree any further. */
4434
4435 p = (struct proc *)dtrace_loadptr((uintptr_t)&(p->p_pptr));
4436 #if __has_feature(ptrauth_calls)
4437 p = ptrauth_strip(p, ptrauth_key_process_independent_data);
4438 #endif
4439 if (*flags & CPU_DTRACE_FAULT)
4440 break;
4441 }
4442
4443 regs[rd] = rval;
4444 break;
4445 }
4446
4447 case DIF_SUBR_SPECULATION:
4448 regs[rd] = dtrace_speculation(state);
4449 break;
4450
4451
4452 case DIF_SUBR_COPYOUT: {
4453 uintptr_t kaddr = tupregs[0].dttk_value;
4454 user_addr_t uaddr = tupregs[1].dttk_value;
4455 uint64_t size = tupregs[2].dttk_value;
4456
4457 if (!dtrace_destructive_disallow &&
4458 dtrace_priv_proc_control(state) &&
4459 !dtrace_istoxic(kaddr, size) &&
4460 dtrace_canload(kaddr, size, mstate, vstate)) {
4461 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4462 dtrace_copyout(kaddr, uaddr, size, flags);
4463 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4464 }
4465 break;
4466 }
4467
4468 case DIF_SUBR_COPYOUTSTR: {
4469 uintptr_t kaddr = tupregs[0].dttk_value;
4470 user_addr_t uaddr = tupregs[1].dttk_value;
4471 uint64_t size = tupregs[2].dttk_value;
4472 size_t lim;
4473
4474 if (!dtrace_destructive_disallow &&
4475 dtrace_priv_proc_control(state) &&
4476 !dtrace_istoxic(kaddr, size) &&
4477 dtrace_strcanload(kaddr, size, &lim, mstate, vstate)) {
4478 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
4479 dtrace_copyoutstr(kaddr, uaddr, lim, flags);
4480 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
4481 }
4482 break;
4483 }
4484
4485 case DIF_SUBR_STRLEN: {
4486 size_t size = state->dts_options[DTRACEOPT_STRSIZE];
4487 uintptr_t addr = (uintptr_t)tupregs[0].dttk_value;
4488 size_t lim;
4489
4490 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4491 regs[rd] = 0;
4492 break;
4493 }
4494
4495 regs[rd] = dtrace_strlen((char *)addr, lim);
4496
4497 break;
4498 }
4499
4500 case DIF_SUBR_STRCHR:
4501 case DIF_SUBR_STRRCHR: {
4502 /*
4503 * We're going to iterate over the string looking for the
4504 * specified character. We will iterate until we have reached
4505 * the string length or we have found the character. If this
4506 * is DIF_SUBR_STRRCHR, we will look for the last occurrence
4507 * of the specified character instead of the first.
4508 */
4509 uintptr_t addr = tupregs[0].dttk_value;
4510 uintptr_t addr_limit;
4511 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4512 size_t lim;
4513 char c, target = (char)tupregs[1].dttk_value;
4514
4515 if (!dtrace_strcanload(addr, size, &lim, mstate, vstate)) {
4516 regs[rd] = 0;
4517 break;
4518 }
4519 addr_limit = addr + lim;
4520
4521 for (regs[rd] = 0; addr < addr_limit; addr++) {
4522 if ((c = dtrace_load8(addr)) == target) {
4523 regs[rd] = addr;
4524
4525 if (subr == DIF_SUBR_STRCHR)
4526 break;
4527 }
4528
4529 if (c == '\0')
4530 break;
4531 }
4532
4533 break;
4534 }
4535
4536 case DIF_SUBR_STRSTR:
4537 case DIF_SUBR_INDEX:
4538 case DIF_SUBR_RINDEX: {
4539 /*
4540 * We're going to iterate over the string looking for the
4541 * specified string. We will iterate until we have reached
4542 * the string length or we have found the string. (Yes, this
4543 * is done in the most naive way possible -- but considering
4544 * that the string we're searching for is likely to be
4545 * relatively short, the complexity of Rabin-Karp or similar
4546 * hardly seems merited.)
4547 */
4548 char *addr = (char *)(uintptr_t)tupregs[0].dttk_value;
4549 char *substr = (char *)(uintptr_t)tupregs[1].dttk_value;
4550 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4551 size_t len = dtrace_strlen(addr, size);
4552 size_t sublen = dtrace_strlen(substr, size);
4553 char *limit = addr + len, *orig = addr;
4554 int notfound = subr == DIF_SUBR_STRSTR ? 0 : -1;
4555 int inc = 1;
4556
4557 regs[rd] = notfound;
4558
4559 if (!dtrace_canload((uintptr_t)addr, len + 1, mstate, vstate)) {
4560 regs[rd] = 0;
4561 break;
4562 }
4563
4564 if (!dtrace_canload((uintptr_t)substr, sublen + 1, mstate,
4565 vstate)) {
4566 regs[rd] = 0;
4567 break;
4568 }
4569
4570 /*
4571 * strstr() and index()/rindex() have similar semantics if
4572 * both strings are the empty string: strstr() returns a
4573 * pointer to the (empty) string, and index() and rindex()
4574 * both return index 0 (regardless of any position argument).
4575 */
4576 if (sublen == 0 && len == 0) {
4577 if (subr == DIF_SUBR_STRSTR)
4578 regs[rd] = (uintptr_t)addr;
4579 else
4580 regs[rd] = 0;
4581 break;
4582 }
4583
4584 if (subr != DIF_SUBR_STRSTR) {
4585 if (subr == DIF_SUBR_RINDEX) {
4586 limit = orig - 1;
4587 addr += len;
4588 inc = -1;
4589 }
4590
4591 /*
4592 * Both index() and rindex() take an optional position
4593 * argument that denotes the starting position.
4594 */
4595 if (nargs == 3) {
4596 int64_t pos = (int64_t)tupregs[2].dttk_value;
4597
4598 /*
4599 * If the position argument to index() is
4600 * negative, Perl implicitly clamps it at
4601 * zero. This semantic is a little surprising
4602 * given the special meaning of negative
4603 * positions to similar Perl functions like
4604 * substr(), but it appears to reflect a
4605 * notion that index() can start from a
4606 * negative index and increment its way up to
4607 * the string. Given this notion, Perl's
4608 * rindex() is at least self-consistent in
4609 * that it implicitly clamps positions greater
4610 * than the string length to be the string
4611 * length. Where Perl completely loses
4612 * coherence, however, is when the specified
4613 * substring is the empty string (""). In
4614 * this case, even if the position is
4615 * negative, rindex() returns 0 -- and even if
4616 * the position is greater than the length,
4617 * index() returns the string length. These
4618 * semantics violate the notion that index()
4619 * should never return a value less than the
4620 * specified position and that rindex() should
4621 * never return a value greater than the
4622 * specified position. (One assumes that
4623 * these semantics are artifacts of Perl's
4624 * implementation and not the results of
4625 * deliberate design -- it beggars belief that
4626 * even Larry Wall could desire such oddness.)
4627 * While in the abstract one would wish for
4628 * consistent position semantics across
4629 * substr(), index() and rindex() -- or at the
4630 * very least self-consistent position
4631 * semantics for index() and rindex() -- we
4632 * instead opt to keep with the extant Perl
4633 * semantics, in all their broken glory. (Do
4634 * we have more desire to maintain Perl's
4635 * semantics than Perl does? Probably.)
4636 */
4637 if (subr == DIF_SUBR_RINDEX) {
4638 if (pos < 0) {
4639 if (sublen == 0)
4640 regs[rd] = 0;
4641 break;
4642 }
4643
4644 if ((size_t)pos > len)
4645 pos = len;
4646 } else {
4647 if (pos < 0)
4648 pos = 0;
4649
4650 if ((size_t)pos >= len) {
4651 if (sublen == 0)
4652 regs[rd] = len;
4653 break;
4654 }
4655 }
4656
4657 addr = orig + pos;
4658 }
4659 }
4660
4661 for (regs[rd] = notfound; addr != limit; addr += inc) {
4662 if (dtrace_strncmp(addr, substr, sublen) == 0) {
4663 if (subr != DIF_SUBR_STRSTR) {
4664 /*
4665 * As D index() and rindex() are
4666 * modeled on Perl (and not on awk),
4667 * we return a zero-based (and not a
4668 * one-based) index. (For you Perl
4669 * weenies: no, we're not going to add
4670 * $[ -- and shouldn't you be at a con
4671 * or something?)
4672 */
4673 regs[rd] = (uintptr_t)(addr - orig);
4674 break;
4675 }
4676
4677 ASSERT(subr == DIF_SUBR_STRSTR);
4678 regs[rd] = (uintptr_t)addr;
4679 break;
4680 }
4681 }
4682
4683 break;
4684 }
4685
4686 case DIF_SUBR_STRTOK: {
4687 uintptr_t addr = tupregs[0].dttk_value;
4688 uintptr_t tokaddr = tupregs[1].dttk_value;
4689 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4690 uintptr_t limit, toklimit;
4691 size_t clim;
4692 char *dest = (char *)mstate->dtms_scratch_ptr;
4693 uint8_t c='\0', tokmap[32]; /* 256 / 8 */
4694 uint64_t i = 0;
4695
4696 /*
4697 * Check both the token buffer and (later) the input buffer,
4698 * since both could be non-scratch addresses.
4699 */
4700 if (!dtrace_strcanload(tokaddr, size, &clim, mstate, vstate)) {
4701 regs[rd] = 0;
4702 break;
4703 }
4704 toklimit = tokaddr + clim;
4705
4706 if (!DTRACE_INSCRATCH(mstate, size)) {
4707 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4708 regs[rd] = 0;
4709 break;
4710 }
4711
4712 if (addr == 0) {
4713 /*
4714 * If the address specified is NULL, we use our saved
4715 * strtok pointer from the mstate. Note that this
4716 * means that the saved strtok pointer is _only_
4717 * valid within multiple enablings of the same probe --
4718 * it behaves like an implicit clause-local variable.
4719 */
4720 addr = mstate->dtms_strtok;
4721 limit = mstate->dtms_strtok_limit;
4722 } else {
4723 /*
4724 * If the user-specified address is non-NULL we must
4725 * access check it. This is the only time we have
4726 * a chance to do so, since this address may reside
4727 * in the string table of this clause -- future calls
4728 * (when we fetch addr from mstate->dtms_strtok)
4729 * would fail this access check.
4730 */
4731 if (!dtrace_strcanload(addr, size, &clim, mstate,
4732 vstate)) {
4733 regs[rd] = 0;
4734 break;
4735 }
4736 limit = addr + clim;
4737 }
4738
4739 /*
4740 * First, zero the token map, and then process the token
4741 * string -- setting a bit in the map for every character
4742 * found in the token string.
4743 */
4744 for (i = 0; i < (int)sizeof (tokmap); i++)
4745 tokmap[i] = 0;
4746
4747 for (; tokaddr < toklimit; tokaddr++) {
4748 if ((c = dtrace_load8(tokaddr)) == '\0')
4749 break;
4750
4751 ASSERT((c >> 3) < sizeof (tokmap));
4752 tokmap[c >> 3] |= (1 << (c & 0x7));
4753 }
4754
4755 for (; addr < limit; addr++) {
4756 /*
4757 * We're looking for a character that is _not_
4758 * contained in the token string.
4759 */
4760 if ((c = dtrace_load8(addr)) == '\0')
4761 break;
4762
4763 if (!(tokmap[c >> 3] & (1 << (c & 0x7))))
4764 break;
4765 }
4766
4767 if (c == '\0') {
4768 /*
4769 * We reached the end of the string without finding
4770 * any character that was not in the token string.
4771 * We return NULL in this case, and we set the saved
4772 * address to NULL as well.
4773 */
4774 regs[rd] = 0;
4775 mstate->dtms_strtok = 0;
4776 mstate->dtms_strtok_limit = 0;
4777 break;
4778 }
4779
4780 /*
4781 * From here on, we're copying into the destination string.
4782 */
4783 for (i = 0; addr < limit && i < size - 1; addr++) {
4784 if ((c = dtrace_load8(addr)) == '\0')
4785 break;
4786
4787 if (tokmap[c >> 3] & (1 << (c & 0x7)))
4788 break;
4789
4790 ASSERT(i < size);
4791 dest[i++] = c;
4792 }
4793
4794 ASSERT(i < size);
4795 dest[i] = '\0';
4796 regs[rd] = (uintptr_t)dest;
4797 mstate->dtms_scratch_ptr += size;
4798 mstate->dtms_strtok = addr;
4799 mstate->dtms_strtok_limit = limit;
4800 break;
4801 }
4802
4803 case DIF_SUBR_SUBSTR: {
4804 uintptr_t s = tupregs[0].dttk_value;
4805 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4806 char *d = (char *)mstate->dtms_scratch_ptr;
4807 int64_t index = (int64_t)tupregs[1].dttk_value;
4808 int64_t remaining = (int64_t)tupregs[2].dttk_value;
4809 size_t len = dtrace_strlen((char *)s, size);
4810 int64_t i = 0;
4811
4812 if (!dtrace_canload(s, len + 1, mstate, vstate)) {
4813 regs[rd] = 0;
4814 break;
4815 }
4816
4817 if (!DTRACE_INSCRATCH(mstate, size)) {
4818 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4819 regs[rd] = 0;
4820 break;
4821 }
4822
4823 if (nargs <= 2)
4824 remaining = (int64_t)size;
4825
4826 if (index < 0) {
4827 index += len;
4828
4829 if (index < 0 && index + remaining > 0) {
4830 remaining += index;
4831 index = 0;
4832 }
4833 }
4834
4835 if ((size_t)index >= len || index < 0) {
4836 remaining = 0;
4837 } else if (remaining < 0) {
4838 remaining += len - index;
4839 } else if ((uint64_t)index + (uint64_t)remaining > size) {
4840 remaining = size - index;
4841 }
4842
4843 for (i = 0; i < remaining; i++) {
4844 if ((d[i] = dtrace_load8(s + index + i)) == '\0')
4845 break;
4846 }
4847
4848 d[i] = '\0';
4849
4850 mstate->dtms_scratch_ptr += size;
4851 regs[rd] = (uintptr_t)d;
4852 break;
4853 }
4854
4855 case DIF_SUBR_GETMAJOR:
4856 regs[rd] = (uintptr_t)major( (dev_t)tupregs[0].dttk_value );
4857 break;
4858
4859 case DIF_SUBR_GETMINOR:
4860 regs[rd] = (uintptr_t)minor( (dev_t)tupregs[0].dttk_value );
4861 break;
4862
4863 case DIF_SUBR_DDI_PATHNAME: {
4864 /* APPLE NOTE: currently unsupported on Darwin */
4865 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
4866 regs[rd] = 0;
4867 break;
4868 }
4869
4870 case DIF_SUBR_STRJOIN: {
4871 char *d = (char *)mstate->dtms_scratch_ptr;
4872 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4873 uintptr_t s1 = tupregs[0].dttk_value;
4874 uintptr_t s2 = tupregs[1].dttk_value;
4875 uint64_t i = 0, j = 0;
4876 size_t lim1, lim2;
4877 char c;
4878
4879 if (!dtrace_strcanload(s1, size, &lim1, mstate, vstate) ||
4880 !dtrace_strcanload(s2, size, &lim2, mstate, vstate)) {
4881 regs[rd] = 0;
4882 break;
4883 }
4884
4885 if (!DTRACE_INSCRATCH(mstate, size)) {
4886 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4887 regs[rd] = 0;
4888 break;
4889 }
4890
4891 for (;;) {
4892 if (i >= size) {
4893 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4894 regs[rd] = 0;
4895 break;
4896 }
4897 c = (i >= lim1) ? '\0' : dtrace_load8(s1++);
4898 if ((d[i++] = c) == '\0') {
4899 i--;
4900 break;
4901 }
4902 }
4903
4904 for (;;) {
4905 if (i >= size) {
4906 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4907 regs[rd] = 0;
4908 break;
4909 }
4910 c = (j++ >= lim2) ? '\0' : dtrace_load8(s2++);
4911 if ((d[i++] = c) == '\0')
4912 break;
4913 }
4914
4915 if (i < size) {
4916 mstate->dtms_scratch_ptr += i;
4917 regs[rd] = (uintptr_t)d;
4918 }
4919
4920 break;
4921 }
4922
4923 case DIF_SUBR_STRTOLL: {
4924 uintptr_t s = tupregs[0].dttk_value;
4925 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
4926 size_t lim;
4927 int base = 10;
4928
4929 if (nargs > 1) {
4930 if ((base = tupregs[1].dttk_value) <= 1 ||
4931 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4932 *flags |= CPU_DTRACE_ILLOP;
4933 break;
4934 }
4935 }
4936
4937 if (!dtrace_strcanload(s, size, &lim, mstate, vstate)) {
4938 regs[rd] = INT64_MIN;
4939 break;
4940 }
4941
4942 regs[rd] = dtrace_strtoll((char *)s, base, lim);
4943 break;
4944 }
4945
4946 case DIF_SUBR_LLTOSTR: {
4947 int64_t i = (int64_t)tupregs[0].dttk_value;
4948 uint64_t val, digit;
4949 uint64_t size = 65; /* enough room for 2^64 in binary */
4950 char *end = (char *)mstate->dtms_scratch_ptr + size - 1;
4951 int base = 10;
4952
4953 if (nargs > 1) {
4954 if ((base = tupregs[1].dttk_value) <= 1 ||
4955 base > ('z' - 'a' + 1) + ('9' - '0' + 1)) {
4956 *flags |= CPU_DTRACE_ILLOP;
4957 break;
4958 }
4959 }
4960
4961 val = (base == 10 && i < 0) ? i * -1 : i;
4962
4963 if (!DTRACE_INSCRATCH(mstate, size)) {
4964 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
4965 regs[rd] = 0;
4966 break;
4967 }
4968
4969 for (*end-- = '\0'; val; val /= base) {
4970 if ((digit = val % base) <= '9' - '0') {
4971 *end-- = '0' + digit;
4972 } else {
4973 *end-- = 'a' + (digit - ('9' - '0') - 1);
4974 }
4975 }
4976
4977 if (i == 0 && base == 16)
4978 *end-- = '0';
4979
4980 if (base == 16)
4981 *end-- = 'x';
4982
4983 if (i == 0 || base == 8 || base == 16)
4984 *end-- = '0';
4985
4986 if (i < 0 && base == 10)
4987 *end-- = '-';
4988
4989 regs[rd] = (uintptr_t)end + 1;
4990 mstate->dtms_scratch_ptr += size;
4991 break;
4992 }
4993
4994 case DIF_SUBR_HTONS:
4995 case DIF_SUBR_NTOHS:
4996 #ifdef _BIG_ENDIAN
4997 regs[rd] = (uint16_t)tupregs[0].dttk_value;
4998 #else
4999 regs[rd] = DT_BSWAP_16((uint16_t)tupregs[0].dttk_value);
5000 #endif
5001 break;
5002
5003
5004 case DIF_SUBR_HTONL:
5005 case DIF_SUBR_NTOHL:
5006 #ifdef _BIG_ENDIAN
5007 regs[rd] = (uint32_t)tupregs[0].dttk_value;
5008 #else
5009 regs[rd] = DT_BSWAP_32((uint32_t)tupregs[0].dttk_value);
5010 #endif
5011 break;
5012
5013
5014 case DIF_SUBR_HTONLL:
5015 case DIF_SUBR_NTOHLL:
5016 #ifdef _BIG_ENDIAN
5017 regs[rd] = (uint64_t)tupregs[0].dttk_value;
5018 #else
5019 regs[rd] = DT_BSWAP_64((uint64_t)tupregs[0].dttk_value);
5020 #endif
5021 break;
5022
5023
5024 case DIF_SUBR_DIRNAME:
5025 case DIF_SUBR_BASENAME: {
5026 char *dest = (char *)mstate->dtms_scratch_ptr;
5027 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5028 uintptr_t src = tupregs[0].dttk_value;
5029 int i, j, len = dtrace_strlen((char *)src, size);
5030 int lastbase = -1, firstbase = -1, lastdir = -1;
5031 int start, end;
5032
5033 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5034 regs[rd] = 0;
5035 break;
5036 }
5037
5038 if (!DTRACE_INSCRATCH(mstate, size)) {
5039 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5040 regs[rd] = 0;
5041 break;
5042 }
5043
5044 /*
5045 * The basename and dirname for a zero-length string is
5046 * defined to be "."
5047 */
5048 if (len == 0) {
5049 len = 1;
5050 src = (uintptr_t)".";
5051 }
5052
5053 /*
5054 * Start from the back of the string, moving back toward the
5055 * front until we see a character that isn't a slash. That
5056 * character is the last character in the basename.
5057 */
5058 for (i = len - 1; i >= 0; i--) {
5059 if (dtrace_load8(src + i) != '/')
5060 break;
5061 }
5062
5063 if (i >= 0)
5064 lastbase = i;
5065
5066 /*
5067 * Starting from the last character in the basename, move
5068 * towards the front until we find a slash. The character
5069 * that we processed immediately before that is the first
5070 * character in the basename.
5071 */
5072 for (; i >= 0; i--) {
5073 if (dtrace_load8(src + i) == '/')
5074 break;
5075 }
5076
5077 if (i >= 0)
5078 firstbase = i + 1;
5079
5080 /*
5081 * Now keep going until we find a non-slash character. That
5082 * character is the last character in the dirname.
5083 */
5084 for (; i >= 0; i--) {
5085 if (dtrace_load8(src + i) != '/')
5086 break;
5087 }
5088
5089 if (i >= 0)
5090 lastdir = i;
5091
5092 ASSERT(!(lastbase == -1 && firstbase != -1));
5093 ASSERT(!(firstbase == -1 && lastdir != -1));
5094
5095 if (lastbase == -1) {
5096 /*
5097 * We didn't find a non-slash character. We know that
5098 * the length is non-zero, so the whole string must be
5099 * slashes. In either the dirname or the basename
5100 * case, we return '/'.
5101 */
5102 ASSERT(firstbase == -1);
5103 firstbase = lastbase = lastdir = 0;
5104 }
5105
5106 if (firstbase == -1) {
5107 /*
5108 * The entire string consists only of a basename
5109 * component. If we're looking for dirname, we need
5110 * to change our string to be just "."; if we're
5111 * looking for a basename, we'll just set the first
5112 * character of the basename to be 0.
5113 */
5114 if (subr == DIF_SUBR_DIRNAME) {
5115 ASSERT(lastdir == -1);
5116 src = (uintptr_t)".";
5117 lastdir = 0;
5118 } else {
5119 firstbase = 0;
5120 }
5121 }
5122
5123 if (subr == DIF_SUBR_DIRNAME) {
5124 if (lastdir == -1) {
5125 /*
5126 * We know that we have a slash in the name --
5127 * or lastdir would be set to 0, above. And
5128 * because lastdir is -1, we know that this
5129 * slash must be the first character. (That
5130 * is, the full string must be of the form
5131 * "/basename".) In this case, the last
5132 * character of the directory name is 0.
5133 */
5134 lastdir = 0;
5135 }
5136
5137 start = 0;
5138 end = lastdir;
5139 } else {
5140 ASSERT(subr == DIF_SUBR_BASENAME);
5141 ASSERT(firstbase != -1 && lastbase != -1);
5142 start = firstbase;
5143 end = lastbase;
5144 }
5145
5146 for (i = start, j = 0; i <= end && (uint64_t)j < size - 1; i++, j++)
5147 dest[j] = dtrace_load8(src + i);
5148
5149 dest[j] = '\0';
5150 regs[rd] = (uintptr_t)dest;
5151 mstate->dtms_scratch_ptr += size;
5152 break;
5153 }
5154
5155 case DIF_SUBR_CLEANPATH: {
5156 char *dest = (char *)mstate->dtms_scratch_ptr, c;
5157 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5158 uintptr_t src = tupregs[0].dttk_value;
5159 size_t lim;
5160 size_t i = 0, j = 0;
5161
5162 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5163 regs[rd] = 0;
5164 break;
5165 }
5166
5167 if (!DTRACE_INSCRATCH(mstate, size)) {
5168 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5169 regs[rd] = 0;
5170 break;
5171 }
5172
5173 /*
5174 * Move forward, loading each character.
5175 */
5176 do {
5177 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5178 next:
5179 if ((uint64_t)(j + 5) >= size) /* 5 = strlen("/..c\0") */
5180 break;
5181
5182 if (c != '/') {
5183 dest[j++] = c;
5184 continue;
5185 }
5186
5187 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5188
5189 if (c == '/') {
5190 /*
5191 * We have two slashes -- we can just advance
5192 * to the next character.
5193 */
5194 goto next;
5195 }
5196
5197 if (c != '.') {
5198 /*
5199 * This is not "." and it's not ".." -- we can
5200 * just store the "/" and this character and
5201 * drive on.
5202 */
5203 dest[j++] = '/';
5204 dest[j++] = c;
5205 continue;
5206 }
5207
5208 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5209
5210 if (c == '/') {
5211 /*
5212 * This is a "/./" component. We're not going
5213 * to store anything in the destination buffer;
5214 * we're just going to go to the next component.
5215 */
5216 goto next;
5217 }
5218
5219 if (c != '.') {
5220 /*
5221 * This is not ".." -- we can just store the
5222 * "/." and this character and continue
5223 * processing.
5224 */
5225 dest[j++] = '/';
5226 dest[j++] = '.';
5227 dest[j++] = c;
5228 continue;
5229 }
5230
5231 c = (i >= lim) ? '\0' : dtrace_load8(src + i++);
5232
5233 if (c != '/' && c != '\0') {
5234 /*
5235 * This is not ".." -- it's "..[mumble]".
5236 * We'll store the "/.." and this character
5237 * and continue processing.
5238 */
5239 dest[j++] = '/';
5240 dest[j++] = '.';
5241 dest[j++] = '.';
5242 dest[j++] = c;
5243 continue;
5244 }
5245
5246 /*
5247 * This is "/../" or "/..\0". We need to back up
5248 * our destination pointer until we find a "/".
5249 */
5250 i--;
5251 while (j != 0 && dest[--j] != '/')
5252 continue;
5253
5254 if (c == '\0')
5255 dest[++j] = '/';
5256 } while (c != '\0');
5257
5258 dest[j] = '\0';
5259 regs[rd] = (uintptr_t)dest;
5260 mstate->dtms_scratch_ptr += size;
5261 break;
5262 }
5263
5264 case DIF_SUBR_INET_NTOA:
5265 case DIF_SUBR_INET_NTOA6:
5266 case DIF_SUBR_INET_NTOP: {
5267 size_t size;
5268 int af, argi, i;
5269 char *base, *end;
5270
5271 if (subr == DIF_SUBR_INET_NTOP) {
5272 af = (int)tupregs[0].dttk_value;
5273 argi = 1;
5274 } else {
5275 af = subr == DIF_SUBR_INET_NTOA ? AF_INET: AF_INET6;
5276 argi = 0;
5277 }
5278
5279 if (af == AF_INET) {
5280 #if !defined(__APPLE__)
5281 ipaddr_t ip4;
5282 #else
5283 uint32_t ip4;
5284 #endif /* __APPLE__ */
5285 uint8_t *ptr8, val;
5286
5287 /*
5288 * Safely load the IPv4 address.
5289 */
5290 #if !defined(__APPLE__)
5291 ip4 = dtrace_load32(tupregs[argi].dttk_value);
5292 #else
5293 if (!dtrace_canload(tupregs[argi].dttk_value, sizeof(ip4),
5294 mstate, vstate)) {
5295 regs[rd] = 0;
5296 break;
5297 }
5298
5299 dtrace_bcopy(
5300 (void *)(uintptr_t)tupregs[argi].dttk_value,
5301 (void *)(uintptr_t)&ip4, sizeof (ip4));
5302 #endif /* __APPLE__ */
5303 /*
5304 * Check an IPv4 string will fit in scratch.
5305 */
5306 #if !defined(__APPLE__)
5307 size = INET_ADDRSTRLEN;
5308 #else
5309 size = MAX_IPv4_STR_LEN;
5310 #endif /* __APPLE__ */
5311 if (!DTRACE_INSCRATCH(mstate, size)) {
5312 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5313 regs[rd] = 0;
5314 break;
5315 }
5316 base = (char *)mstate->dtms_scratch_ptr;
5317 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5318
5319 /*
5320 * Stringify as a dotted decimal quad.
5321 */
5322 *end-- = '\0';
5323 ptr8 = (uint8_t *)&ip4;
5324 for (i = 3; i >= 0; i--) {
5325 val = ptr8[i];
5326
5327 if (val == 0) {
5328 *end-- = '0';
5329 } else {
5330 for (; val; val /= 10) {
5331 *end-- = '0' + (val % 10);
5332 }
5333 }
5334
5335 if (i > 0)
5336 *end-- = '.';
5337 }
5338 ASSERT(end + 1 >= base);
5339
5340 } else if (af == AF_INET6) {
5341 #if defined(__APPLE__)
5342 #define _S6_un __u6_addr
5343 #define _S6_u8 __u6_addr8
5344 #endif /* __APPLE__ */
5345 struct in6_addr ip6;
5346 int firstzero, tryzero, numzero, v6end;
5347 uint16_t val;
5348 const char digits[] = "0123456789abcdef";
5349
5350 /*
5351 * Stringify using RFC 1884 convention 2 - 16 bit
5352 * hexadecimal values with a zero-run compression.
5353 * Lower case hexadecimal digits are used.
5354 * eg, fe80::214:4fff:fe0b:76c8.
5355 * The IPv4 embedded form is returned for inet_ntop,
5356 * just the IPv4 string is returned for inet_ntoa6.
5357 */
5358
5359 if (!dtrace_canload(tupregs[argi].dttk_value,
5360 sizeof(struct in6_addr), mstate, vstate)) {
5361 regs[rd] = 0;
5362 break;
5363 }
5364
5365 /*
5366 * Safely load the IPv6 address.
5367 */
5368 dtrace_bcopy(
5369 (void *)(uintptr_t)tupregs[argi].dttk_value,
5370 (void *)(uintptr_t)&ip6, sizeof (struct in6_addr));
5371
5372 /*
5373 * Check an IPv6 string will fit in scratch.
5374 */
5375 size = INET6_ADDRSTRLEN;
5376 if (!DTRACE_INSCRATCH(mstate, size)) {
5377 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5378 regs[rd] = 0;
5379 break;
5380 }
5381 base = (char *)mstate->dtms_scratch_ptr;
5382 end = (char *)mstate->dtms_scratch_ptr + size - 1;
5383 *end-- = '\0';
5384
5385 /*
5386 * Find the longest run of 16 bit zero values
5387 * for the single allowed zero compression - "::".
5388 */
5389 firstzero = -1;
5390 tryzero = -1;
5391 numzero = 1;
5392 for (i = 0; i < (int)sizeof (struct in6_addr); i++) {
5393 if (ip6._S6_un._S6_u8[i] == 0 &&
5394 tryzero == -1 && i % 2 == 0) {
5395 tryzero = i;
5396 continue;
5397 }
5398
5399 if (tryzero != -1 &&
5400 (ip6._S6_un._S6_u8[i] != 0 ||
5401 i == sizeof (struct in6_addr) - 1)) {
5402
5403 if (i - tryzero <= numzero) {
5404 tryzero = -1;
5405 continue;
5406 }
5407
5408 firstzero = tryzero;
5409 numzero = i - i % 2 - tryzero;
5410 tryzero = -1;
5411
5412 if (ip6._S6_un._S6_u8[i] == 0 &&
5413 i == sizeof (struct in6_addr) - 1)
5414 numzero += 2;
5415 }
5416 }
5417 ASSERT(firstzero + numzero <= (int)sizeof (struct in6_addr));
5418
5419 /*
5420 * Check for an IPv4 embedded address.
5421 */
5422 v6end = sizeof (struct in6_addr) - 2;
5423 if (IN6_IS_ADDR_V4MAPPED(&ip6) ||
5424 IN6_IS_ADDR_V4COMPAT(&ip6)) {
5425 for (i = sizeof (struct in6_addr) - 1;
5426 i >= (int)DTRACE_V4MAPPED_OFFSET; i--) {
5427 ASSERT(end >= base);
5428
5429 val = ip6._S6_un._S6_u8[i];
5430
5431 if (val == 0) {
5432 *end-- = '0';
5433 } else {
5434 for (; val; val /= 10) {
5435 *end-- = '0' + val % 10;
5436 }
5437 }
5438
5439 if (i > (int)DTRACE_V4MAPPED_OFFSET)
5440 *end-- = '.';
5441 }
5442
5443 if (subr == DIF_SUBR_INET_NTOA6)
5444 goto inetout;
5445
5446 /*
5447 * Set v6end to skip the IPv4 address that
5448 * we have already stringified.
5449 */
5450 v6end = 10;
5451 }
5452
5453 /*
5454 * Build the IPv6 string by working through the
5455 * address in reverse.
5456 */
5457 for (i = v6end; i >= 0; i -= 2) {
5458 ASSERT(end >= base);
5459
5460 if (i == firstzero + numzero - 2) {
5461 *end-- = ':';
5462 *end-- = ':';
5463 i -= numzero - 2;
5464 continue;
5465 }
5466
5467 if (i < 14 && i != firstzero - 2)
5468 *end-- = ':';
5469
5470 val = (ip6._S6_un._S6_u8[i] << 8) +
5471 ip6._S6_un._S6_u8[i + 1];
5472
5473 if (val == 0) {
5474 *end-- = '0';
5475 } else {
5476 for (; val; val /= 16) {
5477 *end-- = digits[val % 16];
5478 }
5479 }
5480 }
5481 ASSERT(end + 1 >= base);
5482
5483 #if defined(__APPLE__)
5484 #undef _S6_un
5485 #undef _S6_u8
5486 #endif /* __APPLE__ */
5487 } else {
5488 /*
5489 * The user didn't use AF_INET or AF_INET6.
5490 */
5491 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5492 regs[rd] = 0;
5493 break;
5494 }
5495
5496 inetout: regs[rd] = (uintptr_t)end + 1;
5497 mstate->dtms_scratch_ptr += size;
5498 break;
5499 }
5500
5501 case DIF_SUBR_JSON: {
5502 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5503 uintptr_t json = tupregs[0].dttk_value;
5504 size_t jsonlen = dtrace_strlen((char *)json, size);
5505 uintptr_t elem = tupregs[1].dttk_value;
5506 size_t elemlen = dtrace_strlen((char *)elem, size);
5507
5508 char *dest = (char *)mstate->dtms_scratch_ptr;
5509 char *elemlist = (char *)mstate->dtms_scratch_ptr + jsonlen + 1;
5510 char *ee = elemlist;
5511 int nelems = 1;
5512 uintptr_t cur;
5513
5514 if (!dtrace_canload(json, jsonlen + 1, mstate, vstate) ||
5515 !dtrace_canload(elem, elemlen + 1, mstate, vstate)) {
5516 regs[rd] = 0;
5517 break;
5518 }
5519
5520 if (!DTRACE_INSCRATCH(mstate, jsonlen + 1 + elemlen + 1)) {
5521 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5522 regs[rd] = 0;
5523 break;
5524 }
5525
5526 /*
5527 * Read the element selector and split it up into a packed list
5528 * of strings.
5529 */
5530 for (cur = elem; cur < elem + elemlen; cur++) {
5531 char cc = dtrace_load8(cur);
5532
5533 if (cur == elem && cc == '[') {
5534 /*
5535 * If the first element selector key is
5536 * actually an array index then ignore the
5537 * bracket.
5538 */
5539 continue;
5540 }
5541
5542 if (cc == ']')
5543 continue;
5544
5545 if (cc == '.' || cc == '[') {
5546 nelems++;
5547 cc = '\0';
5548 }
5549
5550 *ee++ = cc;
5551 }
5552 *ee++ = '\0';
5553
5554 if ((regs[rd] = (uintptr_t)dtrace_json(size, json, elemlist,
5555 nelems, dest)) != 0)
5556 mstate->dtms_scratch_ptr += jsonlen + 1;
5557 break;
5558 }
5559
5560 case DIF_SUBR_TOUPPER:
5561 case DIF_SUBR_TOLOWER: {
5562 uintptr_t src = tupregs[0].dttk_value;
5563 char *dest = (char *)mstate->dtms_scratch_ptr;
5564 char lower, upper, base, c;
5565 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5566 size_t len = dtrace_strlen((char*) src, size);
5567 size_t i = 0;
5568
5569 lower = (subr == DIF_SUBR_TOUPPER) ? 'a' : 'A';
5570 upper = (subr == DIF_SUBR_TOUPPER) ? 'z' : 'Z';
5571 base = (subr == DIF_SUBR_TOUPPER) ? 'A' : 'a';
5572
5573 if (!dtrace_canload(src, len + 1, mstate, vstate)) {
5574 regs[rd] = 0;
5575 break;
5576 }
5577
5578 if (!DTRACE_INSCRATCH(mstate, size)) {
5579 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5580 regs[rd] = 0;
5581 break;
5582 }
5583
5584 for (i = 0; i < size - 1; ++i) {
5585 if ((c = dtrace_load8(src + i)) == '\0')
5586 break;
5587 if (c >= lower && c <= upper)
5588 c = base + (c - lower);
5589 dest[i] = c;
5590 }
5591
5592 ASSERT(i < size);
5593
5594 dest[i] = '\0';
5595 regs[rd] = (uintptr_t) dest;
5596 mstate->dtms_scratch_ptr += size;
5597
5598 break;
5599 }
5600
5601 case DIF_SUBR_STRIP:
5602 if (!dtrace_is_valid_ptrauth_key(tupregs[1].dttk_value)) {
5603 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5604 break;
5605 }
5606 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
5607 (void*)tupregs[0].dttk_value, tupregs[1].dttk_value);
5608 break;
5609
5610 #if defined(__APPLE__)
5611 case DIF_SUBR_VM_KERNEL_ADDRPERM: {
5612 if (!dtrace_priv_kernel(state)) {
5613 regs[rd] = 0;
5614 } else {
5615 regs[rd] = VM_KERNEL_ADDRPERM((vm_offset_t) tupregs[0].dttk_value);
5616 }
5617
5618 break;
5619 }
5620
5621 case DIF_SUBR_KDEBUG_TRACE: {
5622 uint32_t debugid;
5623 uintptr_t args[4] = {0};
5624 int i;
5625
5626 if (nargs < 2 || nargs > 5) {
5627 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5628 break;
5629 }
5630
5631 if (dtrace_destructive_disallow ||
5632 !dtrace_priv_kernel_destructive(state)) {
5633 return;
5634 }
5635
5636 debugid = tupregs[0].dttk_value;
5637 for (i = 0; i < nargs - 1; i++)
5638 args[i] = tupregs[i + 1].dttk_value;
5639
5640 kernel_debug(debugid, args[0], args[1], args[2], args[3], 0);
5641
5642 break;
5643 }
5644
5645 case DIF_SUBR_KDEBUG_TRACE_STRING: {
5646 if (nargs != 3) {
5647 break;
5648 }
5649
5650 if (dtrace_destructive_disallow ||
5651 !dtrace_priv_kernel_destructive(state)) {
5652 return;
5653 }
5654
5655 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5656 uint32_t debugid = tupregs[0].dttk_value;
5657 uint64_t str_id = tupregs[1].dttk_value;
5658 uintptr_t src = tupregs[2].dttk_value;
5659 size_t lim;
5660 char buf[size];
5661 char* str = NULL;
5662
5663 if (src != (uintptr_t)0) {
5664 str = buf;
5665 if (!dtrace_strcanload(src, size, &lim, mstate, vstate)) {
5666 break;
5667 }
5668 dtrace_strcpy((void*)src, buf, size);
5669 }
5670
5671 (void)kernel_debug_string(debugid, &str_id, str);
5672 regs[rd] = str_id;
5673
5674 break;
5675 }
5676
5677 case DIF_SUBR_MTONS:
5678 absolutetime_to_nanoseconds(tupregs[0].dttk_value, &regs[rd]);
5679
5680 break;
5681 case DIF_SUBR_PHYSMEM_READ: {
5682 #if DEBUG || DEVELOPMENT
5683 if (dtrace_destructive_disallow ||
5684 !dtrace_priv_kernel_destructive(state)) {
5685 return;
5686 }
5687 regs[rd] = dtrace_physmem_read(tupregs[0].dttk_value,
5688 tupregs[1].dttk_value);
5689 #else
5690 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5691 #endif /* DEBUG || DEVELOPMENT */
5692 break;
5693 }
5694 case DIF_SUBR_PHYSMEM_WRITE: {
5695 #if DEBUG || DEVELOPMENT
5696 if (dtrace_destructive_disallow ||
5697 !dtrace_priv_kernel_destructive(state)) {
5698 return;
5699 }
5700
5701 dtrace_physmem_write(tupregs[0].dttk_value,
5702 tupregs[1].dttk_value, (size_t)tupregs[2].dttk_value);
5703 #else
5704 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5705 #endif /* DEBUG || DEVELOPMENT */
5706 break;
5707 }
5708
5709 case DIF_SUBR_KVTOPHYS: {
5710 #if DEBUG || DEVELOPMENT
5711 regs[rd] = kvtophys(tupregs[0].dttk_value);
5712 #else
5713 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5714 #endif /* DEBUG || DEVELOPMENT */
5715 break;
5716 }
5717
5718 case DIF_SUBR_LIVEDUMP: {
5719 #if DEBUG || DEVELOPMENT
5720 if (dtrace_destructive_disallow ||
5721 !dtrace_priv_kernel_destructive(state)) {
5722 break;
5723 }
5724
5725 /* For the moment, there is only one type of livedump. */
5726 if (nargs != 1 || tupregs[0].dttk_value != 0) {
5727 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5728 break;
5729 }
5730
5731 char *dest = (char *)mstate->dtms_scratch_ptr;
5732 uint64_t size = state->dts_options[DTRACEOPT_STRSIZE];
5733
5734 if (!DTRACE_INSCRATCH(mstate, size)) {
5735 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
5736 regs[rd] = 0;
5737 break;
5738 }
5739
5740 dtrace_livedump(dest, size);
5741 regs[rd] = (uintptr_t) dest;
5742 mstate->dtms_scratch_ptr += strlen(dest) + 1;
5743 #else
5744 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
5745 #endif /* DEBUG || DEVELOPMENT */
5746 break;
5747 }
5748 #endif /* defined(__APPLE__) */
5749
5750 }
5751 }
5752
5753 /*
5754 * Emulate the execution of DTrace IR instructions specified by the given
5755 * DIF object. This function is deliberately void of assertions as all of
5756 * the necessary checks are handled by a call to dtrace_difo_validate().
5757 */
5758 static uint64_t
5759 dtrace_dif_emulate(dtrace_difo_t *difo, dtrace_mstate_t *mstate,
5760 dtrace_vstate_t *vstate, dtrace_state_t *state)
5761 {
5762 const dif_instr_t *text = difo->dtdo_buf;
5763 const uint_t textlen = difo->dtdo_len;
5764 const char *strtab = difo->dtdo_strtab;
5765 const uint64_t *inttab = difo->dtdo_inttab;
5766
5767 uint64_t rval = 0;
5768 dtrace_statvar_t *svar;
5769 dtrace_dstate_t *dstate = &vstate->dtvs_dynvars;
5770 dtrace_difv_t *v;
5771 volatile uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
5772 volatile uint64_t *illval = &cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
5773
5774 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
5775 uint64_t regs[DIF_DIR_NREGS];
5776 uint64_t *tmp;
5777
5778 uint8_t cc_n = 0, cc_z = 0, cc_v = 0, cc_c = 0;
5779 int64_t cc_r;
5780 uint_t pc = 0, id, opc = 0;
5781 uint8_t ttop = 0;
5782 dif_instr_t instr;
5783 uint_t r1, r2, rd;
5784
5785 /*
5786 * We stash the current DIF object into the machine state: we need it
5787 * for subsequent access checking.
5788 */
5789 mstate->dtms_difo = difo;
5790
5791 regs[DIF_REG_R0] = 0; /* %r0 is fixed at zero */
5792
5793 while (pc < textlen && !(*flags & CPU_DTRACE_FAULT)) {
5794 opc = pc;
5795
5796 instr = text[pc++];
5797 r1 = DIF_INSTR_R1(instr);
5798 r2 = DIF_INSTR_R2(instr);
5799 rd = DIF_INSTR_RD(instr);
5800
5801 switch (DIF_INSTR_OP(instr)) {
5802 case DIF_OP_OR:
5803 regs[rd] = regs[r1] | regs[r2];
5804 break;
5805 case DIF_OP_XOR:
5806 regs[rd] = regs[r1] ^ regs[r2];
5807 break;
5808 case DIF_OP_AND:
5809 regs[rd] = regs[r1] & regs[r2];
5810 break;
5811 case DIF_OP_SLL:
5812 regs[rd] = regs[r1] << regs[r2];
5813 break;
5814 case DIF_OP_SRL:
5815 regs[rd] = regs[r1] >> regs[r2];
5816 break;
5817 case DIF_OP_SUB:
5818 regs[rd] = regs[r1] - regs[r2];
5819 break;
5820 case DIF_OP_ADD:
5821 regs[rd] = regs[r1] + regs[r2];
5822 break;
5823 case DIF_OP_MUL:
5824 regs[rd] = regs[r1] * regs[r2];
5825 break;
5826 case DIF_OP_SDIV:
5827 if (regs[r2] == 0) {
5828 regs[rd] = 0;
5829 *flags |= CPU_DTRACE_DIVZERO;
5830 } else {
5831 regs[rd] = (int64_t)regs[r1] /
5832 (int64_t)regs[r2];
5833 }
5834 break;
5835
5836 case DIF_OP_UDIV:
5837 if (regs[r2] == 0) {
5838 regs[rd] = 0;
5839 *flags |= CPU_DTRACE_DIVZERO;
5840 } else {
5841 regs[rd] = regs[r1] / regs[r2];
5842 }
5843 break;
5844
5845 case DIF_OP_SREM:
5846 if (regs[r2] == 0) {
5847 regs[rd] = 0;
5848 *flags |= CPU_DTRACE_DIVZERO;
5849 } else {
5850 regs[rd] = (int64_t)regs[r1] %
5851 (int64_t)regs[r2];
5852 }
5853 break;
5854
5855 case DIF_OP_UREM:
5856 if (regs[r2] == 0) {
5857 regs[rd] = 0;
5858 *flags |= CPU_DTRACE_DIVZERO;
5859 } else {
5860 regs[rd] = regs[r1] % regs[r2];
5861 }
5862 break;
5863
5864 case DIF_OP_NOT:
5865 regs[rd] = ~regs[r1];
5866 break;
5867 case DIF_OP_MOV:
5868 regs[rd] = regs[r1];
5869 break;
5870 case DIF_OP_CMP:
5871 cc_r = regs[r1] - regs[r2];
5872 cc_n = cc_r < 0;
5873 cc_z = cc_r == 0;
5874 cc_v = 0;
5875 cc_c = regs[r1] < regs[r2];
5876 break;
5877 case DIF_OP_TST:
5878 cc_n = cc_v = cc_c = 0;
5879 cc_z = regs[r1] == 0;
5880 break;
5881 case DIF_OP_BA:
5882 pc = DIF_INSTR_LABEL(instr);
5883 break;
5884 case DIF_OP_BE:
5885 if (cc_z)
5886 pc = DIF_INSTR_LABEL(instr);
5887 break;
5888 case DIF_OP_BNE:
5889 if (cc_z == 0)
5890 pc = DIF_INSTR_LABEL(instr);
5891 break;
5892 case DIF_OP_BG:
5893 if ((cc_z | (cc_n ^ cc_v)) == 0)
5894 pc = DIF_INSTR_LABEL(instr);
5895 break;
5896 case DIF_OP_BGU:
5897 if ((cc_c | cc_z) == 0)
5898 pc = DIF_INSTR_LABEL(instr);
5899 break;
5900 case DIF_OP_BGE:
5901 if ((cc_n ^ cc_v) == 0)
5902 pc = DIF_INSTR_LABEL(instr);
5903 break;
5904 case DIF_OP_BGEU:
5905 if (cc_c == 0)
5906 pc = DIF_INSTR_LABEL(instr);
5907 break;
5908 case DIF_OP_BL:
5909 if (cc_n ^ cc_v)
5910 pc = DIF_INSTR_LABEL(instr);
5911 break;
5912 case DIF_OP_BLU:
5913 if (cc_c)
5914 pc = DIF_INSTR_LABEL(instr);
5915 break;
5916 case DIF_OP_BLE:
5917 if (cc_z | (cc_n ^ cc_v))
5918 pc = DIF_INSTR_LABEL(instr);
5919 break;
5920 case DIF_OP_BLEU:
5921 if (cc_c | cc_z)
5922 pc = DIF_INSTR_LABEL(instr);
5923 break;
5924 case DIF_OP_RLDSB:
5925 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5926 *flags |= CPU_DTRACE_KPRIV;
5927 *illval = regs[r1];
5928 break;
5929 }
5930 OS_FALLTHROUGH;
5931 case DIF_OP_LDSB:
5932 regs[rd] = (int8_t)dtrace_load8(regs[r1]);
5933 break;
5934 case DIF_OP_RLDSH:
5935 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5936 *flags |= CPU_DTRACE_KPRIV;
5937 *illval = regs[r1];
5938 break;
5939 }
5940 OS_FALLTHROUGH;
5941 case DIF_OP_LDSH:
5942 regs[rd] = (int16_t)dtrace_load16(regs[r1]);
5943 break;
5944 case DIF_OP_RLDSW:
5945 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5946 *flags |= CPU_DTRACE_KPRIV;
5947 *illval = regs[r1];
5948 break;
5949 }
5950 OS_FALLTHROUGH;
5951 case DIF_OP_LDSW:
5952 regs[rd] = (int32_t)dtrace_load32(regs[r1]);
5953 break;
5954 case DIF_OP_RLDUB:
5955 if (!dtrace_canstore(regs[r1], 1, mstate, vstate)) {
5956 *flags |= CPU_DTRACE_KPRIV;
5957 *illval = regs[r1];
5958 break;
5959 }
5960 OS_FALLTHROUGH;
5961 case DIF_OP_LDUB:
5962 regs[rd] = dtrace_load8(regs[r1]);
5963 break;
5964 case DIF_OP_RLDUH:
5965 if (!dtrace_canstore(regs[r1], 2, mstate, vstate)) {
5966 *flags |= CPU_DTRACE_KPRIV;
5967 *illval = regs[r1];
5968 break;
5969 }
5970 OS_FALLTHROUGH;
5971 case DIF_OP_LDUH:
5972 regs[rd] = dtrace_load16(regs[r1]);
5973 break;
5974 case DIF_OP_RLDUW:
5975 if (!dtrace_canstore(regs[r1], 4, mstate, vstate)) {
5976 *flags |= CPU_DTRACE_KPRIV;
5977 *illval = regs[r1];
5978 break;
5979 }
5980 OS_FALLTHROUGH;
5981 case DIF_OP_LDUW:
5982 regs[rd] = dtrace_load32(regs[r1]);
5983 break;
5984 case DIF_OP_RLDX:
5985 if (!dtrace_canstore(regs[r1], 8, mstate, vstate)) {
5986 *flags |= CPU_DTRACE_KPRIV;
5987 *illval = regs[r1];
5988 break;
5989 }
5990 OS_FALLTHROUGH;
5991 case DIF_OP_LDX:
5992 regs[rd] = dtrace_load64(regs[r1]);
5993 break;
5994 /*
5995 * A Darwin 32-bit kernel may fetch from a 64-bit user address, so
5996 * do not cast regs[] to uintptr_t in the user-load opcodes below:
5997 * DIF_OP_ULDSB, DIF_OP_ULDSH, DIF_OP_ULDSW, DIF_OP_ULDUB,
5998 * DIF_OP_ULDUH, DIF_OP_ULDUW and DIF_OP_ULDX.
5999 */
6000 case DIF_OP_ULDSB:
6001 regs[rd] = (int8_t)
6002 dtrace_fuword8(regs[r1]);
6003 break;
6004 case DIF_OP_ULDSH:
6005 regs[rd] = (int16_t)
6006 dtrace_fuword16(regs[r1]);
6007 break;
6008 case DIF_OP_ULDSW:
6009 regs[rd] = (int32_t)
6010 dtrace_fuword32(regs[r1]);
6011 break;
6012 case DIF_OP_ULDUB:
6013 regs[rd] =
6014 dtrace_fuword8(regs[r1]);
6015 break;
6016 case DIF_OP_ULDUH:
6017 regs[rd] =
6018 dtrace_fuword16(regs[r1]);
6019 break;
6020 case DIF_OP_ULDUW:
6021 regs[rd] =
6022 dtrace_fuword32(regs[r1]);
6023 break;
6024 case DIF_OP_ULDX:
6025 regs[rd] =
6026 dtrace_fuword64(regs[r1]);
6027 break;
6028 case DIF_OP_RET:
6029 rval = regs[rd];
6030 pc = textlen;
6031 break;
6032 case DIF_OP_NOP:
6033 break;
6034 case DIF_OP_SETX:
6035 regs[rd] = inttab[DIF_INSTR_INTEGER(instr)];
6036 break;
6037 case DIF_OP_SETS:
6038 regs[rd] = (uint64_t)(uintptr_t)
6039 (strtab + DIF_INSTR_STRING(instr));
6040 break;
6041 case DIF_OP_SCMP: {
6042 size_t sz = state->dts_options[DTRACEOPT_STRSIZE];
6043 uintptr_t s1 = regs[r1];
6044 uintptr_t s2 = regs[r2];
6045 size_t lim1 = sz, lim2 = sz;
6046
6047 if (s1 != 0 &&
6048 !dtrace_strcanload(s1, sz, &lim1, mstate, vstate))
6049 break;
6050 if (s2 != 0 &&
6051 !dtrace_strcanload(s2, sz, &lim2, mstate, vstate))
6052 break;
6053
6054 cc_r = dtrace_strncmp((char *)s1, (char *)s2,
6055 MIN(lim1, lim2));
6056
6057 cc_n = cc_r < 0;
6058 cc_z = cc_r == 0;
6059 cc_v = cc_c = 0;
6060 break;
6061 }
6062 case DIF_OP_LDGA:
6063 regs[rd] = dtrace_dif_variable(mstate, state,
6064 r1, regs[r2]);
6065 break;
6066 case DIF_OP_LDGS:
6067 id = DIF_INSTR_VAR(instr);
6068
6069 if (id >= DIF_VAR_OTHER_UBASE) {
6070 uintptr_t a;
6071
6072 id -= DIF_VAR_OTHER_UBASE;
6073 svar = vstate->dtvs_globals[id];
6074 ASSERT(svar != NULL);
6075 v = &svar->dtsv_var;
6076
6077 if (!(v->dtdv_type.dtdt_flags & DIF_TF_BYREF)) {
6078 regs[rd] = svar->dtsv_data;
6079 break;
6080 }
6081
6082 a = (uintptr_t)svar->dtsv_data;
6083
6084 if (*(uint8_t *)a == UINT8_MAX) {
6085 /*
6086 * If the 0th byte is set to UINT8_MAX
6087 * then this is to be treated as a
6088 * reference to a NULL variable.
6089 */
6090 regs[rd] = 0;
6091 } else {
6092 regs[rd] = a + sizeof (uint64_t);
6093 }
6094
6095 break;
6096 }
6097
6098 regs[rd] = dtrace_dif_variable(mstate, state, id, 0);
6099 break;
6100
6101 case DIF_OP_STGS:
6102 id = DIF_INSTR_VAR(instr);
6103
6104 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6105 id -= DIF_VAR_OTHER_UBASE;
6106
6107 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6108 svar = vstate->dtvs_globals[id];
6109 ASSERT(svar != NULL);
6110 v = &svar->dtsv_var;
6111
6112 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6113 uintptr_t a = (uintptr_t)svar->dtsv_data;
6114 size_t lim = 0;
6115
6116 ASSERT(a != 0);
6117 ASSERT(svar->dtsv_size != 0);
6118
6119 if (regs[rd] == 0) {
6120 *(uint8_t *)a = UINT8_MAX;
6121 break;
6122 } else {
6123 *(uint8_t *)a = 0;
6124 a += sizeof (uint64_t);
6125 }
6126 if (!dtrace_vcanload(
6127 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6128 &lim, mstate, vstate))
6129 break;
6130
6131 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6132 (void *)a, &v->dtdv_type, lim);
6133 break;
6134 }
6135
6136 svar->dtsv_data = regs[rd];
6137 break;
6138
6139 case DIF_OP_LDTA:
6140 /*
6141 * There are no DTrace built-in thread-local arrays at
6142 * present. This opcode is saved for future work.
6143 */
6144 *flags |= CPU_DTRACE_ILLOP;
6145 regs[rd] = 0;
6146 break;
6147
6148 case DIF_OP_LDLS:
6149 id = DIF_INSTR_VAR(instr);
6150
6151 if (id < DIF_VAR_OTHER_UBASE) {
6152 /*
6153 * For now, this has no meaning.
6154 */
6155 regs[rd] = 0;
6156 break;
6157 }
6158
6159 id -= DIF_VAR_OTHER_UBASE;
6160
6161 ASSERT(id < (uint_t)vstate->dtvs_nlocals);
6162 ASSERT(vstate->dtvs_locals != NULL);
6163 svar = vstate->dtvs_locals[id];
6164 ASSERT(svar != NULL);
6165 v = &svar->dtsv_var;
6166
6167 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6168 uintptr_t a = (uintptr_t)svar->dtsv_data;
6169 size_t sz = v->dtdv_type.dtdt_size;
6170
6171 sz += sizeof (uint64_t);
6172 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6173 a += CPU->cpu_id * sz;
6174
6175 if (*(uint8_t *)a == UINT8_MAX) {
6176 /*
6177 * If the 0th byte is set to UINT8_MAX
6178 * then this is to be treated as a
6179 * reference to a NULL variable.
6180 */
6181 regs[rd] = 0;
6182 } else {
6183 regs[rd] = a + sizeof (uint64_t);
6184 }
6185
6186 break;
6187 }
6188
6189 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6190 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6191 regs[rd] = tmp[CPU->cpu_id];
6192 break;
6193
6194 case DIF_OP_STLS:
6195 id = DIF_INSTR_VAR(instr);
6196
6197 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6198 id -= DIF_VAR_OTHER_UBASE;
6199 VERIFY(id < (uint_t)vstate->dtvs_nlocals);
6200 ASSERT(vstate->dtvs_locals != NULL);
6201 svar = vstate->dtvs_locals[id];
6202 ASSERT(svar != NULL);
6203 v = &svar->dtsv_var;
6204
6205 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6206 uintptr_t a = (uintptr_t)svar->dtsv_data;
6207 size_t sz = v->dtdv_type.dtdt_size;
6208 size_t lim = 0;
6209
6210 sz += sizeof (uint64_t);
6211 ASSERT(svar->dtsv_size == (int)NCPU * sz);
6212 a += CPU->cpu_id * sz;
6213
6214 if (regs[rd] == 0) {
6215 *(uint8_t *)a = UINT8_MAX;
6216 break;
6217 } else {
6218 *(uint8_t *)a = 0;
6219 a += sizeof (uint64_t);
6220 }
6221
6222 if (!dtrace_vcanload(
6223 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6224 &lim, mstate, vstate))
6225 break;
6226
6227 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6228 (void *)a, &v->dtdv_type, lim);
6229 break;
6230 }
6231
6232 ASSERT(svar->dtsv_size == (int)NCPU * sizeof (uint64_t));
6233 tmp = (uint64_t *)(uintptr_t)svar->dtsv_data;
6234 tmp[CPU->cpu_id] = regs[rd];
6235 break;
6236
6237 case DIF_OP_LDTS: {
6238 dtrace_dynvar_t *dvar;
6239 dtrace_key_t *key;
6240
6241 id = DIF_INSTR_VAR(instr);
6242 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6243 id -= DIF_VAR_OTHER_UBASE;
6244 v = &vstate->dtvs_tlocals[id];
6245
6246 key = &tupregs[DIF_DTR_NREGS];
6247 key[0].dttk_value = (uint64_t)id;
6248 key[0].dttk_size = 0;
6249 DTRACE_TLS_THRKEY(key[1].dttk_value);
6250 key[1].dttk_size = 0;
6251
6252 dvar = dtrace_dynvar(dstate, 2, key,
6253 sizeof (uint64_t), DTRACE_DYNVAR_NOALLOC,
6254 mstate, vstate);
6255
6256 if (dvar == NULL) {
6257 regs[rd] = 0;
6258 break;
6259 }
6260
6261 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6262 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6263 } else {
6264 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6265 }
6266
6267 break;
6268 }
6269
6270 case DIF_OP_STTS: {
6271 dtrace_dynvar_t *dvar;
6272 dtrace_key_t *key;
6273
6274 id = DIF_INSTR_VAR(instr);
6275 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6276 id -= DIF_VAR_OTHER_UBASE;
6277 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6278
6279 key = &tupregs[DIF_DTR_NREGS];
6280 key[0].dttk_value = (uint64_t)id;
6281 key[0].dttk_size = 0;
6282 DTRACE_TLS_THRKEY(key[1].dttk_value);
6283 key[1].dttk_size = 0;
6284 v = &vstate->dtvs_tlocals[id];
6285
6286 dvar = dtrace_dynvar(dstate, 2, key,
6287 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6288 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6289 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6290 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6291
6292 /*
6293 * Given that we're storing to thread-local data,
6294 * we need to flush our predicate cache.
6295 */
6296 dtrace_set_thread_predcache(current_thread(), 0);
6297
6298 if (dvar == NULL)
6299 break;
6300
6301 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6302 size_t lim = 0;
6303
6304 if (!dtrace_vcanload(
6305 (void *)(uintptr_t)regs[rd],
6306 &v->dtdv_type, &lim, mstate, vstate))
6307 break;
6308
6309 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6310 dvar->dtdv_data, &v->dtdv_type, lim);
6311 } else {
6312 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6313 }
6314
6315 break;
6316 }
6317
6318 case DIF_OP_SRA:
6319 regs[rd] = (int64_t)regs[r1] >> regs[r2];
6320 break;
6321
6322 case DIF_OP_CALL:
6323 dtrace_dif_subr(DIF_INSTR_SUBR(instr), rd,
6324 regs, tupregs, ttop, mstate, state);
6325 break;
6326
6327 case DIF_OP_PUSHTR:
6328 if (ttop == DIF_DTR_NREGS) {
6329 *flags |= CPU_DTRACE_TUPOFLOW;
6330 break;
6331 }
6332
6333 if (r1 == DIF_TYPE_STRING) {
6334 /*
6335 * If this is a string type and the size is 0,
6336 * we'll use the system-wide default string
6337 * size. Note that we are _not_ looking at
6338 * the value of the DTRACEOPT_STRSIZE option;
6339 * had this been set, we would expect to have
6340 * a non-zero size value in the "pushtr".
6341 */
6342 tupregs[ttop].dttk_size =
6343 dtrace_strlen((char *)(uintptr_t)regs[rd],
6344 regs[r2] ? regs[r2] :
6345 dtrace_strsize_default) + 1;
6346 } else {
6347 if (regs[r2] > LONG_MAX) {
6348 *flags |= CPU_DTRACE_ILLOP;
6349 break;
6350 }
6351 tupregs[ttop].dttk_size = regs[r2];
6352 }
6353
6354 tupregs[ttop++].dttk_value = regs[rd];
6355 break;
6356
6357 case DIF_OP_PUSHTV:
6358 if (ttop == DIF_DTR_NREGS) {
6359 *flags |= CPU_DTRACE_TUPOFLOW;
6360 break;
6361 }
6362
6363 tupregs[ttop].dttk_value = regs[rd];
6364 tupregs[ttop++].dttk_size = 0;
6365 break;
6366
6367 case DIF_OP_POPTS:
6368 if (ttop != 0)
6369 ttop--;
6370 break;
6371
6372 case DIF_OP_FLUSHTS:
6373 ttop = 0;
6374 break;
6375
6376 case DIF_OP_LDGAA:
6377 case DIF_OP_LDTAA: {
6378 dtrace_dynvar_t *dvar;
6379 dtrace_key_t *key = tupregs;
6380 uint_t nkeys = ttop;
6381
6382 id = DIF_INSTR_VAR(instr);
6383 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6384 id -= DIF_VAR_OTHER_UBASE;
6385
6386 key[nkeys].dttk_value = (uint64_t)id;
6387 key[nkeys++].dttk_size = 0;
6388
6389 if (DIF_INSTR_OP(instr) == DIF_OP_LDTAA) {
6390 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6391 key[nkeys++].dttk_size = 0;
6392 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6393 v = &vstate->dtvs_tlocals[id];
6394 } else {
6395 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6396 v = &vstate->dtvs_globals[id]->dtsv_var;
6397 }
6398
6399 dvar = dtrace_dynvar(dstate, nkeys, key,
6400 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6401 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6402 DTRACE_DYNVAR_NOALLOC, mstate, vstate);
6403
6404 if (dvar == NULL) {
6405 regs[rd] = 0;
6406 break;
6407 }
6408
6409 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6410 regs[rd] = (uint64_t)(uintptr_t)dvar->dtdv_data;
6411 } else {
6412 regs[rd] = *((uint64_t *)dvar->dtdv_data);
6413 }
6414
6415 break;
6416 }
6417
6418 case DIF_OP_STGAA:
6419 case DIF_OP_STTAA: {
6420 dtrace_dynvar_t *dvar;
6421 dtrace_key_t *key = tupregs;
6422 uint_t nkeys = ttop;
6423
6424 id = DIF_INSTR_VAR(instr);
6425 ASSERT(id >= DIF_VAR_OTHER_UBASE);
6426 id -= DIF_VAR_OTHER_UBASE;
6427
6428 key[nkeys].dttk_value = (uint64_t)id;
6429 key[nkeys++].dttk_size = 0;
6430
6431 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA) {
6432 DTRACE_TLS_THRKEY(key[nkeys].dttk_value);
6433 key[nkeys++].dttk_size = 0;
6434 VERIFY(id < (uint_t)vstate->dtvs_ntlocals);
6435 v = &vstate->dtvs_tlocals[id];
6436 } else {
6437 VERIFY(id < (uint_t)vstate->dtvs_nglobals);
6438 v = &vstate->dtvs_globals[id]->dtsv_var;
6439 }
6440
6441 dvar = dtrace_dynvar(dstate, nkeys, key,
6442 v->dtdv_type.dtdt_size > sizeof (uint64_t) ?
6443 v->dtdv_type.dtdt_size : sizeof (uint64_t),
6444 regs[rd] ? DTRACE_DYNVAR_ALLOC :
6445 DTRACE_DYNVAR_DEALLOC, mstate, vstate);
6446
6447 if (dvar == NULL)
6448 break;
6449
6450 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF) {
6451 size_t lim = 0;
6452
6453 if (!dtrace_vcanload(
6454 (void *)(uintptr_t)regs[rd], &v->dtdv_type,
6455 &lim, mstate, vstate))
6456 break;
6457
6458 dtrace_vcopy((void *)(uintptr_t)regs[rd],
6459 dvar->dtdv_data, &v->dtdv_type, lim);
6460 } else {
6461 *((uint64_t *)dvar->dtdv_data) = regs[rd];
6462 }
6463
6464 break;
6465 }
6466
6467 case DIF_OP_ALLOCS: {
6468 uintptr_t ptr = P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6469 size_t size = ptr - mstate->dtms_scratch_ptr + regs[r1];
6470
6471 /*
6472 * Rounding up the user allocation size could have
6473 * overflowed large, bogus allocations (like -1ULL) to
6474 * 0.
6475 */
6476 if (size < regs[r1] ||
6477 !DTRACE_INSCRATCH(mstate, size)) {
6478 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6479 regs[rd] = 0;
6480 break;
6481 }
6482
6483 dtrace_bzero((void *) mstate->dtms_scratch_ptr, size);
6484 mstate->dtms_scratch_ptr += size;
6485 regs[rd] = ptr;
6486 break;
6487 }
6488
6489 case DIF_OP_COPYS:
6490 if (!dtrace_canstore(regs[rd], regs[r2],
6491 mstate, vstate)) {
6492 *flags |= CPU_DTRACE_BADADDR;
6493 *illval = regs[rd];
6494 break;
6495 }
6496
6497 if (!dtrace_canload(regs[r1], regs[r2], mstate, vstate))
6498 break;
6499
6500 dtrace_bcopy((void *)(uintptr_t)regs[r1],
6501 (void *)(uintptr_t)regs[rd], (size_t)regs[r2]);
6502 break;
6503
6504 case DIF_OP_STB:
6505 if (!dtrace_canstore(regs[rd], 1, mstate, vstate)) {
6506 *flags |= CPU_DTRACE_BADADDR;
6507 *illval = regs[rd];
6508 break;
6509 }
6510 *((uint8_t *)(uintptr_t)regs[rd]) = (uint8_t)regs[r1];
6511 break;
6512
6513 case DIF_OP_STH:
6514 if (!dtrace_canstore(regs[rd], 2, mstate, vstate)) {
6515 *flags |= CPU_DTRACE_BADADDR;
6516 *illval = regs[rd];
6517 break;
6518 }
6519 if (regs[rd] & 1) {
6520 *flags |= CPU_DTRACE_BADALIGN;
6521 *illval = regs[rd];
6522 break;
6523 }
6524 *((uint16_t *)(uintptr_t)regs[rd]) = (uint16_t)regs[r1];
6525 break;
6526
6527 case DIF_OP_STW:
6528 if (!dtrace_canstore(regs[rd], 4, mstate, vstate)) {
6529 *flags |= CPU_DTRACE_BADADDR;
6530 *illval = regs[rd];
6531 break;
6532 }
6533 if (regs[rd] & 3) {
6534 *flags |= CPU_DTRACE_BADALIGN;
6535 *illval = regs[rd];
6536 break;
6537 }
6538 *((uint32_t *)(uintptr_t)regs[rd]) = (uint32_t)regs[r1];
6539 break;
6540
6541 case DIF_OP_STX:
6542 if (!dtrace_canstore(regs[rd], 8, mstate, vstate)) {
6543 *flags |= CPU_DTRACE_BADADDR;
6544 *illval = regs[rd];
6545 break;
6546 }
6547
6548 /*
6549 * Darwin kmem_zalloc() called from
6550 * dtrace_difo_init() is 4-byte aligned.
6551 */
6552 if (regs[rd] & 3) {
6553 *flags |= CPU_DTRACE_BADALIGN;
6554 *illval = regs[rd];
6555 break;
6556 }
6557 *((uint64_t *)(uintptr_t)regs[rd]) = regs[r1];
6558 break;
6559 case DIF_OP_STRIP:
6560 regs[rd] = (uint64_t)dtrace_ptrauth_strip(
6561 (void*)regs[r1], r2);
6562 break;
6563 }
6564 }
6565
6566 if (!(*flags & CPU_DTRACE_FAULT))
6567 return (rval);
6568
6569 mstate->dtms_fltoffs = opc * sizeof (dif_instr_t);
6570 mstate->dtms_present |= DTRACE_MSTATE_FLTOFFS;
6571
6572 return (0);
6573 }
6574
6575 __attribute__((noinline))
6576 static void
dtrace_action_breakpoint(dtrace_ecb_t * ecb)6577 dtrace_action_breakpoint(dtrace_ecb_t *ecb)
6578 {
6579 dtrace_probe_t *probe = ecb->dte_probe;
6580 dtrace_provider_t *prov = probe->dtpr_provider;
6581 char c[DTRACE_FULLNAMELEN + 80], *str;
6582 const char *msg = "dtrace: breakpoint action at probe ";
6583 const char *ecbmsg = " (ecb ";
6584 uintptr_t mask = (0xf << (sizeof (uintptr_t) * NBBY / 4));
6585 uintptr_t val = (uintptr_t)ecb;
6586 int shift = (sizeof (uintptr_t) * NBBY) - 4, i = 0;
6587
6588 if (dtrace_destructive_disallow)
6589 return;
6590
6591 /*
6592 * It's impossible to be taking action on the NULL probe.
6593 */
6594 ASSERT(probe != NULL);
6595
6596 /*
6597 * This is a poor man's (destitute man's?) sprintf(): we want to
6598 * print the provider name, module name, function name and name of
6599 * the probe, along with the hex address of the ECB with the breakpoint
6600 * action -- all of which we must place in the character buffer by
6601 * hand.
6602 */
6603 while (*msg != '\0')
6604 c[i++] = *msg++;
6605
6606 for (str = prov->dtpv_name; *str != '\0'; str++)
6607 c[i++] = *str;
6608 c[i++] = ':';
6609
6610 for (str = probe->dtpr_mod; *str != '\0'; str++)
6611 c[i++] = *str;
6612 c[i++] = ':';
6613
6614 for (str = probe->dtpr_func; *str != '\0'; str++)
6615 c[i++] = *str;
6616 c[i++] = ':';
6617
6618 for (str = probe->dtpr_name; *str != '\0'; str++)
6619 c[i++] = *str;
6620
6621 while (*ecbmsg != '\0')
6622 c[i++] = *ecbmsg++;
6623
6624 while (shift >= 0) {
6625 mask = (uintptr_t)0xf << shift;
6626
6627 if (val >= ((uintptr_t)1 << shift))
6628 c[i++] = "0123456789abcdef"[(val & mask) >> shift];
6629 shift -= 4;
6630 }
6631
6632 c[i++] = ')';
6633 c[i] = '\0';
6634
6635 debug_enter(c);
6636 }
6637
6638 __attribute__((noinline))
6639 static void
dtrace_action_panic(dtrace_ecb_t * ecb)6640 dtrace_action_panic(dtrace_ecb_t *ecb)
6641 {
6642 dtrace_probe_t *probe = ecb->dte_probe;
6643
6644 /*
6645 * It's impossible to be taking action on the NULL probe.
6646 */
6647 ASSERT(probe != NULL);
6648
6649 if (dtrace_destructive_disallow)
6650 return;
6651
6652 if (dtrace_panicked != NULL)
6653 return;
6654
6655 if (dtrace_casptr(&dtrace_panicked, NULL, current_thread()) != NULL)
6656 return;
6657
6658 /*
6659 * We won the right to panic. (We want to be sure that only one
6660 * thread calls panic() from dtrace_probe(), and that panic() is
6661 * called exactly once.)
6662 */
6663 panic("dtrace: panic action at probe %s:%s:%s:%s (ecb %p)",
6664 probe->dtpr_provider->dtpv_name, probe->dtpr_mod,
6665 probe->dtpr_func, probe->dtpr_name, (void *)ecb);
6666
6667 /*
6668 * APPLE NOTE: this was for an old Mac OS X debug feature
6669 * allowing a return from panic(). Revisit someday.
6670 */
6671 dtrace_panicked = NULL;
6672 }
6673
6674 static void
dtrace_action_raise(uint64_t sig)6675 dtrace_action_raise(uint64_t sig)
6676 {
6677 if (dtrace_destructive_disallow)
6678 return;
6679
6680 if (sig >= NSIG) {
6681 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6682 return;
6683 }
6684
6685 /*
6686 * raise() has a queue depth of 1 -- we ignore all subsequent
6687 * invocations of the raise() action.
6688 */
6689
6690 uthread_t uthread = current_uthread();
6691
6692 if (uthread && uthread->t_dtrace_sig == 0) {
6693 uthread->t_dtrace_sig = sig;
6694 act_set_astbsd(current_thread());
6695 }
6696 }
6697
6698 static void
dtrace_action_stop(void)6699 dtrace_action_stop(void)
6700 {
6701 if (dtrace_destructive_disallow)
6702 return;
6703
6704 uthread_t uthread = current_uthread();
6705 if (uthread) {
6706 /*
6707 * The currently running process will be set to task_suspend
6708 * when it next leaves the kernel.
6709 */
6710 uthread->t_dtrace_stop = 1;
6711 act_set_astbsd(current_thread());
6712 }
6713 }
6714
6715
6716 /*
6717 * APPLE NOTE: pidresume works in conjunction with the dtrace stop action.
6718 * Both activate only when the currently running process next leaves the
6719 * kernel.
6720 */
6721 static void
dtrace_action_pidresume(uint64_t pid)6722 dtrace_action_pidresume(uint64_t pid)
6723 {
6724 if (dtrace_destructive_disallow)
6725 return;
6726
6727 if (kauth_cred_issuser(kauth_cred_get()) == 0) {
6728 DTRACE_CPUFLAG_SET(CPU_DTRACE_ILLOP);
6729 return;
6730 }
6731 uthread_t uthread = current_uthread();
6732
6733 /*
6734 * When the currently running process leaves the kernel, it attempts to
6735 * task_resume the process (denoted by pid), if that pid appears to have
6736 * been stopped by dtrace_action_stop().
6737 * The currently running process has a pidresume() queue depth of 1 --
6738 * subsequent invocations of the pidresume() action are ignored.
6739 */
6740
6741 if (pid != 0 && uthread && uthread->t_dtrace_resumepid == 0) {
6742 uthread->t_dtrace_resumepid = pid;
6743 act_set_astbsd(current_thread());
6744 }
6745 }
6746
6747 __attribute__((noinline))
6748 static void
dtrace_action_chill(dtrace_mstate_t * mstate,hrtime_t val)6749 dtrace_action_chill(dtrace_mstate_t *mstate, hrtime_t val)
6750 {
6751 hrtime_t now;
6752 volatile uint16_t *flags;
6753 dtrace_cpu_t *cpu = CPU;
6754
6755 if (dtrace_destructive_disallow)
6756 return;
6757
6758 flags = (volatile uint16_t *)&cpu_core[cpu->cpu_id].cpuc_dtrace_flags;
6759
6760 now = dtrace_gethrtime();
6761
6762 if (now - cpu->cpu_dtrace_chillmark > dtrace_chill_interval) {
6763 /*
6764 * We need to advance the mark to the current time.
6765 */
6766 cpu->cpu_dtrace_chillmark = now;
6767 cpu->cpu_dtrace_chilled = 0;
6768 }
6769
6770 /*
6771 * Now check to see if the requested chill time would take us over
6772 * the maximum amount of time allowed in the chill interval. (Or
6773 * worse, if the calculation itself induces overflow.)
6774 */
6775 if (cpu->cpu_dtrace_chilled + val > dtrace_chill_max ||
6776 cpu->cpu_dtrace_chilled + val < cpu->cpu_dtrace_chilled) {
6777 *flags |= CPU_DTRACE_ILLOP;
6778 return;
6779 }
6780
6781 while (dtrace_gethrtime() - now < val)
6782 continue;
6783
6784 /*
6785 * Normally, we assure that the value of the variable "timestamp" does
6786 * not change within an ECB. The presence of chill() represents an
6787 * exception to this rule, however.
6788 */
6789 mstate->dtms_present &= ~DTRACE_MSTATE_TIMESTAMP;
6790 cpu->cpu_dtrace_chilled += val;
6791 }
6792
6793 __attribute__((noinline))
6794 static void
dtrace_action_ustack(dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t * buf,uint64_t arg)6795 dtrace_action_ustack(dtrace_mstate_t *mstate, dtrace_state_t *state,
6796 uint64_t *buf, uint64_t arg)
6797 {
6798 int nframes = DTRACE_USTACK_NFRAMES(arg);
6799 int strsize = DTRACE_USTACK_STRSIZE(arg);
6800 uint64_t *pcs = &buf[1], *fps;
6801 char *str = (char *)&pcs[nframes];
6802 int size, offs = 0, i, j;
6803 uintptr_t old = mstate->dtms_scratch_ptr, saved;
6804 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6805 char *sym;
6806
6807 /*
6808 * Should be taking a faster path if string space has not been
6809 * allocated.
6810 */
6811 ASSERT(strsize != 0);
6812
6813 /*
6814 * We will first allocate some temporary space for the frame pointers.
6815 */
6816 fps = (uint64_t *)P2ROUNDUP(mstate->dtms_scratch_ptr, 8);
6817 size = (uintptr_t)fps - mstate->dtms_scratch_ptr +
6818 (nframes * sizeof (uint64_t));
6819
6820 if (!DTRACE_INSCRATCH(mstate, (uintptr_t)size)) {
6821 /*
6822 * Not enough room for our frame pointers -- need to indicate
6823 * that we ran out of scratch space.
6824 */
6825 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOSCRATCH);
6826 return;
6827 }
6828
6829 mstate->dtms_scratch_ptr += size;
6830 saved = mstate->dtms_scratch_ptr;
6831
6832 /*
6833 * Now get a stack with both program counters and frame pointers.
6834 */
6835 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6836 dtrace_getufpstack(buf, fps, nframes + 1);
6837 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6838
6839 /*
6840 * If that faulted, we're cooked.
6841 */
6842 if (*flags & CPU_DTRACE_FAULT)
6843 goto out;
6844
6845 /*
6846 * Now we want to walk up the stack, calling the USTACK helper. For
6847 * each iteration, we restore the scratch pointer.
6848 */
6849 for (i = 0; i < nframes; i++) {
6850 mstate->dtms_scratch_ptr = saved;
6851
6852 if (offs >= strsize)
6853 break;
6854
6855 sym = (char *)(uintptr_t)dtrace_helper(
6856 DTRACE_HELPER_ACTION_USTACK,
6857 mstate, state, pcs[i], fps[i]);
6858
6859 /*
6860 * If we faulted while running the helper, we're going to
6861 * clear the fault and null out the corresponding string.
6862 */
6863 if (*flags & CPU_DTRACE_FAULT) {
6864 *flags &= ~CPU_DTRACE_FAULT;
6865 str[offs++] = '\0';
6866 continue;
6867 }
6868
6869 if (sym == NULL) {
6870 str[offs++] = '\0';
6871 continue;
6872 }
6873
6874 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6875
6876 /*
6877 * Now copy in the string that the helper returned to us.
6878 */
6879 for (j = 0; offs + j < strsize; j++) {
6880 if ((str[offs + j] = sym[j]) == '\0')
6881 break;
6882 }
6883
6884 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6885
6886 offs += j + 1;
6887 }
6888
6889 if (offs >= strsize) {
6890 /*
6891 * If we didn't have room for all of the strings, we don't
6892 * abort processing -- this needn't be a fatal error -- but we
6893 * still want to increment a counter (dts_stkstroverflows) to
6894 * allow this condition to be warned about. (If this is from
6895 * a jstack() action, it is easily tuned via jstackstrsize.)
6896 */
6897 dtrace_error(&state->dts_stkstroverflows);
6898 }
6899
6900 while (offs < strsize)
6901 str[offs++] = '\0';
6902
6903 out:
6904 mstate->dtms_scratch_ptr = old;
6905 }
6906
6907 __attribute__((noinline))
6908 static void
dtrace_store_by_ref(dtrace_difo_t * dp,caddr_t tomax,size_t size,size_t * valoffsp,uint64_t * valp,uint64_t end,int intuple,int dtkind)6909 dtrace_store_by_ref(dtrace_difo_t *dp, caddr_t tomax, size_t size,
6910 size_t *valoffsp, uint64_t *valp, uint64_t end, int intuple, int dtkind)
6911 {
6912 volatile uint16_t *flags;
6913 uint64_t val = *valp;
6914 size_t valoffs = *valoffsp;
6915
6916 flags = (volatile uint16_t *)&cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
6917 ASSERT(dtkind == DIF_TF_BYREF || dtkind == DIF_TF_BYUREF);
6918
6919 /*
6920 * If this is a string, we're going to only load until we find the zero
6921 * byte -- after which we'll store zero bytes.
6922 */
6923 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
6924 char c = '\0' + 1;
6925 size_t s;
6926
6927 for (s = 0; s < size; s++) {
6928 if (c != '\0' && dtkind == DIF_TF_BYREF) {
6929 c = dtrace_load8(val++);
6930 } else if (c != '\0' && dtkind == DIF_TF_BYUREF) {
6931 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6932 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6933 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6934 if (*flags & CPU_DTRACE_FAULT)
6935 break;
6936 }
6937
6938 DTRACE_STORE(uint8_t, tomax, valoffs++, c);
6939
6940 if (c == '\0' && intuple)
6941 break;
6942 }
6943 } else {
6944 uint8_t c;
6945 while (valoffs < end) {
6946 if (dtkind == DIF_TF_BYREF) {
6947 c = dtrace_load8(val++);
6948 } else if (dtkind == DIF_TF_BYUREF) {
6949 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
6950 c = dtrace_fuword8((user_addr_t)(uintptr_t)val++);
6951 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
6952 if (*flags & CPU_DTRACE_FAULT)
6953 break;
6954 }
6955
6956 DTRACE_STORE(uint8_t, tomax,
6957 valoffs++, c);
6958 }
6959 }
6960
6961 *valp = val;
6962 *valoffsp = valoffs;
6963 }
6964
6965 /*
6966 * Disables interrupts and sets the per-thread inprobe flag. When DEBUG is
6967 * defined, we also assert that we are not recursing unless the probe ID is an
6968 * error probe.
6969 */
6970 static dtrace_icookie_t
dtrace_probe_enter(dtrace_id_t id)6971 dtrace_probe_enter(dtrace_id_t id)
6972 {
6973 thread_t thread = current_thread();
6974 uint16_t inprobe;
6975
6976 dtrace_icookie_t cookie;
6977
6978 cookie = dtrace_interrupt_disable();
6979
6980 /*
6981 * Unless this is an ERROR probe, we are not allowed to recurse in
6982 * dtrace_probe(). Recursing into DTrace probe usually means that a
6983 * function is instrumented that should not have been instrumented or
6984 * that the ordering guarantee of the records will be violated,
6985 * resulting in unexpected output. If there is an exception to this
6986 * assertion, a new case should be added.
6987 */
6988 inprobe = dtrace_get_thread_inprobe(thread);
6989 VERIFY(inprobe == 0 ||
6990 id == dtrace_probeid_error);
6991 ASSERT(inprobe < UINT16_MAX);
6992 dtrace_set_thread_inprobe(thread, inprobe + 1);
6993
6994 return (cookie);
6995 }
6996
6997 /*
6998 * Clears the per-thread inprobe flag and enables interrupts.
6999 */
7000 static void
dtrace_probe_exit(dtrace_icookie_t cookie)7001 dtrace_probe_exit(dtrace_icookie_t cookie)
7002 {
7003 thread_t thread = current_thread();
7004 uint16_t inprobe = dtrace_get_thread_inprobe(thread);
7005
7006 ASSERT(inprobe > 0);
7007 dtrace_set_thread_inprobe(thread, inprobe - 1);
7008
7009 #if INTERRUPT_MASKED_DEBUG
7010 ml_spin_debug_reset(thread);
7011 #endif /* INTERRUPT_MASKED_DEBUG */
7012
7013 dtrace_interrupt_enable(cookie);
7014 }
7015
7016 /*
7017 * If you're looking for the epicenter of DTrace, you just found it. This
7018 * is the function called by the provider to fire a probe -- from which all
7019 * subsequent probe-context DTrace activity emanates.
7020 */
7021 void
dtrace_probe(dtrace_id_t id,uint64_t arg0,uint64_t arg1,uint64_t arg2,uint64_t arg3,uint64_t arg4)7022 dtrace_probe(dtrace_id_t id, uint64_t arg0, uint64_t arg1,
7023 uint64_t arg2, uint64_t arg3, uint64_t arg4)
7024 {
7025 processorid_t cpuid;
7026 dtrace_icookie_t cookie;
7027 dtrace_probe_t *probe;
7028 dtrace_mstate_t mstate;
7029 dtrace_ecb_t *ecb;
7030 dtrace_action_t *act;
7031 intptr_t offs;
7032 size_t size;
7033 int vtime, onintr;
7034 volatile uint16_t *flags;
7035 hrtime_t now;
7036
7037 cookie = dtrace_probe_enter(id);
7038
7039 /* Ensure that probe id is valid. */
7040 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
7041 dtrace_probe_exit(cookie);
7042 return;
7043 }
7044
7045 probe = dtrace_probes[id - 1];
7046 if (probe == NULL) {
7047 dtrace_probe_exit(cookie);
7048 return;
7049 }
7050
7051 cpuid = CPU->cpu_id;
7052 onintr = CPU_ON_INTR(CPU);
7053
7054 if (!onintr && probe->dtpr_predcache != DTRACE_CACHEIDNONE &&
7055 probe->dtpr_predcache == dtrace_get_thread_predcache(current_thread())) {
7056 /*
7057 * We have hit in the predicate cache; we know that
7058 * this predicate would evaluate to be false.
7059 */
7060 dtrace_probe_exit(cookie);
7061 return;
7062 }
7063
7064 if (panic_quiesce) {
7065 /*
7066 * We don't trace anything if we're panicking.
7067 */
7068 dtrace_probe_exit(cookie);
7069 return;
7070 }
7071
7072 #if !defined(__APPLE__)
7073 now = dtrace_gethrtime();
7074 vtime = dtrace_vtime_references != 0;
7075
7076 if (vtime && curthread->t_dtrace_start)
7077 curthread->t_dtrace_vtime += now - curthread->t_dtrace_start;
7078 #else
7079 /*
7080 * APPLE NOTE: The time spent entering DTrace and arriving
7081 * to this point, is attributed to the current thread.
7082 * Instead it should accrue to DTrace. FIXME
7083 */
7084 vtime = dtrace_vtime_references != 0;
7085
7086 if (vtime)
7087 {
7088 int64_t dtrace_accum_time, recent_vtime;
7089 thread_t thread = current_thread();
7090
7091 dtrace_accum_time = dtrace_get_thread_tracing(thread); /* Time spent inside DTrace so far (nanoseconds) */
7092
7093 if (dtrace_accum_time >= 0) {
7094 recent_vtime = dtrace_abs_to_nano(dtrace_calc_thread_recent_vtime(thread)); /* up to the moment thread vtime */
7095
7096 recent_vtime = recent_vtime - dtrace_accum_time; /* Time without DTrace contribution */
7097
7098 dtrace_set_thread_vtime(thread, recent_vtime);
7099 }
7100 }
7101
7102 now = dtrace_gethrtime(); /* must not precede dtrace_calc_thread_recent_vtime() call! */
7103 #endif /* __APPLE__ */
7104
7105 /*
7106 * APPLE NOTE: A provider may call dtrace_probe_error() in lieu of
7107 * dtrace_probe() in some circumstances. See, e.g. fasttrap_isa.c.
7108 * However the provider has no access to ECB context, so passes
7109 * 0 through "arg0" and the probe_id of the overridden probe as arg1.
7110 * Detect that here and cons up a viable state (from the probe_id).
7111 */
7112 if (dtrace_probeid_error == id && 0 == arg0) {
7113 dtrace_id_t ftp_id = (dtrace_id_t)arg1;
7114 dtrace_probe_t *ftp_probe = dtrace_probes[ftp_id - 1];
7115 dtrace_ecb_t *ftp_ecb = ftp_probe->dtpr_ecb;
7116
7117 if (NULL != ftp_ecb) {
7118 dtrace_state_t *ftp_state = ftp_ecb->dte_state;
7119
7120 arg0 = (uint64_t)(uintptr_t)ftp_state;
7121 arg1 = ftp_ecb->dte_epid;
7122 /*
7123 * args[2-4] established by caller.
7124 */
7125 ftp_state->dts_arg_error_illval = -1; /* arg5 */
7126 }
7127 }
7128
7129 mstate.dtms_difo = NULL;
7130 mstate.dtms_probe = probe;
7131 mstate.dtms_strtok = 0;
7132 mstate.dtms_arg[0] = arg0;
7133 mstate.dtms_arg[1] = arg1;
7134 mstate.dtms_arg[2] = arg2;
7135 mstate.dtms_arg[3] = arg3;
7136 mstate.dtms_arg[4] = arg4;
7137
7138 flags = (volatile uint16_t *)&cpu_core[cpuid].cpuc_dtrace_flags;
7139
7140 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
7141 dtrace_predicate_t *pred = ecb->dte_predicate;
7142 dtrace_state_t *state = ecb->dte_state;
7143 dtrace_buffer_t *buf = &state->dts_buffer[cpuid];
7144 dtrace_buffer_t *aggbuf = &state->dts_aggbuffer[cpuid];
7145 dtrace_vstate_t *vstate = &state->dts_vstate;
7146 dtrace_provider_t *prov = probe->dtpr_provider;
7147 uint64_t tracememsize = 0;
7148 int committed = 0;
7149 caddr_t tomax;
7150
7151 /*
7152 * A little subtlety with the following (seemingly innocuous)
7153 * declaration of the automatic 'val': by looking at the
7154 * code, you might think that it could be declared in the
7155 * action processing loop, below. (That is, it's only used in
7156 * the action processing loop.) However, it must be declared
7157 * out of that scope because in the case of DIF expression
7158 * arguments to aggregating actions, one iteration of the
7159 * action loop will use the last iteration's value.
7160 */
7161 #ifdef lint
7162 uint64_t val = 0;
7163 #else
7164 uint64_t val = 0;
7165 #endif
7166
7167 mstate.dtms_present = DTRACE_MSTATE_ARGS | DTRACE_MSTATE_PROBE;
7168 *flags &= ~CPU_DTRACE_ERROR;
7169
7170 if (prov == dtrace_provider) {
7171 /*
7172 * If dtrace itself is the provider of this probe,
7173 * we're only going to continue processing the ECB if
7174 * arg0 (the dtrace_state_t) is equal to the ECB's
7175 * creating state. (This prevents disjoint consumers
7176 * from seeing one another's metaprobes.)
7177 */
7178 if (arg0 != (uint64_t)(uintptr_t)state)
7179 continue;
7180 }
7181
7182 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE) {
7183 /*
7184 * We're not currently active. If our provider isn't
7185 * the dtrace pseudo provider, we're not interested.
7186 */
7187 if (prov != dtrace_provider)
7188 continue;
7189
7190 /*
7191 * Now we must further check if we are in the BEGIN
7192 * probe. If we are, we will only continue processing
7193 * if we're still in WARMUP -- if one BEGIN enabling
7194 * has invoked the exit() action, we don't want to
7195 * evaluate subsequent BEGIN enablings.
7196 */
7197 if (probe->dtpr_id == dtrace_probeid_begin &&
7198 state->dts_activity != DTRACE_ACTIVITY_WARMUP) {
7199 ASSERT(state->dts_activity ==
7200 DTRACE_ACTIVITY_DRAINING);
7201 continue;
7202 }
7203 }
7204
7205 if (ecb->dte_cond) {
7206 /*
7207 * If the dte_cond bits indicate that this
7208 * consumer is only allowed to see user-mode firings
7209 * of this probe, call the provider's dtps_usermode()
7210 * entry point to check that the probe was fired
7211 * while in a user context. Skip this ECB if that's
7212 * not the case.
7213 */
7214 if ((ecb->dte_cond & DTRACE_COND_USERMODE) &&
7215 prov->dtpv_pops.dtps_usermode &&
7216 prov->dtpv_pops.dtps_usermode(prov->dtpv_arg,
7217 probe->dtpr_id, probe->dtpr_arg) == 0)
7218 continue;
7219
7220 /*
7221 * This is more subtle than it looks. We have to be
7222 * absolutely certain that CRED() isn't going to
7223 * change out from under us so it's only legit to
7224 * examine that structure if we're in constrained
7225 * situations. Currently, the only time we'll do this
7226 * check is if a non-super-user has enabled the
7227 * profile or syscall providers -- providers that
7228 * allow visibility of all processes. For the
7229 * profile case, the check above will ensure that
7230 * we're examining a user context.
7231 */
7232 if (ecb->dte_cond & DTRACE_COND_OWNER) {
7233 cred_t *cr;
7234 cred_t *s_cr =
7235 ecb->dte_state->dts_cred.dcr_cred;
7236 proc_t *proc;
7237 #pragma unused(proc) /* __APPLE__ */
7238
7239 ASSERT(s_cr != NULL);
7240
7241 /*
7242 * XXX this is hackish, but so is setting a variable
7243 * XXX in a McCarthy OR...
7244 */
7245 if ((cr = dtrace_CRED()) == NULL ||
7246 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_uid ||
7247 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_ruid ||
7248 posix_cred_get(s_cr)->cr_uid != posix_cred_get(cr)->cr_suid ||
7249 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_gid ||
7250 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_rgid ||
7251 posix_cred_get(s_cr)->cr_gid != posix_cred_get(cr)->cr_sgid ||
7252 #if !defined(__APPLE__)
7253 (proc = ttoproc(curthread)) == NULL ||
7254 (proc->p_flag & SNOCD))
7255 #else
7256 1) /* APPLE NOTE: Darwin omits "No Core Dump" flag */
7257 #endif /* __APPLE__ */
7258 continue;
7259 }
7260
7261 if (ecb->dte_cond & DTRACE_COND_ZONEOWNER) {
7262 cred_t *cr;
7263 cred_t *s_cr =
7264 ecb->dte_state->dts_cred.dcr_cred;
7265 #pragma unused(cr, s_cr) /* __APPLE__ */
7266
7267 ASSERT(s_cr != NULL);
7268
7269 #if !defined(__APPLE__)
7270 if ((cr = CRED()) == NULL ||
7271 s_cr->cr_zone->zone_id !=
7272 cr->cr_zone->zone_id)
7273 continue;
7274 #else
7275 /* APPLE NOTE: Darwin doesn't do zones. */
7276 #endif /* __APPLE__ */
7277 }
7278 }
7279
7280 if (now - state->dts_alive > dtrace_deadman_timeout) {
7281 /*
7282 * We seem to be dead. Unless we (a) have kernel
7283 * destructive permissions (b) have explicitly enabled
7284 * destructive actions and (c) destructive actions have
7285 * not been disabled, we're going to transition into
7286 * the KILLED state, from which no further processing
7287 * on this state will be performed.
7288 */
7289 if (!dtrace_priv_kernel_destructive(state) ||
7290 !state->dts_cred.dcr_destructive ||
7291 dtrace_destructive_disallow) {
7292 void *activity = &state->dts_activity;
7293 dtrace_activity_t current;
7294
7295 do {
7296 current = state->dts_activity;
7297 } while (dtrace_cas32(activity, current,
7298 DTRACE_ACTIVITY_KILLED) != current);
7299
7300 continue;
7301 }
7302 }
7303
7304 if ((offs = dtrace_buffer_reserve(buf, ecb->dte_needed,
7305 ecb->dte_alignment, state, &mstate)) < 0)
7306 continue;
7307
7308 tomax = buf->dtb_tomax;
7309 ASSERT(tomax != NULL);
7310
7311 /*
7312 * Build and store the record header corresponding to the ECB.
7313 */
7314 if (ecb->dte_size != 0) {
7315 dtrace_rechdr_t dtrh;
7316
7317 if (!(mstate.dtms_present & DTRACE_MSTATE_TIMESTAMP)) {
7318 mstate.dtms_timestamp = dtrace_gethrtime();
7319 mstate.dtms_present |= DTRACE_MSTATE_TIMESTAMP;
7320 }
7321
7322 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7323
7324 dtrh.dtrh_epid = ecb->dte_epid;
7325 DTRACE_RECORD_STORE_TIMESTAMP(&dtrh, mstate.dtms_timestamp);
7326 DTRACE_STORE(dtrace_rechdr_t, tomax, offs, dtrh);
7327 }
7328
7329 mstate.dtms_epid = ecb->dte_epid;
7330 mstate.dtms_present |= DTRACE_MSTATE_EPID;
7331
7332 if (state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL)
7333 mstate.dtms_access = DTRACE_ACCESS_KERNEL;
7334 else
7335 mstate.dtms_access = 0;
7336
7337 if (pred != NULL) {
7338 dtrace_difo_t *dp = pred->dtp_difo;
7339 uint64_t rval;
7340
7341 rval = dtrace_dif_emulate(dp, &mstate, vstate, state);
7342
7343 if (!(*flags & CPU_DTRACE_ERROR) && !rval) {
7344 dtrace_cacheid_t cid = probe->dtpr_predcache;
7345
7346 if (cid != DTRACE_CACHEIDNONE && !onintr) {
7347 /*
7348 * Update the predicate cache...
7349 */
7350 ASSERT(cid == pred->dtp_cacheid);
7351
7352 dtrace_set_thread_predcache(current_thread(), cid);
7353 }
7354
7355 continue;
7356 }
7357 }
7358
7359 for (act = ecb->dte_action; !(*flags & CPU_DTRACE_ERROR) &&
7360 act != NULL; act = act->dta_next) {
7361 size_t valoffs;
7362 dtrace_difo_t *dp;
7363 dtrace_recdesc_t *rec = &act->dta_rec;
7364
7365 size = rec->dtrd_size;
7366 valoffs = offs + rec->dtrd_offset;
7367
7368 if (DTRACEACT_ISAGG(act->dta_kind)) {
7369 uint64_t v = 0xbad;
7370 dtrace_aggregation_t *agg;
7371
7372 agg = (dtrace_aggregation_t *)act;
7373
7374 if ((dp = act->dta_difo) != NULL)
7375 v = dtrace_dif_emulate(dp,
7376 &mstate, vstate, state);
7377
7378 if (*flags & CPU_DTRACE_ERROR)
7379 continue;
7380
7381 /*
7382 * Note that we always pass the expression
7383 * value from the previous iteration of the
7384 * action loop. This value will only be used
7385 * if there is an expression argument to the
7386 * aggregating action, denoted by the
7387 * dtag_hasarg field.
7388 */
7389 dtrace_aggregate(agg, buf,
7390 offs, aggbuf, v, val);
7391 continue;
7392 }
7393
7394 switch (act->dta_kind) {
7395 case DTRACEACT_STOP:
7396 if (dtrace_priv_proc_destructive(state))
7397 dtrace_action_stop();
7398 continue;
7399
7400 case DTRACEACT_BREAKPOINT:
7401 if (dtrace_priv_kernel_destructive(state))
7402 dtrace_action_breakpoint(ecb);
7403 continue;
7404
7405 case DTRACEACT_PANIC:
7406 if (dtrace_priv_kernel_destructive(state))
7407 dtrace_action_panic(ecb);
7408 continue;
7409
7410 case DTRACEACT_STACK:
7411 if (!dtrace_priv_kernel(state))
7412 continue;
7413
7414 dtrace_getpcstack((pc_t *)(tomax + valoffs),
7415 size / sizeof (pc_t), probe->dtpr_aframes,
7416 DTRACE_ANCHORED(probe) ? NULL :
7417 (uint32_t *)(uintptr_t)arg0);
7418 continue;
7419
7420 case DTRACEACT_JSTACK:
7421 case DTRACEACT_USTACK:
7422 if (!dtrace_priv_proc(state))
7423 continue;
7424
7425 /*
7426 * See comment in DIF_VAR_PID.
7427 */
7428 if (DTRACE_ANCHORED(mstate.dtms_probe) &&
7429 CPU_ON_INTR(CPU)) {
7430 int depth = DTRACE_USTACK_NFRAMES(
7431 rec->dtrd_arg) + 1;
7432
7433 dtrace_bzero((void *)(tomax + valoffs),
7434 DTRACE_USTACK_STRSIZE(rec->dtrd_arg)
7435 + depth * sizeof (uint64_t));
7436
7437 continue;
7438 }
7439
7440 if (DTRACE_USTACK_STRSIZE(rec->dtrd_arg) != 0 &&
7441 curproc->p_dtrace_helpers != NULL) {
7442 /*
7443 * This is the slow path -- we have
7444 * allocated string space, and we're
7445 * getting the stack of a process that
7446 * has helpers. Call into a separate
7447 * routine to perform this processing.
7448 */
7449 dtrace_action_ustack(&mstate, state,
7450 (uint64_t *)(tomax + valoffs),
7451 rec->dtrd_arg);
7452 continue;
7453 }
7454
7455 DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
7456 dtrace_getupcstack((uint64_t *)
7457 (tomax + valoffs),
7458 DTRACE_USTACK_NFRAMES(rec->dtrd_arg) + 1);
7459 DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT);
7460 continue;
7461
7462 default:
7463 break;
7464 }
7465
7466 dp = act->dta_difo;
7467 ASSERT(dp != NULL);
7468
7469 val = dtrace_dif_emulate(dp, &mstate, vstate, state);
7470
7471 if (*flags & CPU_DTRACE_ERROR)
7472 continue;
7473
7474 switch (act->dta_kind) {
7475 case DTRACEACT_SPECULATE: {
7476 dtrace_rechdr_t *dtrh = NULL;
7477
7478 ASSERT(buf == &state->dts_buffer[cpuid]);
7479 buf = dtrace_speculation_buffer(state,
7480 cpuid, val);
7481
7482 if (buf == NULL) {
7483 *flags |= CPU_DTRACE_DROP;
7484 continue;
7485 }
7486
7487 offs = dtrace_buffer_reserve(buf,
7488 ecb->dte_needed, ecb->dte_alignment,
7489 state, NULL);
7490
7491 if (offs < 0) {
7492 *flags |= CPU_DTRACE_DROP;
7493 continue;
7494 }
7495
7496 tomax = buf->dtb_tomax;
7497 ASSERT(tomax != NULL);
7498
7499 if (ecb->dte_size == 0)
7500 continue;
7501
7502 ASSERT(ecb->dte_size >= sizeof(dtrace_rechdr_t));
7503 dtrh = ((void *)(tomax + offs));
7504 dtrh->dtrh_epid = ecb->dte_epid;
7505
7506 /*
7507 * When the speculation is committed, all of
7508 * the records in the speculative buffer will
7509 * have their timestamps set to the commit
7510 * time. Until then, it is set to a sentinel
7511 * value, for debugability.
7512 */
7513 DTRACE_RECORD_STORE_TIMESTAMP(dtrh, UINT64_MAX);
7514
7515 continue;
7516 }
7517
7518 case DTRACEACT_CHILL:
7519 if (dtrace_priv_kernel_destructive(state))
7520 dtrace_action_chill(&mstate, val);
7521 continue;
7522
7523 case DTRACEACT_RAISE:
7524 if (dtrace_priv_proc_destructive(state))
7525 dtrace_action_raise(val);
7526 continue;
7527
7528 case DTRACEACT_PIDRESUME: /* __APPLE__ */
7529 if (dtrace_priv_proc_destructive(state))
7530 dtrace_action_pidresume(val);
7531 continue;
7532
7533 case DTRACEACT_COMMIT:
7534 ASSERT(!committed);
7535
7536 /*
7537 * We need to commit our buffer state.
7538 */
7539 if (ecb->dte_size)
7540 buf->dtb_offset = offs + ecb->dte_size;
7541 buf = &state->dts_buffer[cpuid];
7542 dtrace_speculation_commit(state, cpuid, val);
7543 committed = 1;
7544 continue;
7545
7546 case DTRACEACT_DISCARD:
7547 dtrace_speculation_discard(state, cpuid, val);
7548 continue;
7549
7550 case DTRACEACT_DIFEXPR:
7551 case DTRACEACT_LIBACT:
7552 case DTRACEACT_PRINTF:
7553 case DTRACEACT_PRINTA:
7554 case DTRACEACT_SYSTEM:
7555 case DTRACEACT_FREOPEN:
7556 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
7557 case DTRACEACT_TRACEMEM:
7558 break;
7559
7560 case DTRACEACT_TRACEMEM_DYNSIZE:
7561 tracememsize = val;
7562 break;
7563
7564 case DTRACEACT_SYM:
7565 case DTRACEACT_MOD:
7566 if (!dtrace_priv_kernel(state))
7567 continue;
7568 break;
7569
7570 case DTRACEACT_USYM:
7571 case DTRACEACT_UMOD:
7572 case DTRACEACT_UADDR: {
7573 if (!dtrace_priv_proc(state))
7574 continue;
7575
7576 DTRACE_STORE(uint64_t, tomax,
7577 valoffs, (uint64_t)dtrace_proc_selfpid());
7578 DTRACE_STORE(uint64_t, tomax,
7579 valoffs + sizeof (uint64_t), val);
7580
7581 continue;
7582 }
7583
7584 case DTRACEACT_EXIT: {
7585 /*
7586 * For the exit action, we are going to attempt
7587 * to atomically set our activity to be
7588 * draining. If this fails (either because
7589 * another CPU has beat us to the exit action,
7590 * or because our current activity is something
7591 * other than ACTIVE or WARMUP), we will
7592 * continue. This assures that the exit action
7593 * can be successfully recorded at most once
7594 * when we're in the ACTIVE state. If we're
7595 * encountering the exit() action while in
7596 * COOLDOWN, however, we want to honor the new
7597 * status code. (We know that we're the only
7598 * thread in COOLDOWN, so there is no race.)
7599 */
7600 void *activity = &state->dts_activity;
7601 dtrace_activity_t current = state->dts_activity;
7602
7603 if (current == DTRACE_ACTIVITY_COOLDOWN)
7604 break;
7605
7606 if (current != DTRACE_ACTIVITY_WARMUP)
7607 current = DTRACE_ACTIVITY_ACTIVE;
7608
7609 if (dtrace_cas32(activity, current,
7610 DTRACE_ACTIVITY_DRAINING) != current) {
7611 *flags |= CPU_DTRACE_DROP;
7612 continue;
7613 }
7614
7615 break;
7616 }
7617
7618 default:
7619 ASSERT(0);
7620 }
7621
7622 if (dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF)) {
7623 uintptr_t end = valoffs + size;
7624
7625 if (tracememsize != 0 &&
7626 valoffs + tracememsize < end)
7627 {
7628 end = valoffs + tracememsize;
7629 tracememsize = 0;
7630 }
7631
7632 if (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF &&
7633 !dtrace_vcanload((void *)(uintptr_t)val,
7634 &dp->dtdo_rtype, NULL, &mstate, vstate))
7635 {
7636 continue;
7637 }
7638
7639 dtrace_store_by_ref(dp, tomax, size, &valoffs,
7640 &val, end, act->dta_intuple,
7641 dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF ?
7642 DIF_TF_BYREF: DIF_TF_BYUREF);
7643
7644 continue;
7645 }
7646
7647 switch (size) {
7648 case 0:
7649 break;
7650
7651 case sizeof (uint8_t):
7652 DTRACE_STORE(uint8_t, tomax, valoffs, val);
7653 break;
7654 case sizeof (uint16_t):
7655 DTRACE_STORE(uint16_t, tomax, valoffs, val);
7656 break;
7657 case sizeof (uint32_t):
7658 DTRACE_STORE(uint32_t, tomax, valoffs, val);
7659 break;
7660 case sizeof (uint64_t):
7661 DTRACE_STORE(uint64_t, tomax, valoffs, val);
7662 break;
7663 default:
7664 /*
7665 * Any other size should have been returned by
7666 * reference, not by value.
7667 */
7668 ASSERT(0);
7669 break;
7670 }
7671 }
7672
7673 if (*flags & CPU_DTRACE_DROP)
7674 continue;
7675
7676 if (*flags & CPU_DTRACE_FAULT) {
7677 int ndx;
7678 dtrace_action_t *err;
7679
7680 buf->dtb_errors++;
7681
7682 if (probe->dtpr_id == dtrace_probeid_error) {
7683 /*
7684 * There's nothing we can do -- we had an
7685 * error on the error probe. We bump an
7686 * error counter to at least indicate that
7687 * this condition happened.
7688 */
7689 dtrace_error(&state->dts_dblerrors);
7690 continue;
7691 }
7692
7693 if (vtime) {
7694 /*
7695 * Before recursing on dtrace_probe(), we
7696 * need to explicitly clear out our start
7697 * time to prevent it from being accumulated
7698 * into t_dtrace_vtime.
7699 */
7700
7701 /*
7702 * Darwin sets the sign bit on t_dtrace_tracing
7703 * to suspend accumulation to it.
7704 */
7705 dtrace_set_thread_tracing(current_thread(),
7706 (1ULL<<63) | dtrace_get_thread_tracing(current_thread()));
7707
7708 }
7709
7710 /*
7711 * Iterate over the actions to figure out which action
7712 * we were processing when we experienced the error.
7713 * Note that act points _past_ the faulting action; if
7714 * act is ecb->dte_action, the fault was in the
7715 * predicate, if it's ecb->dte_action->dta_next it's
7716 * in action #1, and so on.
7717 */
7718 for (err = ecb->dte_action, ndx = 0;
7719 err != act; err = err->dta_next, ndx++)
7720 continue;
7721
7722 dtrace_probe_error(state, ecb->dte_epid, ndx,
7723 (mstate.dtms_present & DTRACE_MSTATE_FLTOFFS) ?
7724 mstate.dtms_fltoffs : -1, DTRACE_FLAGS2FLT(*flags),
7725 cpu_core[cpuid].cpuc_dtrace_illval);
7726
7727 continue;
7728 }
7729
7730 if (!committed)
7731 buf->dtb_offset = offs + ecb->dte_size;
7732 }
7733
7734 /* FIXME: On Darwin the time spent leaving DTrace from this point to the rti is attributed
7735 to the current thread. Instead it should accrue to DTrace. */
7736 if (vtime) {
7737 thread_t thread = current_thread();
7738 int64_t t = dtrace_get_thread_tracing(thread);
7739
7740 if (t >= 0) {
7741 /* Usual case, accumulate time spent here into t_dtrace_tracing */
7742 dtrace_set_thread_tracing(thread, t + (dtrace_gethrtime() - now));
7743 } else {
7744 /* Return from error recursion. No accumulation, just clear the sign bit on t_dtrace_tracing. */
7745 dtrace_set_thread_tracing(thread, (~(1ULL<<63)) & t);
7746 }
7747 }
7748
7749 dtrace_probe_exit(cookie);
7750 }
7751
7752 /*
7753 * DTrace Probe Hashing Functions
7754 *
7755 * The functions in this section (and indeed, the functions in remaining
7756 * sections) are not _called_ from probe context. (Any exceptions to this are
7757 * marked with a "Note:".) Rather, they are called from elsewhere in the
7758 * DTrace framework to look-up probes in, add probes to and remove probes from
7759 * the DTrace probe hashes. (Each probe is hashed by each element of the
7760 * probe tuple -- allowing for fast lookups, regardless of what was
7761 * specified.)
7762 */
7763 static uint_t
dtrace_hash_str(const char * p)7764 dtrace_hash_str(const char *p)
7765 {
7766 unsigned int g;
7767 uint_t hval = 0;
7768
7769 while (*p) {
7770 hval = (hval << 4) + *p++;
7771 if ((g = (hval & 0xf0000000)) != 0)
7772 hval ^= g >> 24;
7773 hval &= ~g;
7774 }
7775 return (hval);
7776 }
7777
7778 static const char*
dtrace_strkey_probe_provider(void * elm,uintptr_t offs)7779 dtrace_strkey_probe_provider(void *elm, uintptr_t offs)
7780 {
7781 #pragma unused(offs)
7782 dtrace_probe_t *probe = (dtrace_probe_t*)elm;
7783 return probe->dtpr_provider->dtpv_name;
7784 }
7785
/*
 * String-key accessor for elements that embed their key string directly:
 * returns the address offs bytes past the start of elm.
 */
static const char*
dtrace_strkey_offset(void *elm, uintptr_t offs)
{
	uintptr_t addr = (uintptr_t)elm + offs;

	return ((char *)addr);
}
7791
/*
 * String-key accessor for elements that hold a pointer to their key string:
 * loads and returns the char pointer stored offs bytes past the start of elm.
 */
static const char*
dtrace_strkey_deref_offset(void *elm, uintptr_t offs)
{
	char **slot = (char **)((uintptr_t)elm + offs);

	return (*slot);
}
7797
7798 static dtrace_hash_t *
dtrace_hash_create(dtrace_strkey_f func,uintptr_t arg,uintptr_t nextoffs,uintptr_t prevoffs)7799 dtrace_hash_create(dtrace_strkey_f func, uintptr_t arg, uintptr_t nextoffs, uintptr_t prevoffs)
7800 {
7801 dtrace_hash_t *hash = kmem_zalloc(sizeof (dtrace_hash_t), KM_SLEEP);
7802
7803 hash->dth_getstr = func;
7804 hash->dth_stroffs = arg;
7805 hash->dth_nextoffs = nextoffs;
7806 hash->dth_prevoffs = prevoffs;
7807
7808 hash->dth_size = 1;
7809 hash->dth_mask = hash->dth_size - 1;
7810
7811 hash->dth_tab = kmem_zalloc(hash->dth_size *
7812 sizeof (dtrace_hashbucket_t *), KM_SLEEP);
7813
7814 return (hash);
7815 }
7816
7817 /*
7818 * APPLE NOTE: dtrace_hash_destroy is not used.
7819 * It is called by dtrace_detach which is not
7820 * currently implemented. Revisit someday.
7821 */
7822 #if !defined(__APPLE__)
7823 static void
dtrace_hash_destroy(dtrace_hash_t * hash)7824 dtrace_hash_destroy(dtrace_hash_t *hash)
7825 {
7826 #if DEBUG
7827 int i;
7828
7829 for (i = 0; i < hash->dth_size; i++)
7830 ASSERT(hash->dth_tab[i] == NULL);
7831 #endif
7832
7833 kmem_free(hash->dth_tab,
7834 hash->dth_size * sizeof (dtrace_hashbucket_t *));
7835 kmem_free(hash, sizeof (dtrace_hash_t));
7836 }
7837 #endif /* __APPLE__ */
7838
7839 static void
dtrace_hash_resize(dtrace_hash_t * hash)7840 dtrace_hash_resize(dtrace_hash_t *hash)
7841 {
7842 int size = hash->dth_size, i, ndx;
7843 int new_size = hash->dth_size << 1;
7844 int new_mask = new_size - 1;
7845 dtrace_hashbucket_t **new_tab, *bucket, *next;
7846
7847 ASSERT((new_size & new_mask) == 0);
7848
7849 new_tab = kmem_zalloc(new_size * sizeof (void *), KM_SLEEP);
7850
7851 for (i = 0; i < size; i++) {
7852 for (bucket = hash->dth_tab[i]; bucket != NULL; bucket = next) {
7853 void *elm = bucket->dthb_chain;
7854
7855 ASSERT(elm != NULL);
7856 ndx = DTRACE_HASHSTR(hash, elm) & new_mask;
7857
7858 next = bucket->dthb_next;
7859 bucket->dthb_next = new_tab[ndx];
7860 new_tab[ndx] = bucket;
7861 }
7862 }
7863
7864 kmem_free(hash->dth_tab, hash->dth_size * sizeof (void *));
7865 hash->dth_tab = new_tab;
7866 hash->dth_size = new_size;
7867 hash->dth_mask = new_mask;
7868 }
7869
7870 static void
dtrace_hash_add(dtrace_hash_t * hash,void * new)7871 dtrace_hash_add(dtrace_hash_t *hash, void *new)
7872 {
7873 int hashval = DTRACE_HASHSTR(hash, new);
7874 int ndx = hashval & hash->dth_mask;
7875 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7876 void **nextp, **prevp;
7877
7878 for (; bucket != NULL; bucket = bucket->dthb_next) {
7879 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, new))
7880 goto add;
7881 }
7882
7883 if ((hash->dth_nbuckets >> 1) > hash->dth_size) {
7884 dtrace_hash_resize(hash);
7885 dtrace_hash_add(hash, new);
7886 return;
7887 }
7888
7889 bucket = kmem_zalloc(sizeof (dtrace_hashbucket_t), KM_SLEEP);
7890 bucket->dthb_next = hash->dth_tab[ndx];
7891 hash->dth_tab[ndx] = bucket;
7892 hash->dth_nbuckets++;
7893
7894 add:
7895 nextp = DTRACE_HASHNEXT(hash, new);
7896 ASSERT(*nextp == NULL && *(DTRACE_HASHPREV(hash, new)) == NULL);
7897 *nextp = bucket->dthb_chain;
7898
7899 if (bucket->dthb_chain != NULL) {
7900 prevp = DTRACE_HASHPREV(hash, bucket->dthb_chain);
7901 ASSERT(*prevp == NULL);
7902 *prevp = new;
7903 }
7904
7905 bucket->dthb_chain = new;
7906 bucket->dthb_len++;
7907 }
7908
7909 static void *
dtrace_hash_lookup_string(dtrace_hash_t * hash,const char * str)7910 dtrace_hash_lookup_string(dtrace_hash_t *hash, const char *str)
7911 {
7912 int hashval = dtrace_hash_str(str);
7913 int ndx = hashval & hash->dth_mask;
7914 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7915
7916 for (; bucket != NULL; bucket = bucket->dthb_next) {
7917 if (strcmp(str, DTRACE_GETSTR(hash, bucket->dthb_chain)) == 0)
7918 return (bucket->dthb_chain);
7919 }
7920
7921 return (NULL);
7922 }
7923
7924 static dtrace_probe_t *
dtrace_hash_lookup(dtrace_hash_t * hash,void * template)7925 dtrace_hash_lookup(dtrace_hash_t *hash, void *template)
7926 {
7927 return dtrace_hash_lookup_string(hash, DTRACE_GETSTR(hash, template));
7928 }
7929
7930 static int
dtrace_hash_collisions(dtrace_hash_t * hash,void * template)7931 dtrace_hash_collisions(dtrace_hash_t *hash, void *template)
7932 {
7933 int hashval = DTRACE_HASHSTR(hash, template);
7934 int ndx = hashval & hash->dth_mask;
7935 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7936
7937 for (; bucket != NULL; bucket = bucket->dthb_next) {
7938 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, template))
7939 return (bucket->dthb_len);
7940 }
7941
7942 return (0);
7943 }
7944
7945 static void
dtrace_hash_remove(dtrace_hash_t * hash,void * elm)7946 dtrace_hash_remove(dtrace_hash_t *hash, void *elm)
7947 {
7948 int ndx = DTRACE_HASHSTR(hash, elm) & hash->dth_mask;
7949 dtrace_hashbucket_t *bucket = hash->dth_tab[ndx];
7950
7951 void **prevp = DTRACE_HASHPREV(hash, elm);
7952 void **nextp = DTRACE_HASHNEXT(hash, elm);
7953
7954 /*
7955 * Find the bucket that we're removing this elm from.
7956 */
7957 for (; bucket != NULL; bucket = bucket->dthb_next) {
7958 if (DTRACE_HASHEQ(hash, bucket->dthb_chain, elm))
7959 break;
7960 }
7961
7962 ASSERT(bucket != NULL);
7963
7964 if (*prevp == NULL) {
7965 if (*nextp == NULL) {
7966 /*
7967 * The removed element was the only element on this
7968 * bucket; we need to remove the bucket.
7969 */
7970 dtrace_hashbucket_t *b = hash->dth_tab[ndx];
7971
7972 ASSERT(bucket->dthb_chain == elm);
7973 ASSERT(b != NULL);
7974
7975 if (b == bucket) {
7976 hash->dth_tab[ndx] = bucket->dthb_next;
7977 } else {
7978 while (b->dthb_next != bucket)
7979 b = b->dthb_next;
7980 b->dthb_next = bucket->dthb_next;
7981 }
7982
7983 ASSERT(hash->dth_nbuckets > 0);
7984 hash->dth_nbuckets--;
7985 kmem_free(bucket, sizeof (dtrace_hashbucket_t));
7986 return;
7987 }
7988
7989 bucket->dthb_chain = *nextp;
7990 } else {
7991 *(DTRACE_HASHNEXT(hash, *prevp)) = *nextp;
7992 }
7993
7994 if (*nextp != NULL)
7995 *(DTRACE_HASHPREV(hash, *nextp)) = *prevp;
7996 }
7997
7998 /*
7999 * DTrace Utility Functions
8000 *
8001 * These are random utility functions that are _not_ called from probe context.
8002 */
8003 static int
dtrace_badattr(const dtrace_attribute_t * a)8004 dtrace_badattr(const dtrace_attribute_t *a)
8005 {
8006 return (a->dtat_name > DTRACE_STABILITY_MAX ||
8007 a->dtat_data > DTRACE_STABILITY_MAX ||
8008 a->dtat_class > DTRACE_CLASS_MAX);
8009 }
8010
8011 /*
8012 * Returns a dtrace-managed copy of a string, and will
8013 * deduplicate copies of the same string.
8014 * If the specified string is NULL, returns an empty string
8015 */
8016 static char *
dtrace_strref(const char * str)8017 dtrace_strref(const char *str)
8018 {
8019 dtrace_string_t *s = NULL;
8020 size_t bufsize = (str != NULL ? strlen(str) : 0) + 1;
8021
8022 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8023
8024 if (str == NULL)
8025 str = "";
8026
8027 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8028 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8029 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8030 continue;
8031 }
8032 ASSERT(s->dtst_refcount != UINT32_MAX);
8033 s->dtst_refcount++;
8034 return s->dtst_str;
8035 }
8036
8037 s = kmem_zalloc(sizeof(dtrace_string_t) + bufsize, KM_SLEEP);
8038 s->dtst_refcount = 1;
8039 (void) strlcpy(s->dtst_str, str, bufsize);
8040
8041 dtrace_hash_add(dtrace_strings, s);
8042
8043 return s->dtst_str;
8044 }
8045
8046 static void
dtrace_strunref(const char * str)8047 dtrace_strunref(const char *str)
8048 {
8049 ASSERT(str != NULL);
8050 dtrace_string_t *s = NULL;
8051 size_t bufsize = strlen(str) + 1;
8052
8053 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8054
8055 for (s = dtrace_hash_lookup_string(dtrace_strings, str); s != NULL;
8056 s = *(DTRACE_HASHNEXT(dtrace_strings, s))) {
8057 if (strncmp(str, s->dtst_str, bufsize) != 0) {
8058 continue;
8059 }
8060 ASSERT(s->dtst_refcount != 0);
8061 s->dtst_refcount--;
8062 if (s->dtst_refcount == 0) {
8063 dtrace_hash_remove(dtrace_strings, s);
8064 kmem_free(s, sizeof(dtrace_string_t) + bufsize);
8065 }
8066 return;
8067 }
8068 panic("attempt to unref non-existent string %s", str);
8069 }
8070
#define DTRACE_ISALPHA(c)	\
	(((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))

/*
 * Validate a name.  Returns 1 if the name is bad and 0 if it is acceptable.
 *
 * NULL and the empty string are accepted.  Otherwise the first character
 * must be an ASCII letter, '-', '_' or '.', and every following character
 * must be an ASCII letter, a decimal digit, '-', '_', '.' or '`'.
 */
static int
dtrace_badname(const char *s)
{
	const char *cp;

	if (s == NULL || *s == '\0')
		return (0);

	if (!DTRACE_ISALPHA(*s) && *s != '-' && *s != '_' && *s != '.')
		return (1);

	for (cp = s + 1; *cp != '\0'; cp++) {
		if (DTRACE_ISALPHA(*cp) || (*cp >= '0' && *cp <= '9'))
			continue;

		if (*cp == '-' || *cp == '_' || *cp == '.' || *cp == '`')
			continue;

		return (1);
	}

	return (0);
}
8093
8094 static void
dtrace_cred2priv(cred_t * cr,uint32_t * privp,uid_t * uidp,zoneid_t * zoneidp)8095 dtrace_cred2priv(cred_t *cr, uint32_t *privp, uid_t *uidp, zoneid_t *zoneidp)
8096 {
8097 uint32_t priv;
8098
8099 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
8100 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
8101 priv = DTRACE_PRIV_USER | DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER;
8102 }
8103 else {
8104 priv = DTRACE_PRIV_ALL;
8105 }
8106 *uidp = 0;
8107 *zoneidp = 0;
8108 } else {
8109 *uidp = crgetuid(cr);
8110 *zoneidp = crgetzoneid(cr);
8111
8112 priv = 0;
8113 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE))
8114 priv |= DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER;
8115 else if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE))
8116 priv |= DTRACE_PRIV_USER;
8117 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE))
8118 priv |= DTRACE_PRIV_PROC;
8119 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
8120 priv |= DTRACE_PRIV_OWNER;
8121 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
8122 priv |= DTRACE_PRIV_ZONEOWNER;
8123 }
8124
8125 *privp = priv;
8126 }
8127
8128 #ifdef DTRACE_ERRDEBUG
8129 static void
dtrace_errdebug(const char * str)8130 dtrace_errdebug(const char *str)
8131 {
8132 int hval = dtrace_hash_str(str) % DTRACE_ERRHASHSZ;
8133 int occupied = 0;
8134
8135 lck_mtx_lock(&dtrace_errlock);
8136 dtrace_errlast = str;
8137 dtrace_errthread = (kthread_t *)current_thread();
8138
8139 while (occupied++ < DTRACE_ERRHASHSZ) {
8140 if (dtrace_errhash[hval].dter_msg == str) {
8141 dtrace_errhash[hval].dter_count++;
8142 goto out;
8143 }
8144
8145 if (dtrace_errhash[hval].dter_msg != NULL) {
8146 hval = (hval + 1) % DTRACE_ERRHASHSZ;
8147 continue;
8148 }
8149
8150 dtrace_errhash[hval].dter_msg = str;
8151 dtrace_errhash[hval].dter_count = 1;
8152 goto out;
8153 }
8154
8155 panic("dtrace: undersized error hash");
8156 out:
8157 lck_mtx_unlock(&dtrace_errlock);
8158 }
8159 #endif
8160
8161 /*
8162 * DTrace Matching Functions
8163 *
8164 * These functions are used to match groups of probes, given some elements of
8165 * a probe tuple, or some globbed expressions for elements of a probe tuple.
8166 */
8167 static int
dtrace_match_priv(const dtrace_probe_t * prp,uint32_t priv,uid_t uid,zoneid_t zoneid)8168 dtrace_match_priv(const dtrace_probe_t *prp, uint32_t priv, uid_t uid,
8169 zoneid_t zoneid)
8170 {
8171 if (priv != DTRACE_PRIV_ALL) {
8172 uint32_t ppriv = prp->dtpr_provider->dtpv_priv.dtpp_flags;
8173 uint32_t match = priv & ppriv;
8174
8175 /*
8176 * No PRIV_DTRACE_* privileges...
8177 */
8178 if ((priv & (DTRACE_PRIV_PROC | DTRACE_PRIV_USER |
8179 DTRACE_PRIV_KERNEL)) == 0)
8180 return (0);
8181
8182 /*
8183 * No matching bits, but there were bits to match...
8184 */
8185 if (match == 0 && ppriv != 0)
8186 return (0);
8187
8188 /*
8189 * Need to have permissions to the process, but don't...
8190 */
8191 if (((ppriv & ~match) & DTRACE_PRIV_OWNER) != 0 &&
8192 uid != prp->dtpr_provider->dtpv_priv.dtpp_uid) {
8193 return (0);
8194 }
8195
8196 /*
8197 * Need to be in the same zone unless we possess the
8198 * privilege to examine all zones.
8199 */
8200 if (((ppriv & ~match) & DTRACE_PRIV_ZONEOWNER) != 0 &&
8201 zoneid != prp->dtpr_provider->dtpv_priv.dtpp_zoneid) {
8202 return (0);
8203 }
8204 }
8205
8206 return (1);
8207 }
8208
8209 /*
8210 * dtrace_match_probe compares a dtrace_probe_t to a pre-compiled key, which
8211 * consists of input pattern strings and an ops-vector to evaluate them.
8212 * This function returns >0 for match, 0 for no match, and <0 for error.
8213 */
8214 static int
dtrace_match_probe(const dtrace_probe_t * prp,const dtrace_probekey_t * pkp,uint32_t priv,uid_t uid,zoneid_t zoneid)8215 dtrace_match_probe(const dtrace_probe_t *prp, const dtrace_probekey_t *pkp,
8216 uint32_t priv, uid_t uid, zoneid_t zoneid)
8217 {
8218 dtrace_provider_t *pvp = prp->dtpr_provider;
8219 int rv;
8220
8221 if (pvp->dtpv_defunct)
8222 return (0);
8223
8224 if ((rv = pkp->dtpk_pmatch(pvp->dtpv_name, pkp->dtpk_prov, 0)) <= 0)
8225 return (rv);
8226
8227 if ((rv = pkp->dtpk_mmatch(prp->dtpr_mod, pkp->dtpk_mod, 0)) <= 0)
8228 return (rv);
8229
8230 if ((rv = pkp->dtpk_fmatch(prp->dtpr_func, pkp->dtpk_func, 0)) <= 0)
8231 return (rv);
8232
8233 if ((rv = pkp->dtpk_nmatch(prp->dtpr_name, pkp->dtpk_name, 0)) <= 0)
8234 return (rv);
8235
8236 if (dtrace_match_priv(prp, priv, uid, zoneid) == 0)
8237 return (0);
8238
8239 return (rv);
8240 }
8241
/*
 * dtrace_match_glob() is a safe kernel implementation of the gmatch(3GEN)
 * interface for matching a glob pattern 'p' to an input string 's'.  Unlike
 * libc's version, the kernel version only applies to 8-bit ASCII strings.
 * In addition, all of the recursion cases except for '*' matching have been
 * unwound.  For '*', we still implement recursive evaluation, but a depth
 * counter is maintained and matching is aborted if we recurse too deep.
 * The function returns 0 if no match, >0 if match, and <0 if recursion error.
 *
 *   s      input string; NULL is treated as the empty string
 *   p      pattern; a NULL pattern never matches
 *   depth  current recursion depth; once it exceeds
 *          DTRACE_PROBEKEY_MAXDEPTH the function returns -1
 *
 * Pattern elements handled: '[...]' sets (with '!' negation, 'a-z' ranges
 * and '\' escapes), '\' to quote the next character, '?' for any single
 * character, '*' for any run of characters, and literal characters.
 * A malformed pattern (unterminated set, trailing '\') yields no match.
 */
static int
dtrace_match_glob(const char *s, const char *p, int depth)
{
	const char *olds;
	char s1, c;
	int gs;

	if (depth > DTRACE_PROBEKEY_MAXDEPTH)
		return (-1);

	if (s == NULL)
		s = ""; /* treat NULL as empty string */

	/*
	 * Each pass through 'top' consumes one input character (s1) and one
	 * pattern element.  olds remembers where the input stood before the
	 * character was consumed so that '*' can restart from there.
	 */
top:
	olds = s;
	s1 = *s++;

	if (p == NULL)
		return (0);

	/* End of pattern: a match only if the input is exhausted as well. */
	if ((c = *p++) == '\0')
		return (s1 == '\0');

	switch (c) {
	case '[': {
		/*
		 * ok counts the set members that accept s1; lc holds the
		 * previous set member, i.e. the left-hand side of a range.
		 */
		int ok = 0, notflag = 0;
		char lc = '\0';

		/* A set needs an input character to test. */
		if (s1 == '\0')
			return (0);

		if (*p == '!') {
			notflag = 1;
			p++;
		}

		if ((c = *p++) == '\0')
			return (0);

		do {
			/*
			 * A '-' that has a left-hand character and is not
			 * directly followed by ']' introduces a range lc-c.
			 */
			if (c == '-' && lc != '\0' && *p != ']') {
				if ((c = *p++) == '\0')
					return (0);
				if (c == '\\' && (c = *p++) == '\0')
					return (0);

				if (notflag) {
					if (s1 < lc || s1 > c)
						ok++;
					else
						return (0);
				} else if (lc <= s1 && s1 <= c)
					ok++;

			} else if (c == '\\' && (c = *p++) == '\0')
				return (0);

			lc = c; /* save left-hand 'c' for next iteration */

			/*
			 * Single-character test.  Note that this also runs
			 * after a range, against the range's upper bound.
			 * In a negated set any hit fails immediately.
			 */
			if (notflag) {
				if (s1 != c)
					ok++;
				else
					return (0);
			} else if (s1 == c)
				ok++;

			/* Pattern ended before the closing ']'. */
			if ((c = *p++) == '\0')
				return (0);

		} while (c != ']');

		if (ok)
			goto top;

		return (0);
	}

	case '\\':
		/* Quote: the next pattern character is taken literally. */
		if ((c = *p++) == '\0')
			return (0);
		OS_FALLTHROUGH;

	default:
		/* Literal character; must equal the input character. */
		if (c != s1)
			return (0);
		OS_FALLTHROUGH;

	case '?':
		/* Any single character, but not the end of the input. */
		if (s1 != '\0')
			goto top;
		return (0);

	case '*':
		while (*p == '*')
			p++; /* consecutive *'s are identical to a single one */

		/* A trailing '*' matches whatever input remains. */
		if (*p == '\0')
			return (1);

		/*
		 * Try the rest of the pattern against every non-empty suffix
		 * of the input, starting at the character consumed above.
		 * Any non-zero result -- a match or a depth error -- is
		 * passed straight back to the caller.
		 */
		for (s = olds; *s != '\0'; s++) {
			if ((gs = dtrace_match_glob(s, p, depth + 1)) != 0)
				return (gs);
		}

		return (0);
	}
}
8359
/*
 * Exact-match comparator.  Returns 1 only when s is non-NULL and is the very
 * same pointer as p; string contents are not examined.  (Presumably both
 * sides are expected to be strings obtained from dtrace_strref(), which
 * hands out one shared copy per distinct string -- confirm for each caller.)
 */
/*ARGSUSED*/
static int
dtrace_match_string(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (s == p);
}
8367
/*
 * Module-name comparator.  Returns 1 when s starts with p and the character
 * of s immediately after that prefix is either '.' or the terminating NUL,
 * i.e. p names s exactly or is a dot-delimited leading part of it.
 * Returns 0 otherwise, including when either argument is NULL.
 */
/*ARGSUSED*/
static int
dtrace_match_module(const char *s, const char *p, int depth)
{
#pragma unused(depth) /* __APPLE__ */
	size_t plen;
	char after;

	if (s == NULL || p == NULL)
		return (0);

	plen = strlen(p);
	if (strncmp(p, s, plen) != 0)
		return (0);

	/* s is at least plen characters long here, so s[plen] is valid. */
	after = s[plen];

	return (after == '.' || after == '\0');
}
8387
/*
 * Comparator used for an empty pattern: every input matches, so the
 * arguments are not examined at all.
 */
/*ARGSUSED*/
static int
dtrace_match_nul(const char *s, const char *p, int depth)
{
#pragma unused(s, p, depth) /* __APPLE__ */
	/* always match the empty pattern */
	return (1);
}
8395
/*
 * Comparator that ignores the pattern and matches any input string that is
 * neither NULL nor empty.
 */
/*ARGSUSED*/
static int
dtrace_match_nonzero(const char *s, const char *p, int depth)
{
#pragma unused(p, depth) /* __APPLE__ */
	if (s == NULL)
		return (0);

	return (*s != '\0');
}
8403
8404 static int
dtrace_match(const dtrace_probekey_t * pkp,uint32_t priv,uid_t uid,zoneid_t zoneid,int (* matched)(dtrace_probe_t *,void *,void *),void * arg1,void * arg2)8405 dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid,
8406 zoneid_t zoneid, int (*matched)(dtrace_probe_t *, void *, void *), void *arg1, void *arg2)
8407 {
8408 dtrace_probe_t *probe;
8409 dtrace_provider_t prov_template = {
8410 .dtpv_name = (char *)(uintptr_t)pkp->dtpk_prov
8411 };
8412
8413 dtrace_probe_t template = {
8414 .dtpr_provider = &prov_template,
8415 .dtpr_mod = (char *)(uintptr_t)pkp->dtpk_mod,
8416 .dtpr_func = (char *)(uintptr_t)pkp->dtpk_func,
8417 .dtpr_name = (char *)(uintptr_t)pkp->dtpk_name
8418 };
8419
8420 dtrace_hash_t *hash = NULL;
8421 int len, rc, best = INT_MAX, nmatched = 0;
8422 dtrace_id_t i;
8423
8424 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8425
8426 /*
8427 * If the probe ID is specified in the key, just lookup by ID and
8428 * invoke the match callback once if a matching probe is found.
8429 */
8430 if (pkp->dtpk_id != DTRACE_IDNONE) {
8431 if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL &&
8432 dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) {
8433 if ((*matched)(probe, arg1, arg2) == DTRACE_MATCH_FAIL)
8434 return (DTRACE_MATCH_FAIL);
8435 nmatched++;
8436 }
8437 return (nmatched);
8438 }
8439
8440 /*
8441 * We want to find the most distinct of the provider name, module name,
8442 * function name, and name. So for each one that is not a glob
8443 * pattern or empty string, we perform a lookup in the corresponding
8444 * hash and use the hash table with the fewest collisions to do our
8445 * search.
8446 */
8447 if (pkp->dtpk_pmatch == &dtrace_match_string &&
8448 (len = dtrace_hash_collisions(dtrace_byprov, &template)) < best) {
8449 best = len;
8450 hash = dtrace_byprov;
8451 }
8452
8453 if (pkp->dtpk_mmatch == &dtrace_match_string &&
8454 (len = dtrace_hash_collisions(dtrace_bymod, &template)) < best) {
8455 best = len;
8456 hash = dtrace_bymod;
8457 }
8458
8459 if (pkp->dtpk_fmatch == &dtrace_match_string &&
8460 (len = dtrace_hash_collisions(dtrace_byfunc, &template)) < best) {
8461 best = len;
8462 hash = dtrace_byfunc;
8463 }
8464
8465 if (pkp->dtpk_nmatch == &dtrace_match_string &&
8466 (len = dtrace_hash_collisions(dtrace_byname, &template)) < best) {
8467 best = len;
8468 hash = dtrace_byname;
8469 }
8470
8471 /*
8472 * If we did not select a hash table, iterate over every probe and
8473 * invoke our callback for each one that matches our input probe key.
8474 */
8475 if (hash == NULL) {
8476 for (i = 0; i < (dtrace_id_t)dtrace_nprobes; i++) {
8477 if ((probe = dtrace_probes[i]) == NULL ||
8478 dtrace_match_probe(probe, pkp, priv, uid,
8479 zoneid) <= 0)
8480 continue;
8481
8482 nmatched++;
8483
8484 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8485 if (rc == DTRACE_MATCH_FAIL)
8486 return (DTRACE_MATCH_FAIL);
8487 break;
8488 }
8489 }
8490
8491 return (nmatched);
8492 }
8493
8494 /*
8495 * If we selected a hash table, iterate over each probe of the same key
8496 * name and invoke the callback for every probe that matches the other
8497 * attributes of our input probe key.
8498 */
8499 for (probe = dtrace_hash_lookup(hash, &template); probe != NULL;
8500 probe = *(DTRACE_HASHNEXT(hash, probe))) {
8501
8502 if (dtrace_match_probe(probe, pkp, priv, uid, zoneid) <= 0)
8503 continue;
8504
8505 nmatched++;
8506
8507 if ((rc = (*matched)(probe, arg1, arg2)) != DTRACE_MATCH_NEXT) {
8508 if (rc == DTRACE_MATCH_FAIL)
8509 return (DTRACE_MATCH_FAIL);
8510 break;
8511 }
8512 }
8513
8514 return (nmatched);
8515 }
8516
8517 /*
8518 * Return the function pointer dtrace_probecmp() should use to compare the
8519 * specified pattern with a string. For NULL or empty patterns, we select
8520 * dtrace_match_nul(). For glob pattern strings, we use dtrace_match_glob().
8521 * For non-empty non-glob strings, we use dtrace_match_string().
8522 */
8523 static dtrace_probekey_f *
dtrace_probekey_func(const char * p)8524 dtrace_probekey_func(const char *p)
8525 {
8526 char c;
8527
8528 if (p == NULL || *p == '\0')
8529 return (&dtrace_match_nul);
8530
8531 while ((c = *p++) != '\0') {
8532 if (c == '[' || c == '?' || c == '*' || c == '\\')
8533 return (&dtrace_match_glob);
8534 }
8535
8536 return (&dtrace_match_string);
8537 }
8538
8539 static dtrace_probekey_f *
dtrace_probekey_module_func(const char * p)8540 dtrace_probekey_module_func(const char *p)
8541 {
8542 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8543
8544 dtrace_probekey_f *f = dtrace_probekey_func(p);
8545 if (f == &dtrace_match_string) {
8546 dtrace_probe_t template = {
8547 .dtpr_mod = (char *)(uintptr_t)p,
8548 };
8549 if (dtrace_hash_lookup(dtrace_bymod, &template) == NULL) {
8550 return (&dtrace_match_module);
8551 }
8552 return (&dtrace_match_string);
8553 }
8554 return f;
8555 }
8556
8557 /*
8558 * Build a probe comparison key for use with dtrace_match_probe() from the
8559 * given probe description. By convention, a null key only matches anchored
8560 * probes: if each field is the empty string, reset dtpk_fmatch to
8561 * dtrace_match_nonzero().
8562 */
8563 static void
dtrace_probekey(const dtrace_probedesc_t * pdp,dtrace_probekey_t * pkp)8564 dtrace_probekey(const dtrace_probedesc_t *pdp, dtrace_probekey_t *pkp)
8565 {
8566
8567 pkp->dtpk_prov = dtrace_strref(pdp->dtpd_provider);
8568 pkp->dtpk_pmatch = dtrace_probekey_func(pdp->dtpd_provider);
8569
8570 pkp->dtpk_mod = dtrace_strref(pdp->dtpd_mod);
8571 pkp->dtpk_mmatch = dtrace_probekey_module_func(pdp->dtpd_mod);
8572
8573 pkp->dtpk_func = dtrace_strref(pdp->dtpd_func);
8574 pkp->dtpk_fmatch = dtrace_probekey_func(pdp->dtpd_func);
8575
8576 pkp->dtpk_name = dtrace_strref(pdp->dtpd_name);
8577 pkp->dtpk_nmatch = dtrace_probekey_func(pdp->dtpd_name);
8578
8579 pkp->dtpk_id = pdp->dtpd_id;
8580
8581 if (pkp->dtpk_id == DTRACE_IDNONE &&
8582 pkp->dtpk_pmatch == &dtrace_match_nul &&
8583 pkp->dtpk_mmatch == &dtrace_match_nul &&
8584 pkp->dtpk_fmatch == &dtrace_match_nul &&
8585 pkp->dtpk_nmatch == &dtrace_match_nul)
8586 pkp->dtpk_fmatch = &dtrace_match_nonzero;
8587 }
8588
8589 static void
dtrace_probekey_release(dtrace_probekey_t * pkp)8590 dtrace_probekey_release(dtrace_probekey_t *pkp)
8591 {
8592 dtrace_strunref(pkp->dtpk_prov);
8593 dtrace_strunref(pkp->dtpk_mod);
8594 dtrace_strunref(pkp->dtpk_func);
8595 dtrace_strunref(pkp->dtpk_name);
8596 }
8597
8598 static int
dtrace_cond_provider_match(dtrace_probedesc_t * desc,void * data)8599 dtrace_cond_provider_match(dtrace_probedesc_t *desc, void *data)
8600 {
8601 if (desc == NULL)
8602 return 1;
8603
8604 dtrace_probekey_f *func = dtrace_probekey_func(desc->dtpd_provider);
8605
8606 return func((char*)data, desc->dtpd_provider, 0);
8607 }
8608
8609 /*
8610 * DTrace Provider-to-Framework API Functions
8611 *
8612 * These functions implement much of the Provider-to-Framework API, as
8613 * described in <sys/dtrace.h>. The parts of the API not in this section are
8614 * the functions in the API for probe management (found below), and
8615 * dtrace_probe() itself (found above).
8616 */
8617
8618 /*
8619 * Register the calling provider with the DTrace framework. This should
8620 * generally be called by DTrace providers in their attach(9E) entry point.
8621 */
8622 int
dtrace_register(const char * name,const dtrace_pattr_t * pap,uint32_t priv,cred_t * cr,const dtrace_pops_t * pops,void * arg,dtrace_provider_id_t * idp)8623 dtrace_register(const char *name, const dtrace_pattr_t *pap, uint32_t priv,
8624 cred_t *cr, const dtrace_pops_t *pops, void *arg, dtrace_provider_id_t *idp)
8625 {
8626 dtrace_provider_t *provider;
8627
8628 if (name == NULL || pap == NULL || pops == NULL || idp == NULL) {
8629 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8630 "arguments", name ? name : "<NULL>");
8631 return (EINVAL);
8632 }
8633
8634 if (name[0] == '\0' || dtrace_badname(name)) {
8635 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8636 "provider name", name);
8637 return (EINVAL);
8638 }
8639
8640 if ((pops->dtps_provide == NULL && pops->dtps_provide_module == NULL) ||
8641 pops->dtps_enable == NULL || pops->dtps_disable == NULL ||
8642 pops->dtps_destroy == NULL ||
8643 ((pops->dtps_resume == NULL) != (pops->dtps_suspend == NULL))) {
8644 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8645 "provider ops", name);
8646 return (EINVAL);
8647 }
8648
8649 if (dtrace_badattr(&pap->dtpa_provider) ||
8650 dtrace_badattr(&pap->dtpa_mod) ||
8651 dtrace_badattr(&pap->dtpa_func) ||
8652 dtrace_badattr(&pap->dtpa_name) ||
8653 dtrace_badattr(&pap->dtpa_args)) {
8654 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8655 "provider attributes", name);
8656 return (EINVAL);
8657 }
8658
8659 if (priv & ~DTRACE_PRIV_ALL) {
8660 cmn_err(CE_WARN, "failed to register provider '%s': invalid "
8661 "privilege attributes", name);
8662 return (EINVAL);
8663 }
8664
8665 if ((priv & DTRACE_PRIV_KERNEL) &&
8666 (priv & (DTRACE_PRIV_USER | DTRACE_PRIV_OWNER)) &&
8667 pops->dtps_usermode == NULL) {
8668 cmn_err(CE_WARN, "failed to register provider '%s': need "
8669 "dtps_usermode() op for given privilege attributes", name);
8670 return (EINVAL);
8671 }
8672
8673 provider = kmem_zalloc(sizeof (dtrace_provider_t), KM_SLEEP);
8674
8675 provider->dtpv_attr = *pap;
8676 provider->dtpv_priv.dtpp_flags = priv;
8677 if (cr != NULL) {
8678 provider->dtpv_priv.dtpp_uid = crgetuid(cr);
8679 provider->dtpv_priv.dtpp_zoneid = crgetzoneid(cr);
8680 }
8681 provider->dtpv_pops = *pops;
8682
8683 if (pops->dtps_provide == NULL) {
8684 ASSERT(pops->dtps_provide_module != NULL);
8685 provider->dtpv_pops.dtps_provide = dtrace_provide_nullop;
8686 }
8687
8688 if (pops->dtps_provide_module == NULL) {
8689 ASSERT(pops->dtps_provide != NULL);
8690 provider->dtpv_pops.dtps_provide_module =
8691 dtrace_provide_module_nullop;
8692 }
8693
8694 if (pops->dtps_suspend == NULL) {
8695 ASSERT(pops->dtps_resume == NULL);
8696 provider->dtpv_pops.dtps_suspend = dtrace_suspend_nullop;
8697 provider->dtpv_pops.dtps_resume = dtrace_resume_nullop;
8698 }
8699
8700 provider->dtpv_arg = arg;
8701 *idp = (dtrace_provider_id_t)provider;
8702
8703 if (pops == &dtrace_provider_ops) {
8704 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8705 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8706
8707 provider->dtpv_name = dtrace_strref(name);
8708
8709 ASSERT(dtrace_anon.dta_enabling == NULL);
8710
8711 /*
8712 * We make sure that the DTrace provider is at the head of
8713 * the provider chain.
8714 */
8715 provider->dtpv_next = dtrace_provider;
8716 dtrace_provider = provider;
8717 return (0);
8718 }
8719
8720 lck_mtx_lock(&dtrace_provider_lock);
8721 lck_mtx_lock(&dtrace_lock);
8722
8723 provider->dtpv_name = dtrace_strref(name);
8724
8725 /*
8726 * If there is at least one provider registered, we'll add this
8727 * provider after the first provider.
8728 */
8729 if (dtrace_provider != NULL) {
8730 provider->dtpv_next = dtrace_provider->dtpv_next;
8731 dtrace_provider->dtpv_next = provider;
8732 } else {
8733 dtrace_provider = provider;
8734 }
8735
8736 if (dtrace_retained != NULL) {
8737 dtrace_enabling_provide(provider);
8738
8739 /*
8740 * Now we need to call dtrace_enabling_matchall_with_cond() --
8741 * with a condition matching the provider name we just added,
8742 * which will acquire cpu_lock and dtrace_lock. We therefore need
8743 * to drop all of our locks before calling into it...
8744 */
8745 lck_mtx_unlock(&dtrace_lock);
8746 lck_mtx_unlock(&dtrace_provider_lock);
8747
8748 dtrace_match_cond_t cond = {dtrace_cond_provider_match, provider->dtpv_name};
8749 dtrace_enabling_matchall_with_cond(&cond);
8750
8751 return (0);
8752 }
8753
8754 lck_mtx_unlock(&dtrace_lock);
8755 lck_mtx_unlock(&dtrace_provider_lock);
8756
8757 return (0);
8758 }
8759
8760 /*
8761 * Unregister the specified provider from the DTrace framework. This should
8762 * generally be called by DTrace providers in their detach(9E) entry point.
8763 */
8764 int
dtrace_unregister(dtrace_provider_id_t id)8765 dtrace_unregister(dtrace_provider_id_t id)
8766 {
8767 dtrace_provider_t *old = (dtrace_provider_t *)id;
8768 dtrace_provider_t *prev = NULL;
8769 int self = 0;
8770 dtrace_probe_t *probe, *first = NULL, *next = NULL;
8771 dtrace_probe_t template = {
8772 .dtpr_provider = old
8773 };
8774
8775 if (old->dtpv_pops.dtps_enable ==
8776 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) {
8777 /*
8778 * If DTrace itself is the provider, we're called with locks
8779 * already held.
8780 */
8781 ASSERT(old == dtrace_provider);
8782 ASSERT(dtrace_devi != NULL);
8783 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
8784 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
8785 self = 1;
8786
8787 if (dtrace_provider->dtpv_next != NULL) {
8788 /*
8789 * There's another provider here; return failure.
8790 */
8791 return (EBUSY);
8792 }
8793 } else {
8794 lck_mtx_lock(&dtrace_provider_lock);
8795 lck_mtx_lock(&mod_lock);
8796 lck_mtx_lock(&dtrace_lock);
8797 }
8798
8799 /*
8800 * If anyone has /dev/dtrace open, or if there are anonymous enabled
8801 * probes, we refuse to let providers slither away, unless this
8802 * provider has already been explicitly invalidated.
8803 */
8804 if (!old->dtpv_defunct &&
8805 (dtrace_opens || (dtrace_anon.dta_state != NULL &&
8806 dtrace_anon.dta_state->dts_necbs > 0))) {
8807 if (!self) {
8808 lck_mtx_unlock(&dtrace_lock);
8809 lck_mtx_unlock(&mod_lock);
8810 lck_mtx_unlock(&dtrace_provider_lock);
8811 }
8812 return (EBUSY);
8813 }
8814
8815 /*
8816 * Attempt to destroy the probes associated with this provider.
8817 */
8818 if (old->dtpv_ecb_count!=0) {
8819 /*
8820 * We have at least one ECB; we can't remove this provider.
8821 */
8822 if (!self) {
8823 lck_mtx_unlock(&dtrace_lock);
8824 lck_mtx_unlock(&mod_lock);
8825 lck_mtx_unlock(&dtrace_provider_lock);
8826 }
8827 return (EBUSY);
8828 }
8829
8830 /*
8831 * All of the probes for this provider are disabled; we can safely
8832 * remove all of them from their hash chains and from the probe array.
8833 */
8834 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8835 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8836 if (probe->dtpr_provider != old)
8837 continue;
8838
8839 dtrace_probes[probe->dtpr_id - 1] = NULL;
8840 old->dtpv_probe_count--;
8841
8842 dtrace_hash_remove(dtrace_bymod, probe);
8843 dtrace_hash_remove(dtrace_byfunc, probe);
8844 dtrace_hash_remove(dtrace_byname, probe);
8845
8846 if (first == NULL) {
8847 first = probe;
8848 probe->dtpr_nextmod = NULL;
8849 } else {
8850 /*
8851 * Use nextmod as the chain of probes to remove
8852 */
8853 probe->dtpr_nextmod = first;
8854 first = probe;
8855 }
8856 }
8857
8858 for (probe = first; probe != NULL; probe = next) {
8859 next = probe->dtpr_nextmod;
8860 dtrace_hash_remove(dtrace_byprov, probe);
8861 }
8862
8863 /*
8864 * The provider's probes have been removed from the hash chains and
8865 * from the probe array. Now issue a dtrace_sync() to be sure that
8866 * everyone has cleared out from any probe array processing.
8867 */
8868 dtrace_sync();
8869
8870 for (probe = first; probe != NULL; probe = next) {
8871 next = probe->dtpr_nextmod;
8872
8873 old->dtpv_pops.dtps_destroy(old->dtpv_arg, probe->dtpr_id,
8874 probe->dtpr_arg);
8875 dtrace_strunref(probe->dtpr_mod);
8876 dtrace_strunref(probe->dtpr_func);
8877 dtrace_strunref(probe->dtpr_name);
8878 vmem_free(dtrace_arena, (void *)(uintptr_t)(probe->dtpr_id), 1);
8879 zfree(dtrace_probe_t_zone, probe);
8880 }
8881
8882 if ((prev = dtrace_provider) == old) {
8883 ASSERT(self || dtrace_devi == NULL);
8884 ASSERT(old->dtpv_next == NULL || dtrace_devi == NULL);
8885 dtrace_provider = old->dtpv_next;
8886 } else {
8887 while (prev != NULL && prev->dtpv_next != old)
8888 prev = prev->dtpv_next;
8889
8890 if (prev == NULL) {
8891 panic("attempt to unregister non-existent "
8892 "dtrace provider %p\n", (void *)id);
8893 }
8894
8895 prev->dtpv_next = old->dtpv_next;
8896 }
8897
8898 dtrace_strunref(old->dtpv_name);
8899
8900 if (!self) {
8901 lck_mtx_unlock(&dtrace_lock);
8902 lck_mtx_unlock(&mod_lock);
8903 lck_mtx_unlock(&dtrace_provider_lock);
8904 }
8905
8906 kmem_free(old, sizeof (dtrace_provider_t));
8907
8908 return (0);
8909 }
8910
8911 /*
8912 * Invalidate the specified provider. All subsequent probe lookups for the
8913 * specified provider will fail, but its probes will not be removed.
8914 */
8915 void
dtrace_invalidate(dtrace_provider_id_t id)8916 dtrace_invalidate(dtrace_provider_id_t id)
8917 {
8918 dtrace_provider_t *pvp = (dtrace_provider_t *)id;
8919
8920 ASSERT(pvp->dtpv_pops.dtps_enable !=
8921 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8922
8923 lck_mtx_lock(&dtrace_provider_lock);
8924 lck_mtx_lock(&dtrace_lock);
8925
8926 pvp->dtpv_defunct = 1;
8927
8928 lck_mtx_unlock(&dtrace_lock);
8929 lck_mtx_unlock(&dtrace_provider_lock);
8930 }
8931
8932 /*
8933 * Indicate whether or not DTrace has attached.
8934 */
8935 int
dtrace_attached(void)8936 dtrace_attached(void)
8937 {
8938 /*
8939 * dtrace_provider will be non-NULL iff the DTrace driver has
8940 * attached. (It's non-NULL because DTrace is always itself a
8941 * provider.)
8942 */
8943 return (dtrace_provider != NULL);
8944 }
8945
8946 /*
8947 * Remove all the unenabled probes for the given provider. This function is
8948 * not unlike dtrace_unregister(), except that it doesn't remove the provider
8949 * -- just as many of its associated probes as it can.
8950 */
8951 int
dtrace_condense(dtrace_provider_id_t id)8952 dtrace_condense(dtrace_provider_id_t id)
8953 {
8954 dtrace_provider_t *prov = (dtrace_provider_t *)id;
8955 dtrace_probe_t *probe, *first = NULL;
8956 dtrace_probe_t template = {
8957 .dtpr_provider = prov
8958 };
8959
8960 /*
8961 * Make sure this isn't the dtrace provider itself.
8962 */
8963 ASSERT(prov->dtpv_pops.dtps_enable !=
8964 (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop);
8965
8966 lck_mtx_lock(&dtrace_provider_lock);
8967 lck_mtx_lock(&dtrace_lock);
8968
8969 /*
8970 * Attempt to destroy the probes associated with this provider.
8971 */
8972 for (probe = dtrace_hash_lookup(dtrace_byprov, &template); probe != NULL;
8973 probe = *(DTRACE_HASHNEXT(dtrace_byprov, probe))) {
8974
8975 if (probe->dtpr_provider != prov)
8976 continue;
8977
8978 if (probe->dtpr_ecb != NULL)
8979 continue;
8980
8981 dtrace_probes[probe->dtpr_id - 1] = NULL;
8982 prov->dtpv_probe_count--;
8983
8984 dtrace_hash_remove(dtrace_bymod, probe);
8985 dtrace_hash_remove(dtrace_byfunc, probe);
8986 dtrace_hash_remove(dtrace_byname, probe);
8987
8988 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
8989 probe->dtpr_arg);
8990 dtrace_strunref(probe->dtpr_mod);
8991 dtrace_strunref(probe->dtpr_func);
8992 dtrace_strunref(probe->dtpr_name);
8993 if (first == NULL) {
8994 first = probe;
8995 probe->dtpr_nextmod = NULL;
8996 } else {
8997 /*
8998 * Use nextmod as the chain of probes to remove
8999 */
9000 probe->dtpr_nextmod = first;
9001 first = probe;
9002 }
9003 }
9004
9005 for (probe = first; probe != NULL; probe = first) {
9006 first = probe->dtpr_nextmod;
9007 dtrace_hash_remove(dtrace_byprov, probe);
9008 vmem_free(dtrace_arena, (void *)((uintptr_t)probe->dtpr_id), 1);
9009 zfree(dtrace_probe_t_zone, probe);
9010 }
9011
9012 lck_mtx_unlock(&dtrace_lock);
9013 lck_mtx_unlock(&dtrace_provider_lock);
9014
9015 return (0);
9016 }
9017
9018 /*
9019 * DTrace Probe Management Functions
9020 *
9021 * The functions in this section perform the DTrace probe management,
9022 * including functions to create probes, look-up probes, and call into the
9023 * providers to request that probes be provided. Some of these functions are
9024 * in the Provider-to-Framework API; these functions can be identified by the
9025 * fact that they are not declared "static".
9026 */
9027
9028 /*
9029 * Create a probe with the specified module name, function name, and name.
9030 */
9031 dtrace_id_t
dtrace_probe_create(dtrace_provider_id_t prov,const char * mod,const char * func,const char * name,int aframes,void * arg)9032 dtrace_probe_create(dtrace_provider_id_t prov, const char *mod,
9033 const char *func, const char *name, int aframes, void *arg)
9034 {
9035 dtrace_probe_t *probe, **probes;
9036 dtrace_provider_t *provider = (dtrace_provider_t *)prov;
9037 dtrace_id_t id;
9038
9039 if (provider == dtrace_provider) {
9040 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9041 } else {
9042 lck_mtx_lock(&dtrace_lock);
9043 }
9044
9045 id = (dtrace_id_t)(uintptr_t)vmem_alloc(dtrace_arena, 1,
9046 VM_BESTFIT | VM_SLEEP);
9047
9048 probe = zalloc_flags(dtrace_probe_t_zone, Z_WAITOK | Z_ZERO);
9049
9050 probe->dtpr_id = id;
9051 probe->dtpr_gen = dtrace_probegen++;
9052 probe->dtpr_mod = dtrace_strref(mod);
9053 probe->dtpr_func = dtrace_strref(func);
9054 probe->dtpr_name = dtrace_strref(name);
9055 probe->dtpr_arg = arg;
9056 probe->dtpr_aframes = aframes;
9057 probe->dtpr_provider = provider;
9058
9059 dtrace_hash_add(dtrace_byprov, probe);
9060 dtrace_hash_add(dtrace_bymod, probe);
9061 dtrace_hash_add(dtrace_byfunc, probe);
9062 dtrace_hash_add(dtrace_byname, probe);
9063
9064 if (id - 1 >= (dtrace_id_t)dtrace_nprobes) {
9065 size_t osize = dtrace_nprobes * sizeof (dtrace_probe_t *);
9066 size_t nsize = osize * 2;
9067
9068 probes = kmem_zalloc(nsize, KM_SLEEP);
9069
9070 dtrace_probe_t **oprobes = dtrace_probes;
9071
9072 bcopy(oprobes, probes, osize);
9073 dtrace_membar_producer();
9074 dtrace_probes = probes;
9075
9076 dtrace_sync();
9077
9078 /*
9079 * All CPUs are now seeing the new probes array; we can
9080 * safely free the old array.
9081 */
9082 kmem_free(oprobes, osize);
9083 dtrace_nprobes *= 2;
9084
9085 ASSERT(id - 1 < (dtrace_id_t)dtrace_nprobes);
9086 }
9087
9088 ASSERT(dtrace_probes[id - 1] == NULL);
9089 dtrace_probes[id - 1] = probe;
9090 provider->dtpv_probe_count++;
9091
9092 if (provider != dtrace_provider)
9093 lck_mtx_unlock(&dtrace_lock);
9094
9095 return (id);
9096 }
9097
9098 static dtrace_probe_t *
dtrace_probe_lookup_id(dtrace_id_t id)9099 dtrace_probe_lookup_id(dtrace_id_t id)
9100 {
9101 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9102
9103 if (id == 0 || id > (dtrace_id_t)dtrace_nprobes)
9104 return (NULL);
9105
9106 return (dtrace_probes[id - 1]);
9107 }
9108
9109 static int
dtrace_probe_lookup_match(dtrace_probe_t * probe,void * arg1,void * arg2)9110 dtrace_probe_lookup_match(dtrace_probe_t *probe, void *arg1, void *arg2)
9111 {
9112 #pragma unused(arg2)
9113 *((dtrace_id_t *)arg1) = probe->dtpr_id;
9114
9115 return (DTRACE_MATCH_DONE);
9116 }
9117
9118 /*
9119 * Look up a probe based on provider and one or more of module name, function
9120 * name and probe name.
9121 */
9122 dtrace_id_t
dtrace_probe_lookup(dtrace_provider_id_t prid,const char * mod,const char * func,const char * name)9123 dtrace_probe_lookup(dtrace_provider_id_t prid, const char *mod,
9124 const char *func, const char *name)
9125 {
9126 dtrace_probekey_t pkey;
9127 dtrace_id_t id;
9128 int match;
9129
9130 lck_mtx_lock(&dtrace_lock);
9131
9132 pkey.dtpk_prov = dtrace_strref(((dtrace_provider_t *)prid)->dtpv_name);
9133 pkey.dtpk_pmatch = &dtrace_match_string;
9134 pkey.dtpk_mod = dtrace_strref(mod);
9135 pkey.dtpk_mmatch = mod ? &dtrace_match_string : &dtrace_match_nul;
9136 pkey.dtpk_func = dtrace_strref(func);
9137 pkey.dtpk_fmatch = func ? &dtrace_match_string : &dtrace_match_nul;
9138 pkey.dtpk_name = dtrace_strref(name);
9139 pkey.dtpk_nmatch = name ? &dtrace_match_string : &dtrace_match_nul;
9140 pkey.dtpk_id = DTRACE_IDNONE;
9141
9142 match = dtrace_match(&pkey, DTRACE_PRIV_ALL, 0, 0,
9143 dtrace_probe_lookup_match, &id, NULL);
9144
9145 dtrace_probekey_release(&pkey);
9146
9147 lck_mtx_unlock(&dtrace_lock);
9148
9149 ASSERT(match == 1 || match == 0);
9150 return (match ? id : 0);
9151 }
9152
9153 /*
9154 * Returns the probe argument associated with the specified probe.
9155 */
9156 void *
dtrace_probe_arg(dtrace_provider_id_t id,dtrace_id_t pid)9157 dtrace_probe_arg(dtrace_provider_id_t id, dtrace_id_t pid)
9158 {
9159 dtrace_probe_t *probe;
9160 void *rval = NULL;
9161
9162 lck_mtx_lock(&dtrace_lock);
9163
9164 if ((probe = dtrace_probe_lookup_id(pid)) != NULL &&
9165 probe->dtpr_provider == (dtrace_provider_t *)id)
9166 rval = probe->dtpr_arg;
9167
9168 lck_mtx_unlock(&dtrace_lock);
9169
9170 return (rval);
9171 }
9172
9173 /*
9174 * Copy a probe into a probe description.
9175 */
9176 static void
dtrace_probe_description(const dtrace_probe_t * prp,dtrace_probedesc_t * pdp)9177 dtrace_probe_description(const dtrace_probe_t *prp, dtrace_probedesc_t *pdp)
9178 {
9179 bzero(pdp, sizeof (dtrace_probedesc_t));
9180 pdp->dtpd_id = prp->dtpr_id;
9181
9182 /* APPLE NOTE: Darwin employs size bounded string operation. */
9183 (void) strlcpy(pdp->dtpd_provider,
9184 prp->dtpr_provider->dtpv_name, DTRACE_PROVNAMELEN);
9185
9186 (void) strlcpy(pdp->dtpd_mod, prp->dtpr_mod, DTRACE_MODNAMELEN);
9187 (void) strlcpy(pdp->dtpd_func, prp->dtpr_func, DTRACE_FUNCNAMELEN);
9188 (void) strlcpy(pdp->dtpd_name, prp->dtpr_name, DTRACE_NAMELEN);
9189 }
9190
9191 /*
9192 * Called to indicate that a probe -- or probes -- should be provided by a
9193 * specfied provider. If the specified description is NULL, the provider will
9194 * be told to provide all of its probes. (This is done whenever a new
9195 * consumer comes along, or whenever a retained enabling is to be matched.) If
9196 * the specified description is non-NULL, the provider is given the
9197 * opportunity to dynamically provide the specified probe, allowing providers
9198 * to support the creation of probes on-the-fly. (So-called _autocreated_
9199 * probes.) If the provider is NULL, the operations will be applied to all
9200 * providers; if the provider is non-NULL the operations will only be applied
9201 * to the specified provider. The dtrace_provider_lock must be held, and the
9202 * dtrace_lock must _not_ be held -- the provider's dtps_provide() operation
9203 * will need to grab the dtrace_lock when it reenters the framework through
9204 * dtrace_probe_lookup(), dtrace_probe_create(), etc.
9205 */
9206 static void
dtrace_probe_provide(dtrace_probedesc_t * desc,dtrace_provider_t * prv)9207 dtrace_probe_provide(dtrace_probedesc_t *desc, dtrace_provider_t *prv)
9208 {
9209 struct modctl *ctl;
9210 int all = 0;
9211
9212 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
9213
9214 if (prv == NULL) {
9215 all = 1;
9216 prv = dtrace_provider;
9217 }
9218
9219 do {
9220 /*
9221 * First, call the blanket provide operation.
9222 */
9223 prv->dtpv_pops.dtps_provide(prv->dtpv_arg, desc);
9224
9225 /*
9226 * Now call the per-module provide operation. We will grab
9227 * mod_lock to prevent the list from being modified. Note
9228 * that this also prevents the mod_busy bits from changing.
9229 * (mod_busy can only be changed with mod_lock held.)
9230 */
9231 lck_mtx_lock(&mod_lock);
9232
9233 ctl = dtrace_modctl_list;
9234 while (ctl) {
9235 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
9236 ctl = ctl->mod_next;
9237 }
9238
9239 lck_mtx_unlock(&mod_lock);
9240 } while (all && (prv = prv->dtpv_next) != NULL);
9241 }
9242
9243 /*
9244 * Iterate over each probe, and call the Framework-to-Provider API function
9245 * denoted by offs.
9246 */
9247 static void
dtrace_probe_foreach(uintptr_t offs)9248 dtrace_probe_foreach(uintptr_t offs)
9249 {
9250 dtrace_provider_t *prov;
9251 void (*func)(void *, dtrace_id_t, void *);
9252 dtrace_probe_t *probe;
9253 dtrace_icookie_t cookie;
9254 int i;
9255
9256 /*
9257 * We disable interrupts to walk through the probe array. This is
9258 * safe -- the dtrace_sync() in dtrace_unregister() assures that we
9259 * won't see stale data.
9260 */
9261 cookie = dtrace_interrupt_disable();
9262
9263 for (i = 0; i < dtrace_nprobes; i++) {
9264 if ((probe = dtrace_probes[i]) == NULL)
9265 continue;
9266
9267 if (probe->dtpr_ecb == NULL) {
9268 /*
9269 * This probe isn't enabled -- don't call the function.
9270 */
9271 continue;
9272 }
9273
9274 prov = probe->dtpr_provider;
9275 func = *((void(**)(void *, dtrace_id_t, void *))
9276 ((uintptr_t)&prov->dtpv_pops + offs));
9277
9278 func(prov->dtpv_arg, i + 1, probe->dtpr_arg);
9279 }
9280
9281 dtrace_interrupt_enable(cookie);
9282 }
9283
9284 static int
dtrace_probe_enable(const dtrace_probedesc_t * desc,dtrace_enabling_t * enab,dtrace_ecbdesc_t * ep)9285 dtrace_probe_enable(const dtrace_probedesc_t *desc, dtrace_enabling_t *enab, dtrace_ecbdesc_t *ep)
9286 {
9287 dtrace_probekey_t pkey;
9288 uint32_t priv;
9289 uid_t uid;
9290 zoneid_t zoneid;
9291 int err;
9292
9293 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
9294
9295 dtrace_ecb_create_cache = NULL;
9296
9297 if (desc == NULL) {
9298 /*
9299 * If we're passed a NULL description, we're being asked to
9300 * create an ECB with a NULL probe.
9301 */
9302 (void) dtrace_ecb_create_enable(NULL, enab, ep);
9303 return (0);
9304 }
9305
9306 dtrace_probekey(desc, &pkey);
9307 dtrace_cred2priv(enab->dten_vstate->dtvs_state->dts_cred.dcr_cred,
9308 &priv, &uid, &zoneid);
9309
9310 err = dtrace_match(&pkey, priv, uid, zoneid, dtrace_ecb_create_enable, enab, ep);
9311
9312 dtrace_probekey_release(&pkey);
9313
9314 return err;
9315 }
9316
9317 /*
9318 * DTrace Helper Provider Functions
9319 */
9320 static void
dtrace_dofattr2attr(dtrace_attribute_t * attr,const dof_attr_t dofattr)9321 dtrace_dofattr2attr(dtrace_attribute_t *attr, const dof_attr_t dofattr)
9322 {
9323 attr->dtat_name = DOF_ATTR_NAME(dofattr);
9324 attr->dtat_data = DOF_ATTR_DATA(dofattr);
9325 attr->dtat_class = DOF_ATTR_CLASS(dofattr);
9326 }
9327
9328 static void
dtrace_dofprov2hprov(dtrace_helper_provdesc_t * hprov,const dof_provider_t * dofprov,char * strtab)9329 dtrace_dofprov2hprov(dtrace_helper_provdesc_t *hprov,
9330 const dof_provider_t *dofprov, char *strtab)
9331 {
9332 hprov->dthpv_provname = strtab + dofprov->dofpv_name;
9333 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_provider,
9334 dofprov->dofpv_provattr);
9335 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_mod,
9336 dofprov->dofpv_modattr);
9337 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_func,
9338 dofprov->dofpv_funcattr);
9339 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_name,
9340 dofprov->dofpv_nameattr);
9341 dtrace_dofattr2attr(&hprov->dthpv_pattr.dtpa_args,
9342 dofprov->dofpv_argsattr);
9343 }
9344
9345 static void
dtrace_helper_provide_one(dof_helper_t * dhp,dof_sec_t * sec,proc_t * p)9346 dtrace_helper_provide_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9347 {
9348 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9349 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9350 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
9351 dof_provider_t *provider;
9352 dof_probe_t *probe;
9353 uint32_t *off, *enoff;
9354 uint8_t *arg;
9355 char *strtab;
9356 uint_t i, nprobes;
9357 dtrace_helper_provdesc_t dhpv;
9358 dtrace_helper_probedesc_t dhpb;
9359 dtrace_meta_t *meta = dtrace_meta_pid;
9360 dtrace_mops_t *mops = &meta->dtm_mops;
9361 void *parg;
9362
9363 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9364 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9365 provider->dofpv_strtab * dof->dofh_secsize);
9366 prb_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9367 provider->dofpv_probes * dof->dofh_secsize);
9368 arg_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9369 provider->dofpv_prargs * dof->dofh_secsize);
9370 off_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9371 provider->dofpv_proffs * dof->dofh_secsize);
9372
9373 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9374 off = (uint32_t *)(uintptr_t)(daddr + off_sec->dofs_offset);
9375 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
9376 enoff = NULL;
9377
9378 /*
9379 * See dtrace_helper_provider_validate().
9380 */
9381 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
9382 provider->dofpv_prenoffs != DOF_SECT_NONE) {
9383 enoff_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9384 provider->dofpv_prenoffs * dof->dofh_secsize);
9385 enoff = (uint32_t *)(uintptr_t)(daddr + enoff_sec->dofs_offset);
9386 }
9387
9388 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
9389
9390 /*
9391 * Create the provider.
9392 */
9393 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9394
9395 if ((parg = mops->dtms_provide_proc(meta->dtm_arg, &dhpv, p)) == NULL)
9396 return;
9397
9398 meta->dtm_count++;
9399
9400 /*
9401 * Create the probes.
9402 */
9403 for (i = 0; i < nprobes; i++) {
9404 probe = (dof_probe_t *)(uintptr_t)(daddr +
9405 prb_sec->dofs_offset + i * prb_sec->dofs_entsize);
9406
9407 dhpb.dthpb_mod = dhp->dofhp_mod;
9408 dhpb.dthpb_func = strtab + probe->dofpr_func;
9409 dhpb.dthpb_name = strtab + probe->dofpr_name;
9410 #if !defined(__APPLE__)
9411 dhpb.dthpb_base = probe->dofpr_addr;
9412 #else
9413 dhpb.dthpb_base = dhp->dofhp_addr; /* FIXME: James, why? */
9414 #endif
9415 dhpb.dthpb_offs = (int32_t *)(off + probe->dofpr_offidx);
9416 dhpb.dthpb_noffs = probe->dofpr_noffs;
9417 if (enoff != NULL) {
9418 dhpb.dthpb_enoffs = (int32_t *)(enoff + probe->dofpr_enoffidx);
9419 dhpb.dthpb_nenoffs = probe->dofpr_nenoffs;
9420 } else {
9421 dhpb.dthpb_enoffs = NULL;
9422 dhpb.dthpb_nenoffs = 0;
9423 }
9424 dhpb.dthpb_args = arg + probe->dofpr_argidx;
9425 dhpb.dthpb_nargc = probe->dofpr_nargc;
9426 dhpb.dthpb_xargc = probe->dofpr_xargc;
9427 dhpb.dthpb_ntypes = strtab + probe->dofpr_nargv;
9428 dhpb.dthpb_xtypes = strtab + probe->dofpr_xargv;
9429
9430 mops->dtms_create_probe(meta->dtm_arg, parg, &dhpb);
9431 }
9432
9433 /*
9434 * Since we just created probes, we need to match our enablings
9435 * against those, with a precondition knowing that we have only
9436 * added probes from this provider
9437 */
9438 char *prov_name = mops->dtms_provider_name(parg);
9439 ASSERT(prov_name != NULL);
9440 dtrace_match_cond_t cond = {dtrace_cond_provider_match, (void*)prov_name};
9441
9442 dtrace_enabling_matchall_with_cond(&cond);
9443 }
9444
9445 static void
dtrace_helper_provide(dof_helper_t * dhp,proc_t * p)9446 dtrace_helper_provide(dof_helper_t *dhp, proc_t *p)
9447 {
9448 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9449 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9450 uint32_t i;
9451
9452 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9453
9454 for (i = 0; i < dof->dofh_secnum; i++) {
9455 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9456 dof->dofh_secoff + i * dof->dofh_secsize);
9457
9458 if (sec->dofs_type != DOF_SECT_PROVIDER)
9459 continue;
9460
9461 dtrace_helper_provide_one(dhp, sec, p);
9462 }
9463 }
9464
9465 static void
dtrace_helper_provider_remove_one(dof_helper_t * dhp,dof_sec_t * sec,proc_t * p)9466 dtrace_helper_provider_remove_one(dof_helper_t *dhp, dof_sec_t *sec, proc_t *p)
9467 {
9468 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9469 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9470 dof_sec_t *str_sec;
9471 dof_provider_t *provider;
9472 char *strtab;
9473 dtrace_helper_provdesc_t dhpv;
9474 dtrace_meta_t *meta = dtrace_meta_pid;
9475 dtrace_mops_t *mops = &meta->dtm_mops;
9476
9477 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
9478 str_sec = (dof_sec_t *)(uintptr_t)(daddr + dof->dofh_secoff +
9479 provider->dofpv_strtab * dof->dofh_secsize);
9480
9481 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
9482
9483 /*
9484 * Create the provider.
9485 */
9486 dtrace_dofprov2hprov(&dhpv, provider, strtab);
9487
9488 mops->dtms_remove_proc(meta->dtm_arg, &dhpv, p);
9489
9490 meta->dtm_count--;
9491 }
9492
9493 static void
dtrace_helper_provider_remove(dof_helper_t * dhp,proc_t * p)9494 dtrace_helper_provider_remove(dof_helper_t *dhp, proc_t *p)
9495 {
9496 uintptr_t daddr = (uintptr_t)dhp->dofhp_dof;
9497 dof_hdr_t *dof = (dof_hdr_t *)daddr;
9498 uint32_t i;
9499
9500 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
9501
9502 for (i = 0; i < dof->dofh_secnum; i++) {
9503 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
9504 dof->dofh_secoff + i * dof->dofh_secsize);
9505
9506 if (sec->dofs_type != DOF_SECT_PROVIDER)
9507 continue;
9508
9509 dtrace_helper_provider_remove_one(dhp, sec, p);
9510 }
9511 }
9512
9513 /*
9514 * DTrace Meta Provider-to-Framework API Functions
9515 *
9516 * These functions implement the Meta Provider-to-Framework API, as described
9517 * in <sys/dtrace.h>.
9518 */
9519 int
dtrace_meta_register(const char * name,const dtrace_mops_t * mops,void * arg,dtrace_meta_provider_id_t * idp)9520 dtrace_meta_register(const char *name, const dtrace_mops_t *mops, void *arg,
9521 dtrace_meta_provider_id_t *idp)
9522 {
9523 dtrace_meta_t *meta;
9524 dtrace_helpers_t *help, *next;
9525 uint_t i;
9526
9527 *idp = DTRACE_METAPROVNONE;
9528
9529 /*
9530 * We strictly don't need the name, but we hold onto it for
9531 * debuggability. All hail error queues!
9532 */
9533 if (name == NULL) {
9534 cmn_err(CE_WARN, "failed to register meta-provider: "
9535 "invalid name");
9536 return (EINVAL);
9537 }
9538
9539 if (mops == NULL ||
9540 mops->dtms_create_probe == NULL ||
9541 mops->dtms_provide_proc == NULL ||
9542 mops->dtms_remove_proc == NULL) {
9543 cmn_err(CE_WARN, "failed to register meta-register %s: "
9544 "invalid ops", name);
9545 return (EINVAL);
9546 }
9547
9548 meta = kmem_zalloc(sizeof (dtrace_meta_t), KM_SLEEP);
9549 meta->dtm_mops = *mops;
9550 meta->dtm_arg = arg;
9551
9552 lck_mtx_lock(&dtrace_meta_lock);
9553 lck_mtx_lock(&dtrace_lock);
9554
9555 if (dtrace_meta_pid != NULL) {
9556 lck_mtx_unlock(&dtrace_lock);
9557 lck_mtx_unlock(&dtrace_meta_lock);
9558 cmn_err(CE_WARN, "failed to register meta-register %s: "
9559 "user-land meta-provider exists", name);
9560 kmem_free(meta, sizeof (dtrace_meta_t));
9561 return (EINVAL);
9562 }
9563
9564 meta->dtm_name = dtrace_strref(name);
9565
9566 dtrace_meta_pid = meta;
9567 *idp = (dtrace_meta_provider_id_t)meta;
9568
9569 /*
9570 * If there are providers and probes ready to go, pass them
9571 * off to the new meta provider now.
9572 */
9573
9574 help = dtrace_deferred_pid;
9575 dtrace_deferred_pid = NULL;
9576
9577 lck_mtx_unlock(&dtrace_lock);
9578
9579 while (help != NULL) {
9580 for (i = 0; i < help->dthps_nprovs; i++) {
9581 proc_t *p = proc_find(help->dthps_pid);
9582 if (p == PROC_NULL)
9583 continue;
9584 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
9585 p);
9586 proc_rele(p);
9587 }
9588
9589 next = help->dthps_next;
9590 help->dthps_next = NULL;
9591 help->dthps_prev = NULL;
9592 help->dthps_deferred = 0;
9593 help = next;
9594 }
9595
9596 lck_mtx_unlock(&dtrace_meta_lock);
9597
9598 return (0);
9599 }
9600
9601 int
dtrace_meta_unregister(dtrace_meta_provider_id_t id)9602 dtrace_meta_unregister(dtrace_meta_provider_id_t id)
9603 {
9604 dtrace_meta_t **pp, *old = (dtrace_meta_t *)id;
9605
9606 lck_mtx_lock(&dtrace_meta_lock);
9607 lck_mtx_lock(&dtrace_lock);
9608
9609 if (old == dtrace_meta_pid) {
9610 pp = &dtrace_meta_pid;
9611 } else {
9612 panic("attempt to unregister non-existent "
9613 "dtrace meta-provider %p\n", (void *)old);
9614 }
9615
9616 if (old->dtm_count != 0) {
9617 lck_mtx_unlock(&dtrace_lock);
9618 lck_mtx_unlock(&dtrace_meta_lock);
9619 return (EBUSY);
9620 }
9621
9622 *pp = NULL;
9623
9624 dtrace_strunref(old->dtm_name);
9625
9626 lck_mtx_unlock(&dtrace_lock);
9627 lck_mtx_unlock(&dtrace_meta_lock);
9628
9629 kmem_free(old, sizeof (dtrace_meta_t));
9630
9631 return (0);
9632 }
9633
9634
9635 /*
9636 * DTrace DIF Object Functions
9637 */
9638 static int
dtrace_difo_err(uint_t pc,const char * format,...)9639 dtrace_difo_err(uint_t pc, const char *format, ...)
9640 {
9641 if (dtrace_err_verbose) {
9642 va_list alist;
9643
9644 (void) uprintf("dtrace DIF object error: [%u]: ", pc);
9645 va_start(alist, format);
9646 (void) vuprintf(format, alist);
9647 va_end(alist);
9648 }
9649
9650 #ifdef DTRACE_ERRDEBUG
9651 dtrace_errdebug(format);
9652 #endif
9653 return (1);
9654 }
9655
9656 /*
9657 * Validate a DTrace DIF object by checking the IR instructions. The following
9658 * rules are currently enforced by dtrace_difo_validate():
9659 *
9660 * 1. Each instruction must have a valid opcode
9661 * 2. Each register, string, variable, or subroutine reference must be valid
9662 * 3. No instruction can modify register %r0 (must be zero)
9663 * 4. All instruction reserved bits must be set to zero
9664 * 5. The last instruction must be a "ret" instruction
9665 * 6. All branch targets must reference a valid instruction _after_ the branch
9666 */
9667 static int
dtrace_difo_validate(dtrace_difo_t * dp,dtrace_vstate_t * vstate,uint_t nregs,cred_t * cr)9668 dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs,
9669 cred_t *cr)
9670 {
9671 int err = 0;
9672 uint_t i;
9673
9674 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
9675 int kcheckload;
9676 uint_t pc;
9677 int maxglobal = -1, maxlocal = -1, maxtlocal = -1;
9678
9679 kcheckload = cr == NULL ||
9680 (vstate->dtvs_state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) == 0;
9681
9682 dp->dtdo_destructive = 0;
9683
9684 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
9685 dif_instr_t instr = dp->dtdo_buf[pc];
9686
9687 uint_t r1 = DIF_INSTR_R1(instr);
9688 uint_t r2 = DIF_INSTR_R2(instr);
9689 uint_t rd = DIF_INSTR_RD(instr);
9690 uint_t rs = DIF_INSTR_RS(instr);
9691 uint_t label = DIF_INSTR_LABEL(instr);
9692 uint_t v = DIF_INSTR_VAR(instr);
9693 uint_t subr = DIF_INSTR_SUBR(instr);
9694 uint_t type = DIF_INSTR_TYPE(instr);
9695 uint_t op = DIF_INSTR_OP(instr);
9696
9697 switch (op) {
9698 case DIF_OP_OR:
9699 case DIF_OP_XOR:
9700 case DIF_OP_AND:
9701 case DIF_OP_SLL:
9702 case DIF_OP_SRL:
9703 case DIF_OP_SRA:
9704 case DIF_OP_SUB:
9705 case DIF_OP_ADD:
9706 case DIF_OP_MUL:
9707 case DIF_OP_SDIV:
9708 case DIF_OP_UDIV:
9709 case DIF_OP_SREM:
9710 case DIF_OP_UREM:
9711 case DIF_OP_COPYS:
9712 if (r1 >= nregs)
9713 err += efunc(pc, "invalid register %u\n", r1);
9714 if (r2 >= nregs)
9715 err += efunc(pc, "invalid register %u\n", r2);
9716 if (rd >= nregs)
9717 err += efunc(pc, "invalid register %u\n", rd);
9718 if (rd == 0)
9719 err += efunc(pc, "cannot write to %%r0\n");
9720 break;
9721 case DIF_OP_NOT:
9722 case DIF_OP_MOV:
9723 case DIF_OP_ALLOCS:
9724 if (r1 >= nregs)
9725 err += efunc(pc, "invalid register %u\n", r1);
9726 if (r2 != 0)
9727 err += efunc(pc, "non-zero reserved bits\n");
9728 if (rd >= nregs)
9729 err += efunc(pc, "invalid register %u\n", rd);
9730 if (rd == 0)
9731 err += efunc(pc, "cannot write to %%r0\n");
9732 break;
9733 case DIF_OP_LDSB:
9734 case DIF_OP_LDSH:
9735 case DIF_OP_LDSW:
9736 case DIF_OP_LDUB:
9737 case DIF_OP_LDUH:
9738 case DIF_OP_LDUW:
9739 case DIF_OP_LDX:
9740 if (r1 >= nregs)
9741 err += efunc(pc, "invalid register %u\n", r1);
9742 if (r2 != 0)
9743 err += efunc(pc, "non-zero reserved bits\n");
9744 if (rd >= nregs)
9745 err += efunc(pc, "invalid register %u\n", rd);
9746 if (rd == 0)
9747 err += efunc(pc, "cannot write to %%r0\n");
9748 if (kcheckload)
9749 dp->dtdo_buf[pc] = DIF_INSTR_LOAD(op +
9750 DIF_OP_RLDSB - DIF_OP_LDSB, r1, rd);
9751 break;
9752 case DIF_OP_RLDSB:
9753 case DIF_OP_RLDSH:
9754 case DIF_OP_RLDSW:
9755 case DIF_OP_RLDUB:
9756 case DIF_OP_RLDUH:
9757 case DIF_OP_RLDUW:
9758 case DIF_OP_RLDX:
9759 if (r1 >= nregs)
9760 err += efunc(pc, "invalid register %u\n", r1);
9761 if (r2 != 0)
9762 err += efunc(pc, "non-zero reserved bits\n");
9763 if (rd >= nregs)
9764 err += efunc(pc, "invalid register %u\n", rd);
9765 if (rd == 0)
9766 err += efunc(pc, "cannot write to %%r0\n");
9767 break;
9768 case DIF_OP_ULDSB:
9769 case DIF_OP_ULDSH:
9770 case DIF_OP_ULDSW:
9771 case DIF_OP_ULDUB:
9772 case DIF_OP_ULDUH:
9773 case DIF_OP_ULDUW:
9774 case DIF_OP_ULDX:
9775 if (r1 >= nregs)
9776 err += efunc(pc, "invalid register %u\n", r1);
9777 if (r2 != 0)
9778 err += efunc(pc, "non-zero reserved bits\n");
9779 if (rd >= nregs)
9780 err += efunc(pc, "invalid register %u\n", rd);
9781 if (rd == 0)
9782 err += efunc(pc, "cannot write to %%r0\n");
9783 break;
9784 case DIF_OP_STB:
9785 case DIF_OP_STH:
9786 case DIF_OP_STW:
9787 case DIF_OP_STX:
9788 if (r1 >= nregs)
9789 err += efunc(pc, "invalid register %u\n", r1);
9790 if (r2 != 0)
9791 err += efunc(pc, "non-zero reserved bits\n");
9792 if (rd >= nregs)
9793 err += efunc(pc, "invalid register %u\n", rd);
9794 if (rd == 0)
9795 err += efunc(pc, "cannot write to 0 address\n");
9796 break;
9797 case DIF_OP_CMP:
9798 case DIF_OP_SCMP:
9799 if (r1 >= nregs)
9800 err += efunc(pc, "invalid register %u\n", r1);
9801 if (r2 >= nregs)
9802 err += efunc(pc, "invalid register %u\n", r2);
9803 if (rd != 0)
9804 err += efunc(pc, "non-zero reserved bits\n");
9805 break;
9806 case DIF_OP_TST:
9807 if (r1 >= nregs)
9808 err += efunc(pc, "invalid register %u\n", r1);
9809 if (r2 != 0 || rd != 0)
9810 err += efunc(pc, "non-zero reserved bits\n");
9811 break;
9812 case DIF_OP_BA:
9813 case DIF_OP_BE:
9814 case DIF_OP_BNE:
9815 case DIF_OP_BG:
9816 case DIF_OP_BGU:
9817 case DIF_OP_BGE:
9818 case DIF_OP_BGEU:
9819 case DIF_OP_BL:
9820 case DIF_OP_BLU:
9821 case DIF_OP_BLE:
9822 case DIF_OP_BLEU:
9823 if (label >= dp->dtdo_len) {
9824 err += efunc(pc, "invalid branch target %u\n",
9825 label);
9826 }
9827 if (label <= pc) {
9828 err += efunc(pc, "backward branch to %u\n",
9829 label);
9830 }
9831 break;
9832 case DIF_OP_RET:
9833 if (r1 != 0 || r2 != 0)
9834 err += efunc(pc, "non-zero reserved bits\n");
9835 if (rd >= nregs)
9836 err += efunc(pc, "invalid register %u\n", rd);
9837 break;
9838 case DIF_OP_NOP:
9839 case DIF_OP_POPTS:
9840 case DIF_OP_FLUSHTS:
9841 if (r1 != 0 || r2 != 0 || rd != 0)
9842 err += efunc(pc, "non-zero reserved bits\n");
9843 break;
9844 case DIF_OP_SETX:
9845 if (DIF_INSTR_INTEGER(instr) >= dp->dtdo_intlen) {
9846 err += efunc(pc, "invalid integer ref %u\n",
9847 DIF_INSTR_INTEGER(instr));
9848 }
9849 if (rd >= nregs)
9850 err += efunc(pc, "invalid register %u\n", rd);
9851 if (rd == 0)
9852 err += efunc(pc, "cannot write to %%r0\n");
9853 break;
9854 case DIF_OP_SETS:
9855 if (DIF_INSTR_STRING(instr) >= dp->dtdo_strlen) {
9856 err += efunc(pc, "invalid string ref %u\n",
9857 DIF_INSTR_STRING(instr));
9858 }
9859 if (rd >= nregs)
9860 err += efunc(pc, "invalid register %u\n", rd);
9861 if (rd == 0)
9862 err += efunc(pc, "cannot write to %%r0\n");
9863 break;
9864 case DIF_OP_LDGA:
9865 case DIF_OP_LDTA:
9866 if (r1 > DIF_VAR_ARRAY_MAX)
9867 err += efunc(pc, "invalid array %u\n", r1);
9868 if (r2 >= nregs)
9869 err += efunc(pc, "invalid register %u\n", r2);
9870 if (rd >= nregs)
9871 err += efunc(pc, "invalid register %u\n", rd);
9872 if (rd == 0)
9873 err += efunc(pc, "cannot write to %%r0\n");
9874 break;
9875 case DIF_OP_LDGS:
9876 case DIF_OP_LDTS:
9877 case DIF_OP_LDLS:
9878 case DIF_OP_LDGAA:
9879 case DIF_OP_LDTAA:
9880 if (v < DIF_VAR_OTHER_MIN || v > DIF_VAR_OTHER_MAX)
9881 err += efunc(pc, "invalid variable %u\n", v);
9882 if (rd >= nregs)
9883 err += efunc(pc, "invalid register %u\n", rd);
9884 if (rd == 0)
9885 err += efunc(pc, "cannot write to %%r0\n");
9886 break;
9887 case DIF_OP_STGS:
9888 case DIF_OP_STTS:
9889 case DIF_OP_STLS:
9890 case DIF_OP_STGAA:
9891 case DIF_OP_STTAA:
9892 if (v < DIF_VAR_OTHER_UBASE || v > DIF_VAR_OTHER_MAX)
9893 err += efunc(pc, "invalid variable %u\n", v);
9894 if (rs >= nregs)
9895 err += efunc(pc, "invalid register %u\n", rd);
9896 break;
9897 case DIF_OP_CALL:
9898 if (subr > DIF_SUBR_MAX &&
9899 !(subr >= DIF_SUBR_APPLE_MIN && subr <= DIF_SUBR_APPLE_MAX))
9900 err += efunc(pc, "invalid subr %u\n", subr);
9901 if (rd >= nregs)
9902 err += efunc(pc, "invalid register %u\n", rd);
9903 if (rd == 0)
9904 err += efunc(pc, "cannot write to %%r0\n");
9905
9906 switch (subr) {
9907 case DIF_SUBR_COPYOUT:
9908 case DIF_SUBR_COPYOUTSTR:
9909 case DIF_SUBR_KDEBUG_TRACE:
9910 case DIF_SUBR_KDEBUG_TRACE_STRING:
9911 case DIF_SUBR_PHYSMEM_READ:
9912 case DIF_SUBR_PHYSMEM_WRITE:
9913 case DIF_SUBR_LIVEDUMP:
9914 dp->dtdo_destructive = 1;
9915 break;
9916 default:
9917 break;
9918 }
9919 break;
9920 case DIF_OP_PUSHTR:
9921 if (type != DIF_TYPE_STRING && type != DIF_TYPE_CTF)
9922 err += efunc(pc, "invalid ref type %u\n", type);
9923 if (r2 >= nregs)
9924 err += efunc(pc, "invalid register %u\n", r2);
9925 if (rs >= nregs)
9926 err += efunc(pc, "invalid register %u\n", rs);
9927 break;
9928 case DIF_OP_PUSHTV:
9929 if (type != DIF_TYPE_CTF)
9930 err += efunc(pc, "invalid val type %u\n", type);
9931 if (r2 >= nregs)
9932 err += efunc(pc, "invalid register %u\n", r2);
9933 if (rs >= nregs)
9934 err += efunc(pc, "invalid register %u\n", rs);
9935 break;
9936 case DIF_OP_STRIP:
9937 if (r1 >= nregs)
9938 err += efunc(pc, "invalid register %u\n", r1);
9939 if (!dtrace_is_valid_ptrauth_key(r2))
9940 err += efunc(pc, "invalid key\n");
9941 if (rd >= nregs)
9942 err += efunc(pc, "invalid register %u\n", rd);
9943 if (rd == 0)
9944 err += efunc(pc, "cannot write to %%r0\n");
9945 break;
9946 default:
9947 err += efunc(pc, "invalid opcode %u\n",
9948 DIF_INSTR_OP(instr));
9949 }
9950 }
9951
9952 if (dp->dtdo_len != 0 &&
9953 DIF_INSTR_OP(dp->dtdo_buf[dp->dtdo_len - 1]) != DIF_OP_RET) {
9954 err += efunc(dp->dtdo_len - 1,
9955 "expected 'ret' as last DIF instruction\n");
9956 }
9957
9958 if (!(dp->dtdo_rtype.dtdt_flags & (DIF_TF_BYREF | DIF_TF_BYUREF))) {
9959 /*
9960 * If we're not returning by reference, the size must be either
9961 * 0 or the size of one of the base types.
9962 */
9963 switch (dp->dtdo_rtype.dtdt_size) {
9964 case 0:
9965 case sizeof (uint8_t):
9966 case sizeof (uint16_t):
9967 case sizeof (uint32_t):
9968 case sizeof (uint64_t):
9969 break;
9970
9971 default:
9972 err += efunc(dp->dtdo_len - 1, "bad return size\n");
9973 }
9974 }
9975
9976 for (i = 0; i < dp->dtdo_varlen && err == 0; i++) {
9977 dtrace_difv_t *v = &dp->dtdo_vartab[i], *existing = NULL;
9978 dtrace_diftype_t *vt, *et;
9979 uint_t id;
9980 int ndx;
9981
9982 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL &&
9983 v->dtdv_scope != DIFV_SCOPE_THREAD &&
9984 v->dtdv_scope != DIFV_SCOPE_LOCAL) {
9985 err += efunc(i, "unrecognized variable scope %d\n",
9986 v->dtdv_scope);
9987 break;
9988 }
9989
9990 if (v->dtdv_kind != DIFV_KIND_ARRAY &&
9991 v->dtdv_kind != DIFV_KIND_SCALAR) {
9992 err += efunc(i, "unrecognized variable type %d\n",
9993 v->dtdv_kind);
9994 break;
9995 }
9996
9997 if ((id = v->dtdv_id) > DIF_VARIABLE_MAX) {
9998 err += efunc(i, "%d exceeds variable id limit\n", id);
9999 break;
10000 }
10001
10002 if (id < DIF_VAR_OTHER_UBASE)
10003 continue;
10004
10005 /*
10006 * For user-defined variables, we need to check that this
10007 * definition is identical to any previous definition that we
10008 * encountered.
10009 */
10010 ndx = id - DIF_VAR_OTHER_UBASE;
10011
10012 switch (v->dtdv_scope) {
10013 case DIFV_SCOPE_GLOBAL:
10014 if (maxglobal == -1 || ndx > maxglobal)
10015 maxglobal = ndx;
10016
10017 if (ndx < vstate->dtvs_nglobals) {
10018 dtrace_statvar_t *svar;
10019
10020 if ((svar = vstate->dtvs_globals[ndx]) != NULL)
10021 existing = &svar->dtsv_var;
10022 }
10023
10024 break;
10025
10026 case DIFV_SCOPE_THREAD:
10027 if (maxtlocal == -1 || ndx > maxtlocal)
10028 maxtlocal = ndx;
10029
10030 if (ndx < vstate->dtvs_ntlocals)
10031 existing = &vstate->dtvs_tlocals[ndx];
10032 break;
10033
10034 case DIFV_SCOPE_LOCAL:
10035 if (maxlocal == -1 || ndx > maxlocal)
10036 maxlocal = ndx;
10037 if (ndx < vstate->dtvs_nlocals) {
10038 dtrace_statvar_t *svar;
10039
10040 if ((svar = vstate->dtvs_locals[ndx]) != NULL)
10041 existing = &svar->dtsv_var;
10042 }
10043
10044 break;
10045 }
10046
10047 vt = &v->dtdv_type;
10048
10049 if (vt->dtdt_flags & DIF_TF_BYREF) {
10050 if (vt->dtdt_size == 0) {
10051 err += efunc(i, "zero-sized variable\n");
10052 break;
10053 }
10054
10055 if ((v->dtdv_scope == DIFV_SCOPE_GLOBAL ||
10056 v->dtdv_scope == DIFV_SCOPE_LOCAL) &&
10057 vt->dtdt_size > dtrace_statvar_maxsize) {
10058 err += efunc(i, "oversized by-ref static\n");
10059 break;
10060 }
10061 }
10062
10063 if (existing == NULL || existing->dtdv_id == 0)
10064 continue;
10065
10066 ASSERT(existing->dtdv_id == v->dtdv_id);
10067 ASSERT(existing->dtdv_scope == v->dtdv_scope);
10068
10069 if (existing->dtdv_kind != v->dtdv_kind)
10070 err += efunc(i, "%d changed variable kind\n", id);
10071
10072 et = &existing->dtdv_type;
10073
10074 if (vt->dtdt_flags != et->dtdt_flags) {
10075 err += efunc(i, "%d changed variable type flags\n", id);
10076 break;
10077 }
10078
10079 if (vt->dtdt_size != 0 && vt->dtdt_size != et->dtdt_size) {
10080 err += efunc(i, "%d changed variable type size\n", id);
10081 break;
10082 }
10083 }
10084
10085 for (pc = 0; pc < dp->dtdo_len && err == 0; pc++) {
10086 dif_instr_t instr = dp->dtdo_buf[pc];
10087
10088 uint_t v = DIF_INSTR_VAR(instr);
10089 uint_t op = DIF_INSTR_OP(instr);
10090
10091 switch (op) {
10092 case DIF_OP_LDGS:
10093 case DIF_OP_LDGAA:
10094 case DIF_OP_STGS:
10095 case DIF_OP_STGAA:
10096 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxglobal))
10097 err += efunc(pc, "invalid variable %u\n", v);
10098 break;
10099 case DIF_OP_LDTS:
10100 case DIF_OP_LDTAA:
10101 case DIF_OP_STTS:
10102 case DIF_OP_STTAA:
10103 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxtlocal))
10104 err += efunc(pc, "invalid variable %u\n", v);
10105 break;
10106 case DIF_OP_LDLS:
10107 case DIF_OP_STLS:
10108 if (v > (uint_t)(DIF_VAR_OTHER_UBASE + maxlocal))
10109 err += efunc(pc, "invalid variable %u\n", v);
10110 break;
10111 default:
10112 break;
10113 }
10114 }
10115
10116 return (err);
10117 }
10118
10119 /*
10120 * Validate a DTrace DIF object that it is to be used as a helper. Helpers
10121 * are much more constrained than normal DIFOs. Specifically, they may
10122 * not:
10123 *
10124 * 1. Make calls to subroutines other than copyin(), copyinstr() or
10125 * miscellaneous string routines
10126 * 2. Access DTrace variables other than the args[] array, and the
10127 * curthread, pid, ppid, tid, execname, zonename, uid and gid variables.
10128 * 3. Have thread-local variables.
10129 * 4. Have dynamic variables.
10130 */
10131 static int
dtrace_difo_validate_helper(dtrace_difo_t * dp)10132 dtrace_difo_validate_helper(dtrace_difo_t *dp)
10133 {
10134 int (*efunc)(uint_t pc, const char *, ...) = dtrace_difo_err;
10135 int err = 0;
10136 uint_t pc;
10137
10138 for (pc = 0; pc < dp->dtdo_len; pc++) {
10139 dif_instr_t instr = dp->dtdo_buf[pc];
10140
10141 uint_t v = DIF_INSTR_VAR(instr);
10142 uint_t subr = DIF_INSTR_SUBR(instr);
10143 uint_t op = DIF_INSTR_OP(instr);
10144
10145 switch (op) {
10146 case DIF_OP_OR:
10147 case DIF_OP_XOR:
10148 case DIF_OP_AND:
10149 case DIF_OP_SLL:
10150 case DIF_OP_SRL:
10151 case DIF_OP_SRA:
10152 case DIF_OP_SUB:
10153 case DIF_OP_ADD:
10154 case DIF_OP_MUL:
10155 case DIF_OP_SDIV:
10156 case DIF_OP_UDIV:
10157 case DIF_OP_SREM:
10158 case DIF_OP_UREM:
10159 case DIF_OP_COPYS:
10160 case DIF_OP_NOT:
10161 case DIF_OP_MOV:
10162 case DIF_OP_RLDSB:
10163 case DIF_OP_RLDSH:
10164 case DIF_OP_RLDSW:
10165 case DIF_OP_RLDUB:
10166 case DIF_OP_RLDUH:
10167 case DIF_OP_RLDUW:
10168 case DIF_OP_RLDX:
10169 case DIF_OP_ULDSB:
10170 case DIF_OP_ULDSH:
10171 case DIF_OP_ULDSW:
10172 case DIF_OP_ULDUB:
10173 case DIF_OP_ULDUH:
10174 case DIF_OP_ULDUW:
10175 case DIF_OP_ULDX:
10176 case DIF_OP_STB:
10177 case DIF_OP_STH:
10178 case DIF_OP_STW:
10179 case DIF_OP_STX:
10180 case DIF_OP_ALLOCS:
10181 case DIF_OP_CMP:
10182 case DIF_OP_SCMP:
10183 case DIF_OP_TST:
10184 case DIF_OP_BA:
10185 case DIF_OP_BE:
10186 case DIF_OP_BNE:
10187 case DIF_OP_BG:
10188 case DIF_OP_BGU:
10189 case DIF_OP_BGE:
10190 case DIF_OP_BGEU:
10191 case DIF_OP_BL:
10192 case DIF_OP_BLU:
10193 case DIF_OP_BLE:
10194 case DIF_OP_BLEU:
10195 case DIF_OP_RET:
10196 case DIF_OP_NOP:
10197 case DIF_OP_POPTS:
10198 case DIF_OP_FLUSHTS:
10199 case DIF_OP_SETX:
10200 case DIF_OP_SETS:
10201 case DIF_OP_LDGA:
10202 case DIF_OP_LDLS:
10203 case DIF_OP_STGS:
10204 case DIF_OP_STLS:
10205 case DIF_OP_PUSHTR:
10206 case DIF_OP_PUSHTV:
10207 break;
10208
10209 case DIF_OP_LDGS:
10210 if (v >= DIF_VAR_OTHER_UBASE)
10211 break;
10212
10213 if (v >= DIF_VAR_ARG0 && v <= DIF_VAR_ARG9)
10214 break;
10215
10216 if (v == DIF_VAR_CURTHREAD || v == DIF_VAR_PID ||
10217 v == DIF_VAR_PPID || v == DIF_VAR_TID ||
10218 v == DIF_VAR_EXECNAME || v == DIF_VAR_ZONENAME ||
10219 v == DIF_VAR_UID || v == DIF_VAR_GID)
10220 break;
10221
10222 err += efunc(pc, "illegal variable %u\n", v);
10223 break;
10224
10225 case DIF_OP_LDTA:
10226 case DIF_OP_LDTS:
10227 case DIF_OP_LDGAA:
10228 case DIF_OP_LDTAA:
10229 err += efunc(pc, "illegal dynamic variable load\n");
10230 break;
10231
10232 case DIF_OP_STTS:
10233 case DIF_OP_STGAA:
10234 case DIF_OP_STTAA:
10235 err += efunc(pc, "illegal dynamic variable store\n");
10236 break;
10237
10238 case DIF_OP_CALL:
10239 switch (subr) {
10240 case DIF_SUBR_ALLOCA:
10241 case DIF_SUBR_BCOPY:
10242 case DIF_SUBR_COPYIN:
10243 case DIF_SUBR_COPYINTO:
10244 case DIF_SUBR_COPYINSTR:
10245 case DIF_SUBR_HTONS:
10246 case DIF_SUBR_HTONL:
10247 case DIF_SUBR_HTONLL:
10248 case DIF_SUBR_INDEX:
10249 case DIF_SUBR_INET_NTOA:
10250 case DIF_SUBR_INET_NTOA6:
10251 case DIF_SUBR_INET_NTOP:
10252 case DIF_SUBR_JSON:
10253 case DIF_SUBR_LLTOSTR:
10254 case DIF_SUBR_NTOHS:
10255 case DIF_SUBR_NTOHL:
10256 case DIF_SUBR_NTOHLL:
10257 case DIF_SUBR_RINDEX:
10258 case DIF_SUBR_STRCHR:
10259 case DIF_SUBR_STRTOLL:
10260 case DIF_SUBR_STRJOIN:
10261 case DIF_SUBR_STRRCHR:
10262 case DIF_SUBR_STRSTR:
10263 break;
10264 default:
10265 err += efunc(pc, "invalid subr %u\n", subr);
10266 }
10267 break;
10268
10269 default:
10270 err += efunc(pc, "invalid opcode %u\n",
10271 DIF_INSTR_OP(instr));
10272 }
10273 }
10274
10275 return (err);
10276 }
10277
10278 /*
10279 * Returns 1 if the expression in the DIF object can be cached on a per-thread
10280 * basis; 0 if not.
10281 */
10282 static int
dtrace_difo_cacheable(dtrace_difo_t * dp)10283 dtrace_difo_cacheable(dtrace_difo_t *dp)
10284 {
10285 uint_t i;
10286
10287 if (dp == NULL)
10288 return (0);
10289
10290 for (i = 0; i < dp->dtdo_varlen; i++) {
10291 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10292
10293 if (v->dtdv_scope != DIFV_SCOPE_GLOBAL)
10294 continue;
10295
10296 switch (v->dtdv_id) {
10297 case DIF_VAR_CURTHREAD:
10298 case DIF_VAR_PID:
10299 case DIF_VAR_TID:
10300 case DIF_VAR_EXECNAME:
10301 case DIF_VAR_ZONENAME:
10302 break;
10303
10304 default:
10305 return (0);
10306 }
10307 }
10308
10309 /*
10310 * This DIF object may be cacheable. Now we need to look for any
10311 * array loading instructions, any memory loading instructions, or
10312 * any stores to thread-local variables.
10313 */
10314 for (i = 0; i < dp->dtdo_len; i++) {
10315 uint_t op = DIF_INSTR_OP(dp->dtdo_buf[i]);
10316
10317 if ((op >= DIF_OP_LDSB && op <= DIF_OP_LDX) ||
10318 (op >= DIF_OP_ULDSB && op <= DIF_OP_ULDX) ||
10319 (op >= DIF_OP_RLDSB && op <= DIF_OP_RLDX) ||
10320 op == DIF_OP_LDGA || op == DIF_OP_STTS)
10321 return (0);
10322 }
10323
10324 return (1);
10325 }
10326
10327 static void
dtrace_difo_hold(dtrace_difo_t * dp)10328 dtrace_difo_hold(dtrace_difo_t *dp)
10329 {
10330 uint_t i;
10331
10332 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10333
10334 dp->dtdo_refcnt++;
10335 ASSERT(dp->dtdo_refcnt != 0);
10336
10337 /*
10338 * We need to check this DIF object for references to the variable
10339 * DIF_VAR_VTIMESTAMP.
10340 */
10341 for (i = 0; i < dp->dtdo_varlen; i++) {
10342 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10343
10344 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10345 continue;
10346
10347 if (dtrace_vtime_references++ == 0)
10348 dtrace_vtime_enable();
10349 }
10350 }
10351
10352 /*
10353 * This routine calculates the dynamic variable chunksize for a given DIF
10354 * object. The calculation is not fool-proof, and can probably be tricked by
10355 * malicious DIF -- but it works for all compiler-generated DIF. Because this
10356 * calculation is likely imperfect, dtrace_dynvar() is able to gracefully fail
10357 * if a dynamic variable size exceeds the chunksize.
10358 */
10359 static void
dtrace_difo_chunksize(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10360 dtrace_difo_chunksize(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10361 {
10362 uint64_t sval = 0;
10363 dtrace_key_t tupregs[DIF_DTR_NREGS + 2]; /* +2 for thread and id */
10364 const dif_instr_t *text = dp->dtdo_buf;
10365 uint_t pc, srd = 0;
10366 uint_t ttop = 0;
10367 size_t size, ksize;
10368 uint_t id, i;
10369
10370 for (pc = 0; pc < dp->dtdo_len; pc++) {
10371 dif_instr_t instr = text[pc];
10372 uint_t op = DIF_INSTR_OP(instr);
10373 uint_t rd = DIF_INSTR_RD(instr);
10374 uint_t r1 = DIF_INSTR_R1(instr);
10375 uint_t nkeys = 0;
10376 uchar_t scope;
10377
10378 dtrace_key_t *key = tupregs;
10379
10380 switch (op) {
10381 case DIF_OP_SETX:
10382 sval = dp->dtdo_inttab[DIF_INSTR_INTEGER(instr)];
10383 srd = rd;
10384 continue;
10385
10386 case DIF_OP_STTS:
10387 key = &tupregs[DIF_DTR_NREGS];
10388 key[0].dttk_size = 0;
10389 key[1].dttk_size = 0;
10390 nkeys = 2;
10391 scope = DIFV_SCOPE_THREAD;
10392 break;
10393
10394 case DIF_OP_STGAA:
10395 case DIF_OP_STTAA:
10396 nkeys = ttop;
10397
10398 if (DIF_INSTR_OP(instr) == DIF_OP_STTAA)
10399 key[nkeys++].dttk_size = 0;
10400
10401 key[nkeys++].dttk_size = 0;
10402
10403 if (op == DIF_OP_STTAA) {
10404 scope = DIFV_SCOPE_THREAD;
10405 } else {
10406 scope = DIFV_SCOPE_GLOBAL;
10407 }
10408
10409 break;
10410
10411 case DIF_OP_PUSHTR:
10412 if (ttop == DIF_DTR_NREGS)
10413 return;
10414
10415 if ((srd == 0 || sval == 0) && r1 == DIF_TYPE_STRING) {
10416 /*
10417 * If the register for the size of the "pushtr"
10418 * is %r0 (or the value is 0) and the type is
10419 * a string, we'll use the system-wide default
10420 * string size.
10421 */
10422 tupregs[ttop++].dttk_size =
10423 dtrace_strsize_default;
10424 } else {
10425 if (srd == 0)
10426 return;
10427
10428 if (sval > LONG_MAX)
10429 return;
10430
10431 tupregs[ttop++].dttk_size = sval;
10432 }
10433
10434 break;
10435
10436 case DIF_OP_PUSHTV:
10437 if (ttop == DIF_DTR_NREGS)
10438 return;
10439
10440 tupregs[ttop++].dttk_size = 0;
10441 break;
10442
10443 case DIF_OP_FLUSHTS:
10444 ttop = 0;
10445 break;
10446
10447 case DIF_OP_POPTS:
10448 if (ttop != 0)
10449 ttop--;
10450 break;
10451 }
10452
10453 sval = 0;
10454 srd = 0;
10455
10456 if (nkeys == 0)
10457 continue;
10458
10459 /*
10460 * We have a dynamic variable allocation; calculate its size.
10461 */
10462 for (ksize = 0, i = 0; i < nkeys; i++)
10463 ksize += P2ROUNDUP(key[i].dttk_size, sizeof (uint64_t));
10464
10465 size = sizeof (dtrace_dynvar_t);
10466 size += sizeof (dtrace_key_t) * (nkeys - 1);
10467 size += ksize;
10468
10469 /*
10470 * Now we need to determine the size of the stored data.
10471 */
10472 id = DIF_INSTR_VAR(instr);
10473
10474 for (i = 0; i < dp->dtdo_varlen; i++) {
10475 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10476
10477 if (v->dtdv_id == id && v->dtdv_scope == scope) {
10478 size += v->dtdv_type.dtdt_size;
10479 break;
10480 }
10481 }
10482
10483 if (i == dp->dtdo_varlen)
10484 return;
10485
10486 /*
10487 * We have the size. If this is larger than the chunk size
10488 * for our dynamic variable state, reset the chunk size.
10489 */
10490 size = P2ROUNDUP(size, sizeof (uint64_t));
10491
10492 /*
10493 * Before setting the chunk size, check that we're not going
10494 * to set it to a negative value...
10495 */
10496 if (size > LONG_MAX)
10497 return;
10498
10499 /*
10500 * ...and make certain that we didn't badly overflow.
10501 */
10502 if (size < ksize || size < sizeof (dtrace_dynvar_t))
10503 return;
10504
10505 if (size > vstate->dtvs_dynvars.dtds_chunksize)
10506 vstate->dtvs_dynvars.dtds_chunksize = size;
10507 }
10508 }
10509
10510 static void
dtrace_difo_init(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10511 dtrace_difo_init(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10512 {
10513 int oldsvars, osz, nsz, otlocals, ntlocals;
10514 uint_t i, id;
10515
10516 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10517 ASSERT(dp->dtdo_buf != NULL && dp->dtdo_len != 0);
10518
10519 for (i = 0; i < dp->dtdo_varlen; i++) {
10520 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10521 dtrace_statvar_t *svar;
10522 dtrace_statvar_t ***svarp = NULL;
10523 size_t dsize = 0;
10524 uint8_t scope = v->dtdv_scope;
10525 int *np = (int *)NULL;
10526
10527 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10528 continue;
10529
10530 id -= DIF_VAR_OTHER_UBASE;
10531
10532 switch (scope) {
10533 case DIFV_SCOPE_THREAD:
10534 while (id >= (uint_t)(otlocals = vstate->dtvs_ntlocals)) {
10535 dtrace_difv_t *tlocals;
10536
10537 if ((ntlocals = (otlocals << 1)) == 0)
10538 ntlocals = 1;
10539
10540 osz = otlocals * sizeof (dtrace_difv_t);
10541 nsz = ntlocals * sizeof (dtrace_difv_t);
10542
10543 tlocals = kmem_zalloc(nsz, KM_SLEEP);
10544
10545 if (osz != 0) {
10546 bcopy(vstate->dtvs_tlocals,
10547 tlocals, osz);
10548 kmem_free(vstate->dtvs_tlocals, osz);
10549 }
10550
10551 vstate->dtvs_tlocals = tlocals;
10552 vstate->dtvs_ntlocals = ntlocals;
10553 }
10554
10555 vstate->dtvs_tlocals[id] = *v;
10556 continue;
10557
10558 case DIFV_SCOPE_LOCAL:
10559 np = &vstate->dtvs_nlocals;
10560 svarp = &vstate->dtvs_locals;
10561
10562 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10563 dsize = (int)NCPU * (v->dtdv_type.dtdt_size +
10564 sizeof (uint64_t));
10565 else
10566 dsize = (int)NCPU * sizeof (uint64_t);
10567
10568 break;
10569
10570 case DIFV_SCOPE_GLOBAL:
10571 np = &vstate->dtvs_nglobals;
10572 svarp = &vstate->dtvs_globals;
10573
10574 if (v->dtdv_type.dtdt_flags & DIF_TF_BYREF)
10575 dsize = v->dtdv_type.dtdt_size +
10576 sizeof (uint64_t);
10577
10578 break;
10579
10580 default:
10581 ASSERT(0);
10582 }
10583
10584 while (id >= (uint_t)(oldsvars = *np)) {
10585 dtrace_statvar_t **statics;
10586 int newsvars, oldsize, newsize;
10587
10588 if ((newsvars = (oldsvars << 1)) == 0)
10589 newsvars = 1;
10590
10591 oldsize = oldsvars * sizeof (dtrace_statvar_t *);
10592 newsize = newsvars * sizeof (dtrace_statvar_t *);
10593
10594 statics = kmem_zalloc(newsize, KM_SLEEP);
10595
10596 if (oldsize != 0) {
10597 bcopy(*svarp, statics, oldsize);
10598 kmem_free(*svarp, oldsize);
10599 }
10600
10601 *svarp = statics;
10602 *np = newsvars;
10603 }
10604
10605 if ((svar = (*svarp)[id]) == NULL) {
10606 svar = kmem_zalloc(sizeof (dtrace_statvar_t), KM_SLEEP);
10607 svar->dtsv_var = *v;
10608
10609 if ((svar->dtsv_size = dsize) != 0) {
10610 svar->dtsv_data = (uint64_t)(uintptr_t)
10611 kmem_zalloc(dsize, KM_SLEEP);
10612 }
10613
10614 (*svarp)[id] = svar;
10615 }
10616
10617 svar->dtsv_refcnt++;
10618 }
10619
10620 dtrace_difo_chunksize(dp, vstate);
10621 dtrace_difo_hold(dp);
10622 }
10623
10624 static dtrace_difo_t *
dtrace_difo_duplicate(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10625 dtrace_difo_duplicate(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10626 {
10627 dtrace_difo_t *new;
10628 size_t sz;
10629
10630 ASSERT(dp->dtdo_buf != NULL);
10631 ASSERT(dp->dtdo_refcnt != 0);
10632
10633 new = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
10634
10635 ASSERT(dp->dtdo_buf != NULL);
10636 sz = dp->dtdo_len * sizeof (dif_instr_t);
10637 new->dtdo_buf = kmem_alloc(sz, KM_SLEEP);
10638 bcopy(dp->dtdo_buf, new->dtdo_buf, sz);
10639 new->dtdo_len = dp->dtdo_len;
10640
10641 if (dp->dtdo_strtab != NULL) {
10642 ASSERT(dp->dtdo_strlen != 0);
10643 new->dtdo_strtab = kmem_alloc(dp->dtdo_strlen, KM_SLEEP);
10644 bcopy(dp->dtdo_strtab, new->dtdo_strtab, dp->dtdo_strlen);
10645 new->dtdo_strlen = dp->dtdo_strlen;
10646 }
10647
10648 if (dp->dtdo_inttab != NULL) {
10649 ASSERT(dp->dtdo_intlen != 0);
10650 sz = dp->dtdo_intlen * sizeof (uint64_t);
10651 new->dtdo_inttab = kmem_alloc(sz, KM_SLEEP);
10652 bcopy(dp->dtdo_inttab, new->dtdo_inttab, sz);
10653 new->dtdo_intlen = dp->dtdo_intlen;
10654 }
10655
10656 if (dp->dtdo_vartab != NULL) {
10657 ASSERT(dp->dtdo_varlen != 0);
10658 sz = dp->dtdo_varlen * sizeof (dtrace_difv_t);
10659 new->dtdo_vartab = kmem_alloc(sz, KM_SLEEP);
10660 bcopy(dp->dtdo_vartab, new->dtdo_vartab, sz);
10661 new->dtdo_varlen = dp->dtdo_varlen;
10662 }
10663
10664 dtrace_difo_init(new, vstate);
10665 return (new);
10666 }
10667
10668 static void
dtrace_difo_destroy(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10669 dtrace_difo_destroy(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10670 {
10671 uint_t i;
10672
10673 ASSERT(dp->dtdo_refcnt == 0);
10674
10675 for (i = 0; i < dp->dtdo_varlen; i++) {
10676 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10677 dtrace_statvar_t *svar;
10678 dtrace_statvar_t **svarp = NULL;
10679 uint_t id;
10680 uint8_t scope = v->dtdv_scope;
10681 int *np = NULL;
10682
10683 switch (scope) {
10684 case DIFV_SCOPE_THREAD:
10685 continue;
10686
10687 case DIFV_SCOPE_LOCAL:
10688 np = &vstate->dtvs_nlocals;
10689 svarp = vstate->dtvs_locals;
10690 break;
10691
10692 case DIFV_SCOPE_GLOBAL:
10693 np = &vstate->dtvs_nglobals;
10694 svarp = vstate->dtvs_globals;
10695 break;
10696
10697 default:
10698 ASSERT(0);
10699 }
10700
10701 if ((id = v->dtdv_id) < DIF_VAR_OTHER_UBASE)
10702 continue;
10703
10704 id -= DIF_VAR_OTHER_UBASE;
10705
10706 ASSERT(id < (uint_t)*np);
10707
10708 svar = svarp[id];
10709 ASSERT(svar != NULL);
10710 ASSERT(svar->dtsv_refcnt > 0);
10711
10712 if (--svar->dtsv_refcnt > 0)
10713 continue;
10714
10715 if (svar->dtsv_size != 0) {
10716 ASSERT(svar->dtsv_data != 0);
10717 kmem_free((void *)(uintptr_t)svar->dtsv_data,
10718 svar->dtsv_size);
10719 }
10720
10721 kmem_free(svar, sizeof (dtrace_statvar_t));
10722 svarp[id] = NULL;
10723 }
10724
10725 kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
10726 kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
10727 kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
10728 kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));
10729
10730 kmem_free(dp, sizeof (dtrace_difo_t));
10731 }
10732
10733 static void
dtrace_difo_release(dtrace_difo_t * dp,dtrace_vstate_t * vstate)10734 dtrace_difo_release(dtrace_difo_t *dp, dtrace_vstate_t *vstate)
10735 {
10736 uint_t i;
10737
10738 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10739 ASSERT(dp->dtdo_refcnt != 0);
10740
10741 for (i = 0; i < dp->dtdo_varlen; i++) {
10742 dtrace_difv_t *v = &dp->dtdo_vartab[i];
10743
10744 if (v->dtdv_id != DIF_VAR_VTIMESTAMP)
10745 continue;
10746
10747 ASSERT(dtrace_vtime_references > 0);
10748 if (--dtrace_vtime_references == 0)
10749 dtrace_vtime_disable();
10750 }
10751
10752 if (--dp->dtdo_refcnt == 0)
10753 dtrace_difo_destroy(dp, vstate);
10754 }
10755
10756 /*
10757 * DTrace Format Functions
10758 */
10759
10760 static dtrace_format_t*
dtrace_format_new(char * str)10761 dtrace_format_new(char *str)
10762 {
10763 dtrace_format_t *fmt = NULL;
10764 size_t bufsize = strlen(str) + 1;
10765
10766 fmt = kmem_zalloc(sizeof(*fmt) + bufsize, KM_SLEEP);
10767
10768 fmt->dtf_refcount = 1;
10769 (void) strlcpy(fmt->dtf_str, str, bufsize);
10770
10771 return fmt;
10772 }
10773
10774 static uint16_t
dtrace_format_add(dtrace_state_t * state,char * str)10775 dtrace_format_add(dtrace_state_t *state, char *str)
10776 {
10777 dtrace_format_t **new;
10778 uint16_t ndx;
10779
10780 for (ndx = 0; ndx < state->dts_nformats; ndx++) {
10781 if (state->dts_formats[ndx] == NULL) {
10782 state->dts_formats[ndx] = dtrace_format_new(str);
10783 return (ndx + 1);
10784 }
10785 else if (strcmp(state->dts_formats[ndx]->dtf_str, str) == 0) {
10786 VERIFY(state->dts_formats[ndx]->dtf_refcount < UINT64_MAX);
10787 state->dts_formats[ndx]->dtf_refcount++;
10788 return (ndx + 1);
10789 }
10790 }
10791
10792 if (state->dts_nformats == USHRT_MAX) {
10793 /*
10794 * This is only likely if a denial-of-service attack is being
10795 * attempted. As such, it's okay to fail silently here.
10796 */
10797 return (0);
10798 }
10799
10800 /*
10801 * For simplicity, we always resize the formats array to be exactly the
10802 * number of formats.
10803 */
10804 ndx = state->dts_nformats++;
10805 new = kmem_alloc((ndx + 1) * sizeof (*state->dts_formats), KM_SLEEP);
10806
10807 if (state->dts_formats != NULL) {
10808 ASSERT(ndx != 0);
10809 bcopy(state->dts_formats, new, ndx * sizeof (*state->dts_formats));
10810 kmem_free(state->dts_formats, ndx * sizeof (*state->dts_formats));
10811 }
10812
10813 state->dts_formats = new;
10814 state->dts_formats[ndx] = dtrace_format_new(str);
10815
10816 return (ndx + 1);
10817 }
10818
10819 static void
dtrace_format_remove(dtrace_state_t * state,uint16_t format)10820 dtrace_format_remove(dtrace_state_t *state, uint16_t format)
10821 {
10822 dtrace_format_t *fmt;
10823
10824 ASSERT(state->dts_formats != NULL);
10825 ASSERT(format <= state->dts_nformats);
10826
10827 fmt = state->dts_formats[format - 1];
10828
10829 ASSERT(fmt != NULL);
10830 VERIFY(fmt->dtf_refcount > 0);
10831
10832 fmt->dtf_refcount--;
10833
10834 if (fmt->dtf_refcount == 0) {
10835 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10836 state->dts_formats[format - 1] = NULL;
10837 }
10838 }
10839
10840 static void
dtrace_format_destroy(dtrace_state_t * state)10841 dtrace_format_destroy(dtrace_state_t *state)
10842 {
10843 int i;
10844
10845 if (state->dts_nformats == 0) {
10846 ASSERT(state->dts_formats == NULL);
10847 return;
10848 }
10849
10850 ASSERT(state->dts_formats != NULL);
10851
10852 for (i = 0; i < state->dts_nformats; i++) {
10853 dtrace_format_t *fmt = state->dts_formats[i];
10854
10855 if (fmt == NULL)
10856 continue;
10857
10858 kmem_free(fmt, DTRACE_FORMAT_SIZE(fmt));
10859 }
10860
10861 kmem_free(state->dts_formats, state->dts_nformats * sizeof (*state->dts_formats));
10862 state->dts_nformats = 0;
10863 state->dts_formats = NULL;
10864 }
10865
10866 /*
10867 * DTrace Predicate Functions
10868 */
10869 static dtrace_predicate_t *
dtrace_predicate_create(dtrace_difo_t * dp)10870 dtrace_predicate_create(dtrace_difo_t *dp)
10871 {
10872 dtrace_predicate_t *pred;
10873
10874 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10875 ASSERT(dp->dtdo_refcnt != 0);
10876
10877 pred = kmem_zalloc(sizeof (dtrace_predicate_t), KM_SLEEP);
10878 pred->dtp_difo = dp;
10879 pred->dtp_refcnt = 1;
10880
10881 if (!dtrace_difo_cacheable(dp))
10882 return (pred);
10883
10884 if (dtrace_predcache_id == DTRACE_CACHEIDNONE) {
10885 /*
10886 * This is only theoretically possible -- we have had 2^32
10887 * cacheable predicates on this machine. We cannot allow any
10888 * more predicates to become cacheable: as unlikely as it is,
10889 * there may be a thread caching a (now stale) predicate cache
10890 * ID. (N.B.: the temptation is being successfully resisted to
10891 * have this cmn_err() "Holy shit -- we executed this code!")
10892 */
10893 return (pred);
10894 }
10895
10896 pred->dtp_cacheid = dtrace_predcache_id++;
10897
10898 return (pred);
10899 }
10900
10901 static void
dtrace_predicate_hold(dtrace_predicate_t * pred)10902 dtrace_predicate_hold(dtrace_predicate_t *pred)
10903 {
10904 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10905 ASSERT(pred->dtp_difo != NULL && pred->dtp_difo->dtdo_refcnt != 0);
10906 ASSERT(pred->dtp_refcnt > 0);
10907
10908 pred->dtp_refcnt++;
10909 }
10910
10911 static void
dtrace_predicate_release(dtrace_predicate_t * pred,dtrace_vstate_t * vstate)10912 dtrace_predicate_release(dtrace_predicate_t *pred, dtrace_vstate_t *vstate)
10913 {
10914 dtrace_difo_t *dp = pred->dtp_difo;
10915 #pragma unused(dp) /* __APPLE__ */
10916
10917 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10918 ASSERT(dp != NULL && dp->dtdo_refcnt != 0);
10919 ASSERT(pred->dtp_refcnt > 0);
10920
10921 if (--pred->dtp_refcnt == 0) {
10922 dtrace_difo_release(pred->dtp_difo, vstate);
10923 kmem_free(pred, sizeof (dtrace_predicate_t));
10924 }
10925 }
10926
10927 /*
10928 * DTrace Action Description Functions
10929 */
10930 static dtrace_actdesc_t *
dtrace_actdesc_create(dtrace_actkind_t kind,uint32_t ntuple,uint64_t uarg,uint64_t arg)10931 dtrace_actdesc_create(dtrace_actkind_t kind, uint32_t ntuple,
10932 uint64_t uarg, uint64_t arg)
10933 {
10934 dtrace_actdesc_t *act;
10935
10936 ASSERT(!DTRACEACT_ISPRINTFLIKE(kind) || (arg != 0 &&
10937 arg >= KERNELBASE) || (arg == 0 && kind == DTRACEACT_PRINTA));
10938
10939 act = kmem_zalloc(sizeof (dtrace_actdesc_t), KM_SLEEP);
10940 act->dtad_kind = kind;
10941 act->dtad_ntuple = ntuple;
10942 act->dtad_uarg = uarg;
10943 act->dtad_arg = arg;
10944 act->dtad_refcnt = 1;
10945
10946 return (act);
10947 }
10948
10949 static void
dtrace_actdesc_hold(dtrace_actdesc_t * act)10950 dtrace_actdesc_hold(dtrace_actdesc_t *act)
10951 {
10952 ASSERT(act->dtad_refcnt >= 1);
10953 act->dtad_refcnt++;
10954 }
10955
10956 static void
dtrace_actdesc_release(dtrace_actdesc_t * act,dtrace_vstate_t * vstate)10957 dtrace_actdesc_release(dtrace_actdesc_t *act, dtrace_vstate_t *vstate)
10958 {
10959 dtrace_actkind_t kind = act->dtad_kind;
10960 dtrace_difo_t *dp;
10961
10962 ASSERT(act->dtad_refcnt >= 1);
10963
10964 if (--act->dtad_refcnt != 0)
10965 return;
10966
10967 if ((dp = act->dtad_difo) != NULL)
10968 dtrace_difo_release(dp, vstate);
10969
10970 if (DTRACEACT_ISPRINTFLIKE(kind)) {
10971 char *str = (char *)(uintptr_t)act->dtad_arg;
10972
10973 ASSERT((str != NULL && (uintptr_t)str >= KERNELBASE) ||
10974 (str == NULL && act->dtad_kind == DTRACEACT_PRINTA));
10975
10976 if (str != NULL)
10977 kmem_free(str, strlen(str) + 1);
10978 }
10979
10980 kmem_free(act, sizeof (dtrace_actdesc_t));
10981 }
10982
10983 /*
10984 * DTrace ECB Functions
10985 */
10986 static dtrace_ecb_t *
dtrace_ecb_add(dtrace_state_t * state,dtrace_probe_t * probe)10987 dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe)
10988 {
10989 dtrace_ecb_t *ecb;
10990 dtrace_epid_t epid;
10991
10992 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
10993
10994 ecb = kmem_zalloc(sizeof (dtrace_ecb_t), KM_SLEEP);
10995 ecb->dte_predicate = NULL;
10996 ecb->dte_probe = probe;
10997
10998 /*
10999 * The default size is the size of the default action: recording
11000 * the header.
11001 */
11002 ecb->dte_size = ecb->dte_needed = sizeof (dtrace_rechdr_t);
11003 ecb->dte_alignment = sizeof (dtrace_epid_t);
11004
11005 epid = state->dts_epid++;
11006
11007 if (epid - 1 >= (dtrace_epid_t)state->dts_necbs) {
11008 dtrace_ecb_t **oecbs = state->dts_ecbs, **ecbs;
11009 int necbs = state->dts_necbs << 1;
11010
11011 ASSERT(epid == (dtrace_epid_t)state->dts_necbs + 1);
11012
11013 if (necbs == 0) {
11014 ASSERT(oecbs == NULL);
11015 necbs = 1;
11016 }
11017
11018 ecbs = kmem_zalloc(necbs * sizeof (*ecbs), KM_SLEEP);
11019
11020 if (oecbs != NULL)
11021 bcopy(oecbs, ecbs, state->dts_necbs * sizeof (*ecbs));
11022
11023 dtrace_membar_producer();
11024 state->dts_ecbs = ecbs;
11025
11026 if (oecbs != NULL) {
11027 /*
11028 * If this state is active, we must dtrace_sync()
11029 * before we can free the old dts_ecbs array: we're
11030 * coming in hot, and there may be active ring
11031 * buffer processing (which indexes into the dts_ecbs
11032 * array) on another CPU.
11033 */
11034 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
11035 dtrace_sync();
11036
11037 kmem_free(oecbs, state->dts_necbs * sizeof (*ecbs));
11038 }
11039
11040 dtrace_membar_producer();
11041 state->dts_necbs = necbs;
11042 }
11043
11044 ecb->dte_state = state;
11045
11046 ASSERT(state->dts_ecbs[epid - 1] == NULL);
11047 dtrace_membar_producer();
11048 state->dts_ecbs[(ecb->dte_epid = epid) - 1] = ecb;
11049
11050 return (ecb);
11051 }
11052
11053 static int
dtrace_ecb_enable(dtrace_ecb_t * ecb)11054 dtrace_ecb_enable(dtrace_ecb_t *ecb)
11055 {
11056 dtrace_probe_t *probe = ecb->dte_probe;
11057
11058 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
11059 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11060 ASSERT(ecb->dte_next == NULL);
11061
11062 if (probe == NULL) {
11063 /*
11064 * This is the NULL probe -- there's nothing to do.
11065 */
11066 return(0);
11067 }
11068
11069 probe->dtpr_provider->dtpv_ecb_count++;
11070 if (probe->dtpr_ecb == NULL) {
11071 dtrace_provider_t *prov = probe->dtpr_provider;
11072
11073 /*
11074 * We're the first ECB on this probe.
11075 */
11076 probe->dtpr_ecb = probe->dtpr_ecb_last = ecb;
11077
11078 if (ecb->dte_predicate != NULL)
11079 probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid;
11080
11081 return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg,
11082 probe->dtpr_id, probe->dtpr_arg));
11083 } else {
11084 /*
11085 * This probe is already active. Swing the last pointer to
11086 * point to the new ECB, and issue a dtrace_sync() to assure
11087 * that all CPUs have seen the change.
11088 */
11089 ASSERT(probe->dtpr_ecb_last != NULL);
11090 probe->dtpr_ecb_last->dte_next = ecb;
11091 probe->dtpr_ecb_last = ecb;
11092 probe->dtpr_predcache = 0;
11093
11094 dtrace_sync();
11095 return(0);
11096 }
11097 }
11098
11099 static int
dtrace_ecb_resize(dtrace_ecb_t * ecb)11100 dtrace_ecb_resize(dtrace_ecb_t *ecb)
11101 {
11102 dtrace_action_t *act;
11103 uint32_t curneeded = UINT32_MAX;
11104 uint32_t aggbase = UINT32_MAX;
11105
11106 /*
11107 * If we record anything, we always record the dtrace_rechdr_t. (And
11108 * we always record it first.)
11109 */
11110 ecb->dte_size = sizeof (dtrace_rechdr_t);
11111 ecb->dte_alignment = sizeof (dtrace_epid_t);
11112
11113 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11114 dtrace_recdesc_t *rec = &act->dta_rec;
11115 ASSERT(rec->dtrd_size > 0 || rec->dtrd_alignment == 1);
11116
11117 ecb->dte_alignment = MAX(ecb->dte_alignment, rec->dtrd_alignment);
11118
11119 if (DTRACEACT_ISAGG(act->dta_kind)) {
11120 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11121
11122 ASSERT(rec->dtrd_size != 0);
11123 ASSERT(agg->dtag_first != NULL);
11124 ASSERT(act->dta_prev->dta_intuple);
11125 ASSERT(aggbase != UINT32_MAX);
11126 ASSERT(curneeded != UINT32_MAX);
11127
11128 agg->dtag_base = aggbase;
11129 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11130 rec->dtrd_offset = curneeded;
11131 if (curneeded + rec->dtrd_size < curneeded)
11132 return (EINVAL);
11133 curneeded += rec->dtrd_size;
11134 ecb->dte_needed = MAX(ecb->dte_needed, curneeded);
11135
11136 aggbase = UINT32_MAX;
11137 curneeded = UINT32_MAX;
11138 } else if (act->dta_intuple) {
11139 if (curneeded == UINT32_MAX) {
11140 /*
11141 * This is the first record in a tuple. Align
11142 * curneeded to be at offset 4 in an 8-byte
11143 * aligned block.
11144 */
11145 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11146 ASSERT(aggbase == UINT32_MAX);
11147
11148 curneeded = P2PHASEUP(ecb->dte_size,
11149 sizeof (uint64_t), sizeof (dtrace_aggid_t));
11150
11151 aggbase = curneeded - sizeof (dtrace_aggid_t);
11152 ASSERT(IS_P2ALIGNED(aggbase,
11153 sizeof (uint64_t)));
11154 }
11155
11156 curneeded = P2ROUNDUP(curneeded, rec->dtrd_alignment);
11157 rec->dtrd_offset = curneeded;
11158 curneeded += rec->dtrd_size;
11159 if (curneeded + rec->dtrd_size < curneeded)
11160 return (EINVAL);
11161 } else {
11162 /* tuples must be followed by an aggregation */
11163 ASSERT(act->dta_prev == NULL || !act->dta_prev->dta_intuple);
11164 ecb->dte_size = P2ROUNDUP(ecb->dte_size, rec->dtrd_alignment);
11165 rec->dtrd_offset = ecb->dte_size;
11166 if (ecb->dte_size + rec->dtrd_size < ecb->dte_size)
11167 return (EINVAL);
11168 ecb->dte_size += rec->dtrd_size;
11169 ecb->dte_needed = MAX(ecb->dte_needed, ecb->dte_size);
11170 }
11171 }
11172
11173 if ((act = ecb->dte_action) != NULL &&
11174 !(act->dta_kind == DTRACEACT_SPECULATE && act->dta_next == NULL) &&
11175 ecb->dte_size == sizeof (dtrace_rechdr_t)) {
11176 /*
11177 * If the size is still sizeof (dtrace_rechdr_t), then all
11178 * actions store no data; set the size to 0.
11179 */
11180 ecb->dte_size = 0;
11181 }
11182
11183 ecb->dte_size = P2ROUNDUP(ecb->dte_size, sizeof (dtrace_epid_t));
11184 ecb->dte_needed = P2ROUNDUP(ecb->dte_needed, (sizeof (dtrace_epid_t)));
11185 ecb->dte_state->dts_needed = MAX(ecb->dte_state->dts_needed, ecb->dte_needed);
11186 return (0);
11187 }
11188
11189 static dtrace_action_t *
dtrace_ecb_aggregation_create(dtrace_ecb_t * ecb,dtrace_actdesc_t * desc)11190 dtrace_ecb_aggregation_create(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11191 {
11192 dtrace_aggregation_t *agg;
11193 size_t size = sizeof (uint64_t);
11194 int ntuple = desc->dtad_ntuple;
11195 dtrace_action_t *act;
11196 dtrace_recdesc_t *frec;
11197 dtrace_aggid_t aggid;
11198 dtrace_state_t *state = ecb->dte_state;
11199
11200 agg = kmem_zalloc(sizeof (dtrace_aggregation_t), KM_SLEEP);
11201 agg->dtag_ecb = ecb;
11202
11203 ASSERT(DTRACEACT_ISAGG(desc->dtad_kind));
11204
11205 switch (desc->dtad_kind) {
11206 case DTRACEAGG_MIN:
11207 agg->dtag_initial = INT64_MAX;
11208 agg->dtag_aggregate = dtrace_aggregate_min;
11209 break;
11210
11211 case DTRACEAGG_MAX:
11212 agg->dtag_initial = INT64_MIN;
11213 agg->dtag_aggregate = dtrace_aggregate_max;
11214 break;
11215
11216 case DTRACEAGG_COUNT:
11217 agg->dtag_aggregate = dtrace_aggregate_count;
11218 break;
11219
11220 case DTRACEAGG_QUANTIZE:
11221 agg->dtag_aggregate = dtrace_aggregate_quantize;
11222 size = (((sizeof (uint64_t) * NBBY) - 1) * 2 + 1) *
11223 sizeof (uint64_t);
11224 break;
11225
11226 case DTRACEAGG_LQUANTIZE: {
11227 uint16_t step = DTRACE_LQUANTIZE_STEP(desc->dtad_arg);
11228 uint16_t levels = DTRACE_LQUANTIZE_LEVELS(desc->dtad_arg);
11229
11230 agg->dtag_initial = desc->dtad_arg;
11231 agg->dtag_aggregate = dtrace_aggregate_lquantize;
11232
11233 if (step == 0 || levels == 0)
11234 goto err;
11235
11236 size = levels * sizeof (uint64_t) + 3 * sizeof (uint64_t);
11237 break;
11238 }
11239
11240 case DTRACEAGG_LLQUANTIZE: {
11241 uint16_t factor = DTRACE_LLQUANTIZE_FACTOR(desc->dtad_arg);
11242 uint16_t low = DTRACE_LLQUANTIZE_LOW(desc->dtad_arg);
11243 uint16_t high = DTRACE_LLQUANTIZE_HIGH(desc->dtad_arg);
11244 uint16_t nsteps = DTRACE_LLQUANTIZE_NSTEP(desc->dtad_arg);
11245 int64_t v;
11246
11247 agg->dtag_initial = desc->dtad_arg;
11248 agg->dtag_aggregate = dtrace_aggregate_llquantize;
11249
11250 if (factor < 2 || low >= high || nsteps < factor)
11251 goto err;
11252
11253 /*
11254 * Now check that the number of steps evenly divides a power
11255 * of the factor. (This assures both integer bucket size and
11256 * linearity within each magnitude.)
11257 */
11258 for (v = factor; v < nsteps; v *= factor)
11259 continue;
11260
11261 if ((v % nsteps) || (nsteps % factor))
11262 goto err;
11263
11264 size = (dtrace_aggregate_llquantize_bucket(factor, low, high, nsteps, INT64_MAX) + 2) * sizeof (uint64_t);
11265 break;
11266 }
11267
11268 case DTRACEAGG_AVG:
11269 agg->dtag_aggregate = dtrace_aggregate_avg;
11270 size = sizeof (uint64_t) * 2;
11271 break;
11272
11273 case DTRACEAGG_STDDEV:
11274 agg->dtag_aggregate = dtrace_aggregate_stddev;
11275 size = sizeof (uint64_t) * 4;
11276 break;
11277
11278 case DTRACEAGG_SUM:
11279 agg->dtag_aggregate = dtrace_aggregate_sum;
11280 break;
11281
11282 default:
11283 goto err;
11284 }
11285
11286 agg->dtag_action.dta_rec.dtrd_size = size;
11287
11288 if (ntuple == 0)
11289 goto err;
11290
11291 /*
11292 * We must make sure that we have enough actions for the n-tuple.
11293 */
11294 for (act = ecb->dte_action_last; act != NULL; act = act->dta_prev) {
11295 if (DTRACEACT_ISAGG(act->dta_kind))
11296 break;
11297
11298 if (--ntuple == 0) {
11299 /*
11300 * This is the action with which our n-tuple begins.
11301 */
11302 agg->dtag_first = act;
11303 goto success;
11304 }
11305 }
11306
11307 /*
11308 * This n-tuple is short by ntuple elements. Return failure.
11309 */
11310 ASSERT(ntuple != 0);
11311 err:
11312 kmem_free(agg, sizeof (dtrace_aggregation_t));
11313 return (NULL);
11314
11315 success:
11316 /*
11317 * If the last action in the tuple has a size of zero, it's actually
11318 * an expression argument for the aggregating action.
11319 */
11320 ASSERT(ecb->dte_action_last != NULL);
11321 act = ecb->dte_action_last;
11322
11323 if (act->dta_kind == DTRACEACT_DIFEXPR) {
11324 ASSERT(act->dta_difo != NULL);
11325
11326 if (act->dta_difo->dtdo_rtype.dtdt_size == 0)
11327 agg->dtag_hasarg = 1;
11328 }
11329
11330 /*
11331 * We need to allocate an id for this aggregation.
11332 */
11333 aggid = (dtrace_aggid_t)(uintptr_t)vmem_alloc(state->dts_aggid_arena, 1,
11334 VM_BESTFIT | VM_SLEEP);
11335
11336 if (aggid - 1 >= (dtrace_aggid_t)state->dts_naggregations) {
11337 dtrace_aggregation_t **oaggs = state->dts_aggregations;
11338 dtrace_aggregation_t **aggs;
11339 int naggs = state->dts_naggregations << 1;
11340 int onaggs = state->dts_naggregations;
11341
11342 ASSERT(aggid == (dtrace_aggid_t)state->dts_naggregations + 1);
11343
11344 if (naggs == 0) {
11345 ASSERT(oaggs == NULL);
11346 naggs = 1;
11347 }
11348
11349 aggs = kmem_zalloc(naggs * sizeof (*aggs), KM_SLEEP);
11350
11351 if (oaggs != NULL) {
11352 bcopy(oaggs, aggs, onaggs * sizeof (*aggs));
11353 kmem_free(oaggs, onaggs * sizeof (*aggs));
11354 }
11355
11356 state->dts_aggregations = aggs;
11357 state->dts_naggregations = naggs;
11358 }
11359
11360 ASSERT(state->dts_aggregations[aggid - 1] == NULL);
11361 state->dts_aggregations[(agg->dtag_id = aggid) - 1] = agg;
11362
11363 frec = &agg->dtag_first->dta_rec;
11364 if (frec->dtrd_alignment < sizeof (dtrace_aggid_t))
11365 frec->dtrd_alignment = sizeof (dtrace_aggid_t);
11366
11367 for (act = agg->dtag_first; act != NULL; act = act->dta_next) {
11368 ASSERT(!act->dta_intuple);
11369 act->dta_intuple = 1;
11370 }
11371
11372 return (&agg->dtag_action);
11373 }
11374
11375 static void
dtrace_ecb_aggregation_destroy(dtrace_ecb_t * ecb,dtrace_action_t * act)11376 dtrace_ecb_aggregation_destroy(dtrace_ecb_t *ecb, dtrace_action_t *act)
11377 {
11378 dtrace_aggregation_t *agg = (dtrace_aggregation_t *)act;
11379 dtrace_state_t *state = ecb->dte_state;
11380 dtrace_aggid_t aggid = agg->dtag_id;
11381
11382 ASSERT(DTRACEACT_ISAGG(act->dta_kind));
11383 vmem_free(state->dts_aggid_arena, (void *)(uintptr_t)aggid, 1);
11384
11385 ASSERT(state->dts_aggregations[aggid - 1] == agg);
11386 state->dts_aggregations[aggid - 1] = NULL;
11387
11388 kmem_free(agg, sizeof (dtrace_aggregation_t));
11389 }
11390
11391 static int
dtrace_ecb_action_add(dtrace_ecb_t * ecb,dtrace_actdesc_t * desc)11392 dtrace_ecb_action_add(dtrace_ecb_t *ecb, dtrace_actdesc_t *desc)
11393 {
11394 dtrace_action_t *action, *last;
11395 dtrace_difo_t *dp = desc->dtad_difo;
11396 uint32_t size = 0, align = sizeof (uint8_t), mask;
11397 uint16_t format = 0;
11398 dtrace_recdesc_t *rec;
11399 dtrace_state_t *state = ecb->dte_state;
11400 dtrace_optval_t *opt = state->dts_options;
11401 dtrace_optval_t nframes=0, strsize;
11402 uint64_t arg = desc->dtad_arg;
11403
11404 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11405 ASSERT(ecb->dte_action == NULL || ecb->dte_action->dta_refcnt == 1);
11406
11407 if (DTRACEACT_ISAGG(desc->dtad_kind)) {
11408 /*
11409 * If this is an aggregating action, there must be neither
11410 * a speculate nor a commit on the action chain.
11411 */
11412 dtrace_action_t *act;
11413
11414 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
11415 if (act->dta_kind == DTRACEACT_COMMIT)
11416 return (EINVAL);
11417
11418 if (act->dta_kind == DTRACEACT_SPECULATE)
11419 return (EINVAL);
11420 }
11421
11422 action = dtrace_ecb_aggregation_create(ecb, desc);
11423
11424 if (action == NULL)
11425 return (EINVAL);
11426 } else {
11427 if (DTRACEACT_ISDESTRUCTIVE(desc->dtad_kind) ||
11428 (desc->dtad_kind == DTRACEACT_DIFEXPR &&
11429 dp != NULL && dp->dtdo_destructive)) {
11430 state->dts_destructive = 1;
11431 }
11432
11433 switch (desc->dtad_kind) {
11434 case DTRACEACT_PRINTF:
11435 case DTRACEACT_PRINTA:
11436 case DTRACEACT_SYSTEM:
11437 case DTRACEACT_FREOPEN:
11438 case DTRACEACT_DIFEXPR:
11439 /*
11440 * We know that our arg is a string -- turn it into a
11441 * format.
11442 */
11443 if (arg == 0) {
11444 ASSERT(desc->dtad_kind == DTRACEACT_PRINTA ||
11445 desc->dtad_kind == DTRACEACT_DIFEXPR);
11446 format = 0;
11447 } else {
11448 ASSERT(arg != 0);
11449 ASSERT(arg > KERNELBASE);
11450 format = dtrace_format_add(state,
11451 (char *)(uintptr_t)arg);
11452 }
11453
11454 OS_FALLTHROUGH;
11455 case DTRACEACT_LIBACT:
11456 case DTRACEACT_TRACEMEM:
11457 case DTRACEACT_TRACEMEM_DYNSIZE:
11458 case DTRACEACT_APPLEBINARY: /* __APPLE__ */
11459 if (dp == NULL)
11460 return (EINVAL);
11461
11462 if ((size = dp->dtdo_rtype.dtdt_size) != 0)
11463 break;
11464
11465 if (dp->dtdo_rtype.dtdt_kind == DIF_TYPE_STRING) {
11466 if (!(dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11467 return (EINVAL);
11468
11469 size = opt[DTRACEOPT_STRSIZE];
11470 }
11471
11472 break;
11473
11474 case DTRACEACT_STACK:
11475 if ((nframes = arg) == 0) {
11476 nframes = opt[DTRACEOPT_STACKFRAMES];
11477 ASSERT(nframes > 0);
11478 arg = nframes;
11479 }
11480
11481 size = nframes * sizeof (pc_t);
11482 break;
11483
11484 case DTRACEACT_JSTACK:
11485 if ((strsize = DTRACE_USTACK_STRSIZE(arg)) == 0)
11486 strsize = opt[DTRACEOPT_JSTACKSTRSIZE];
11487
11488 if ((nframes = DTRACE_USTACK_NFRAMES(arg)) == 0)
11489 nframes = opt[DTRACEOPT_JSTACKFRAMES];
11490
11491 arg = DTRACE_USTACK_ARG(nframes, strsize);
11492
11493 OS_FALLTHROUGH;
11494 case DTRACEACT_USTACK:
11495 if (desc->dtad_kind != DTRACEACT_JSTACK &&
11496 (nframes = DTRACE_USTACK_NFRAMES(arg)) == 0) {
11497 strsize = DTRACE_USTACK_STRSIZE(arg);
11498 nframes = opt[DTRACEOPT_USTACKFRAMES];
11499 ASSERT(nframes > 0);
11500 arg = DTRACE_USTACK_ARG(nframes, strsize);
11501 }
11502
11503 /*
11504 * Save a slot for the pid.
11505 */
11506 size = (nframes + 1) * sizeof (uint64_t);
11507 size += DTRACE_USTACK_STRSIZE(arg);
11508 size = P2ROUNDUP(size, (uint32_t)(sizeof (uintptr_t)));
11509
11510 break;
11511
11512 case DTRACEACT_SYM:
11513 case DTRACEACT_MOD:
11514 if (dp == NULL || ((size = dp->dtdo_rtype.dtdt_size) !=
11515 sizeof (uint64_t)) ||
11516 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11517 return (EINVAL);
11518 break;
11519
11520 case DTRACEACT_USYM:
11521 case DTRACEACT_UMOD:
11522 case DTRACEACT_UADDR:
11523 if (dp == NULL ||
11524 (dp->dtdo_rtype.dtdt_size != sizeof (uint64_t)) ||
11525 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11526 return (EINVAL);
11527
11528 /*
11529 * We have a slot for the pid, plus a slot for the
11530 * argument. To keep things simple (aligned with
11531 * bitness-neutral sizing), we store each as a 64-bit
11532 * quantity.
11533 */
11534 size = 2 * sizeof (uint64_t);
11535 break;
11536
11537 case DTRACEACT_STOP:
11538 case DTRACEACT_BREAKPOINT:
11539 case DTRACEACT_PANIC:
11540 break;
11541
11542 case DTRACEACT_CHILL:
11543 case DTRACEACT_DISCARD:
11544 case DTRACEACT_RAISE:
11545 case DTRACEACT_PIDRESUME: /* __APPLE__ */
11546 if (dp == NULL)
11547 return (EINVAL);
11548 break;
11549
11550 case DTRACEACT_EXIT:
11551 if (dp == NULL ||
11552 (size = dp->dtdo_rtype.dtdt_size) != sizeof (int) ||
11553 (dp->dtdo_rtype.dtdt_flags & DIF_TF_BYREF))
11554 return (EINVAL);
11555 break;
11556
11557 case DTRACEACT_SPECULATE:
11558 if (ecb->dte_size > sizeof (dtrace_rechdr_t))
11559 return (EINVAL);
11560
11561 if (dp == NULL)
11562 return (EINVAL);
11563
11564 state->dts_speculates = 1;
11565 break;
11566
11567 case DTRACEACT_COMMIT: {
11568 dtrace_action_t *act = ecb->dte_action;
11569
11570 for (; act != NULL; act = act->dta_next) {
11571 if (act->dta_kind == DTRACEACT_COMMIT)
11572 return (EINVAL);
11573 }
11574
11575 if (dp == NULL)
11576 return (EINVAL);
11577 break;
11578 }
11579
11580 default:
11581 return (EINVAL);
11582 }
11583
11584 if (size != 0 || desc->dtad_kind == DTRACEACT_SPECULATE) {
11585 /*
11586 * If this is a data-storing action or a speculate,
11587 * we must be sure that there isn't a commit on the
11588 * action chain.
11589 */
11590 dtrace_action_t *act = ecb->dte_action;
11591
11592 for (; act != NULL; act = act->dta_next) {
11593 if (act->dta_kind == DTRACEACT_COMMIT)
11594 return (EINVAL);
11595 }
11596 }
11597
11598 action = kmem_zalloc(sizeof (dtrace_action_t), KM_SLEEP);
11599 action->dta_rec.dtrd_size = size;
11600 }
11601
11602 action->dta_refcnt = 1;
11603 rec = &action->dta_rec;
11604 size = rec->dtrd_size;
11605
11606 for (mask = sizeof (uint64_t) - 1; size != 0 && mask > 0; mask >>= 1) {
11607 if (!(size & mask)) {
11608 align = mask + 1;
11609 break;
11610 }
11611 }
11612
11613 action->dta_kind = desc->dtad_kind;
11614
11615 if ((action->dta_difo = dp) != NULL)
11616 dtrace_difo_hold(dp);
11617
11618 rec->dtrd_action = action->dta_kind;
11619 rec->dtrd_arg = arg;
11620 rec->dtrd_uarg = desc->dtad_uarg;
11621 rec->dtrd_alignment = (uint16_t)align;
11622 rec->dtrd_format = format;
11623
11624 if ((last = ecb->dte_action_last) != NULL) {
11625 ASSERT(ecb->dte_action != NULL);
11626 action->dta_prev = last;
11627 last->dta_next = action;
11628 } else {
11629 ASSERT(ecb->dte_action == NULL);
11630 ecb->dte_action = action;
11631 }
11632
11633 ecb->dte_action_last = action;
11634
11635 return (0);
11636 }
11637
11638 static void
dtrace_ecb_action_remove(dtrace_ecb_t * ecb)11639 dtrace_ecb_action_remove(dtrace_ecb_t *ecb)
11640 {
11641 dtrace_action_t *act = ecb->dte_action, *next;
11642 dtrace_vstate_t *vstate = &ecb->dte_state->dts_vstate;
11643 dtrace_difo_t *dp;
11644 uint16_t format;
11645
11646 if (act != NULL && act->dta_refcnt > 1) {
11647 ASSERT(act->dta_next == NULL || act->dta_next->dta_refcnt == 1);
11648 act->dta_refcnt--;
11649 } else {
11650 for (; act != NULL; act = next) {
11651 next = act->dta_next;
11652 ASSERT(next != NULL || act == ecb->dte_action_last);
11653 ASSERT(act->dta_refcnt == 1);
11654
11655 if ((format = act->dta_rec.dtrd_format) != 0)
11656 dtrace_format_remove(ecb->dte_state, format);
11657
11658 if ((dp = act->dta_difo) != NULL)
11659 dtrace_difo_release(dp, vstate);
11660
11661 if (DTRACEACT_ISAGG(act->dta_kind)) {
11662 dtrace_ecb_aggregation_destroy(ecb, act);
11663 } else {
11664 kmem_free(act, sizeof (dtrace_action_t));
11665 }
11666 }
11667 }
11668
11669 ecb->dte_action = NULL;
11670 ecb->dte_action_last = NULL;
11671 ecb->dte_size = 0;
11672 }
11673
11674 static void
dtrace_ecb_disable(dtrace_ecb_t * ecb)11675 dtrace_ecb_disable(dtrace_ecb_t *ecb)
11676 {
11677 /*
11678 * We disable the ECB by removing it from its probe.
11679 */
11680 dtrace_ecb_t *pecb, *prev = NULL;
11681 dtrace_probe_t *probe = ecb->dte_probe;
11682
11683 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11684
11685 if (probe == NULL) {
11686 /*
11687 * This is the NULL probe; there is nothing to disable.
11688 */
11689 return;
11690 }
11691
11692 for (pecb = probe->dtpr_ecb; pecb != NULL; pecb = pecb->dte_next) {
11693 if (pecb == ecb)
11694 break;
11695 prev = pecb;
11696 }
11697
11698 ASSERT(pecb != NULL);
11699
11700 if (prev == NULL) {
11701 probe->dtpr_ecb = ecb->dte_next;
11702 } else {
11703 prev->dte_next = ecb->dte_next;
11704 }
11705
11706 if (ecb == probe->dtpr_ecb_last) {
11707 ASSERT(ecb->dte_next == NULL);
11708 probe->dtpr_ecb_last = prev;
11709 }
11710
11711 probe->dtpr_provider->dtpv_ecb_count--;
11712 /*
11713 * The ECB has been disconnected from the probe; now sync to assure
11714 * that all CPUs have seen the change before returning.
11715 */
11716 dtrace_sync();
11717
11718 if (probe->dtpr_ecb == NULL) {
11719 /*
11720 * That was the last ECB on the probe; clear the predicate
11721 * cache ID for the probe, disable it and sync one more time
11722 * to assure that we'll never hit it again.
11723 */
11724 dtrace_provider_t *prov = probe->dtpr_provider;
11725
11726 ASSERT(ecb->dte_next == NULL);
11727 ASSERT(probe->dtpr_ecb_last == NULL);
11728 probe->dtpr_predcache = DTRACE_CACHEIDNONE;
11729 prov->dtpv_pops.dtps_disable(prov->dtpv_arg,
11730 probe->dtpr_id, probe->dtpr_arg);
11731 dtrace_sync();
11732 } else {
11733 /*
11734 * There is at least one ECB remaining on the probe. If there
11735 * is _exactly_ one, set the probe's predicate cache ID to be
11736 * the predicate cache ID of the remaining ECB.
11737 */
11738 ASSERT(probe->dtpr_ecb_last != NULL);
11739 ASSERT(probe->dtpr_predcache == DTRACE_CACHEIDNONE);
11740
11741 if (probe->dtpr_ecb == probe->dtpr_ecb_last) {
11742 dtrace_predicate_t *p = probe->dtpr_ecb->dte_predicate;
11743
11744 ASSERT(probe->dtpr_ecb->dte_next == NULL);
11745
11746 if (p != NULL)
11747 probe->dtpr_predcache = p->dtp_cacheid;
11748 }
11749
11750 ecb->dte_next = NULL;
11751 }
11752 }
11753
11754 static void
dtrace_ecb_destroy(dtrace_ecb_t * ecb)11755 dtrace_ecb_destroy(dtrace_ecb_t *ecb)
11756 {
11757 dtrace_state_t *state = ecb->dte_state;
11758 dtrace_vstate_t *vstate = &state->dts_vstate;
11759 dtrace_predicate_t *pred;
11760 dtrace_epid_t epid = ecb->dte_epid;
11761
11762 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11763 ASSERT(ecb->dte_next == NULL);
11764 ASSERT(ecb->dte_probe == NULL || ecb->dte_probe->dtpr_ecb != ecb);
11765
11766 if ((pred = ecb->dte_predicate) != NULL)
11767 dtrace_predicate_release(pred, vstate);
11768
11769 dtrace_ecb_action_remove(ecb);
11770
11771 ASSERT(state->dts_ecbs[epid - 1] == ecb);
11772 state->dts_ecbs[epid - 1] = NULL;
11773
11774 kmem_free(ecb, sizeof (dtrace_ecb_t));
11775 }
11776
11777 static dtrace_ecb_t *
dtrace_ecb_create(dtrace_state_t * state,dtrace_probe_t * probe,dtrace_enabling_t * enab)11778 dtrace_ecb_create(dtrace_state_t *state, dtrace_probe_t *probe,
11779 dtrace_enabling_t *enab)
11780 {
11781 dtrace_ecb_t *ecb;
11782 dtrace_predicate_t *pred;
11783 dtrace_actdesc_t *act;
11784 dtrace_provider_t *prov;
11785 dtrace_ecbdesc_t *desc = enab->dten_current;
11786
11787 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11788 ASSERT(state != NULL);
11789
11790 ecb = dtrace_ecb_add(state, probe);
11791 ecb->dte_uarg = desc->dted_uarg;
11792
11793 if ((pred = desc->dted_pred.dtpdd_predicate) != NULL) {
11794 dtrace_predicate_hold(pred);
11795 ecb->dte_predicate = pred;
11796 }
11797
11798 if (probe != NULL) {
11799 /*
11800 * If the provider shows more leg than the consumer is old
11801 * enough to see, we need to enable the appropriate implicit
11802 * predicate bits to prevent the ecb from activating at
11803 * revealing times.
11804 *
11805 * Providers specifying DTRACE_PRIV_USER at register time
11806 * are stating that they need the /proc-style privilege
11807 * model to be enforced, and this is what DTRACE_COND_OWNER
11808 * and DTRACE_COND_ZONEOWNER will then do at probe time.
11809 */
11810 prov = probe->dtpr_provider;
11811 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLPROC) &&
11812 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11813 ecb->dte_cond |= DTRACE_COND_OWNER;
11814
11815 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_ALLZONE) &&
11816 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_USER))
11817 ecb->dte_cond |= DTRACE_COND_ZONEOWNER;
11818
11819 /*
11820 * If the provider shows us kernel innards and the user
11821 * is lacking sufficient privilege, enable the
11822 * DTRACE_COND_USERMODE implicit predicate.
11823 */
11824 if (!(state->dts_cred.dcr_visible & DTRACE_CRV_KERNEL) &&
11825 (prov->dtpv_priv.dtpp_flags & DTRACE_PRIV_KERNEL))
11826 ecb->dte_cond |= DTRACE_COND_USERMODE;
11827 }
11828
11829 if (dtrace_ecb_create_cache != NULL) {
11830 /*
11831 * If we have a cached ecb, we'll use its action list instead
11832 * of creating our own (saving both time and space).
11833 */
11834 dtrace_ecb_t *cached = dtrace_ecb_create_cache;
11835 dtrace_action_t *act_if = cached->dte_action;
11836
11837 if (act_if != NULL) {
11838 ASSERT(act_if->dta_refcnt > 0);
11839 act_if->dta_refcnt++;
11840 ecb->dte_action = act_if;
11841 ecb->dte_action_last = cached->dte_action_last;
11842 ecb->dte_needed = cached->dte_needed;
11843 ecb->dte_size = cached->dte_size;
11844 ecb->dte_alignment = cached->dte_alignment;
11845 }
11846
11847 return (ecb);
11848 }
11849
11850 for (act = desc->dted_action; act != NULL; act = act->dtad_next) {
11851 if ((enab->dten_error = dtrace_ecb_action_add(ecb, act)) != 0) {
11852 dtrace_ecb_destroy(ecb);
11853 return (NULL);
11854 }
11855 }
11856
11857 if ((enab->dten_error = dtrace_ecb_resize(ecb)) != 0) {
11858 dtrace_ecb_destroy(ecb);
11859 return (NULL);
11860 }
11861
11862 return (dtrace_ecb_create_cache = ecb);
11863 }
11864
11865 static int
dtrace_ecb_create_enable(dtrace_probe_t * probe,void * arg1,void * arg2)11866 dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg1, void *arg2)
11867 {
11868 dtrace_ecb_t *ecb;
11869 dtrace_enabling_t *enab = arg1;
11870 dtrace_ecbdesc_t *ep = arg2;
11871 dtrace_state_t *state = enab->dten_vstate->dtvs_state;
11872
11873 ASSERT(state != NULL);
11874
11875 if (probe != NULL && ep != NULL && probe->dtpr_gen < ep->dted_probegen) {
11876 /*
11877 * This probe was created in a generation for which this
11878 * enabling has previously created ECBs; we don't want to
11879 * enable it again, so just kick out.
11880 */
11881 return (DTRACE_MATCH_NEXT);
11882 }
11883
11884 if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL)
11885 return (DTRACE_MATCH_DONE);
11886
11887 if (dtrace_ecb_enable(ecb) < 0)
11888 return (DTRACE_MATCH_FAIL);
11889
11890 return (DTRACE_MATCH_NEXT);
11891 }
11892
11893 static dtrace_ecb_t *
dtrace_epid2ecb(dtrace_state_t * state,dtrace_epid_t id)11894 dtrace_epid2ecb(dtrace_state_t *state, dtrace_epid_t id)
11895 {
11896 dtrace_ecb_t *ecb;
11897 #pragma unused(ecb) /* __APPLE__ */
11898
11899 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11900
11901 if (id == 0 || id > (dtrace_epid_t)state->dts_necbs)
11902 return (NULL);
11903
11904 ASSERT(state->dts_necbs > 0 && state->dts_ecbs != NULL);
11905 ASSERT((ecb = state->dts_ecbs[id - 1]) == NULL || ecb->dte_epid == id);
11906
11907 return (state->dts_ecbs[id - 1]);
11908 }
11909
11910 static dtrace_aggregation_t *
dtrace_aggid2agg(dtrace_state_t * state,dtrace_aggid_t id)11911 dtrace_aggid2agg(dtrace_state_t *state, dtrace_aggid_t id)
11912 {
11913 dtrace_aggregation_t *agg;
11914 #pragma unused(agg) /* __APPLE__ */
11915
11916 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
11917
11918 if (id == 0 || id > (dtrace_aggid_t)state->dts_naggregations)
11919 return (NULL);
11920
11921 ASSERT(state->dts_naggregations > 0 && state->dts_aggregations != NULL);
11922 ASSERT((agg = state->dts_aggregations[id - 1]) == NULL ||
11923 agg->dtag_id == id);
11924
11925 return (state->dts_aggregations[id - 1]);
11926 }
11927
11928 /*
11929 * DTrace Buffer Functions
11930 *
11931 * The following functions manipulate DTrace buffers. Most of these functions
11932 * are called in the context of establishing or processing consumer state;
11933 * exceptions are explicitly noted.
11934 */
11935
11936 /*
11937 * Note: called from cross call context. This function switches the two
11938 * buffers on a given CPU. The atomicity of this operation is assured by
11939 * disabling interrupts while the actual switch takes place; the disabling of
11940 * interrupts serializes the execution with any execution of dtrace_probe() on
11941 * the same CPU.
11942 */
11943 static void
dtrace_buffer_switch(dtrace_buffer_t * buf)11944 dtrace_buffer_switch(dtrace_buffer_t *buf)
11945 {
11946 caddr_t tomax = buf->dtb_tomax;
11947 caddr_t xamot = buf->dtb_xamot;
11948 dtrace_icookie_t cookie;
11949 hrtime_t now;
11950
11951 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
11952 ASSERT(!(buf->dtb_flags & DTRACEBUF_RING));
11953
11954 cookie = dtrace_interrupt_disable();
11955 now = dtrace_gethrtime();
11956 buf->dtb_tomax = xamot;
11957 buf->dtb_xamot = tomax;
11958 buf->dtb_xamot_drops = buf->dtb_drops;
11959 buf->dtb_xamot_offset = buf->dtb_offset;
11960 buf->dtb_xamot_errors = buf->dtb_errors;
11961 buf->dtb_xamot_flags = buf->dtb_flags;
11962 buf->dtb_offset = 0;
11963 buf->dtb_drops = 0;
11964 buf->dtb_errors = 0;
11965 buf->dtb_flags &= ~(DTRACEBUF_ERROR | DTRACEBUF_DROPPED);
11966 buf->dtb_interval = now - buf->dtb_switched;
11967 buf->dtb_switched = now;
11968 buf->dtb_cur_limit = buf->dtb_limit;
11969
11970 dtrace_interrupt_enable(cookie);
11971 }
11972
11973 /*
11974 * Note: called from cross call context. This function activates a buffer
11975 * on a CPU. As with dtrace_buffer_switch(), the atomicity of the operation
11976 * is guaranteed by the disabling of interrupts.
11977 */
11978 static void
dtrace_buffer_activate(dtrace_state_t * state)11979 dtrace_buffer_activate(dtrace_state_t *state)
11980 {
11981 dtrace_buffer_t *buf;
11982 dtrace_icookie_t cookie = dtrace_interrupt_disable();
11983
11984 buf = &state->dts_buffer[CPU->cpu_id];
11985
11986 if (buf->dtb_tomax != NULL) {
11987 /*
11988 * We might like to assert that the buffer is marked inactive,
11989 * but this isn't necessarily true: the buffer for the CPU
11990 * that processes the BEGIN probe has its buffer activated
11991 * manually. In this case, we take the (harmless) action
11992 * re-clearing the bit INACTIVE bit.
11993 */
11994 buf->dtb_flags &= ~DTRACEBUF_INACTIVE;
11995 }
11996
11997 dtrace_interrupt_enable(cookie);
11998 }
11999
12000 static int
dtrace_buffer_canalloc(size_t size)12001 dtrace_buffer_canalloc(size_t size)
12002 {
12003 if (size > (UINT64_MAX - dtrace_buffer_memory_inuse))
12004 return (B_FALSE);
12005 if ((size + dtrace_buffer_memory_inuse) > dtrace_buffer_memory_maxsize)
12006 return (B_FALSE);
12007
12008 return (B_TRUE);
12009 }
12010
12011 static int
dtrace_buffer_alloc(dtrace_buffer_t * bufs,size_t limit,size_t size,int flags,processorid_t cpu)12012 dtrace_buffer_alloc(dtrace_buffer_t *bufs, size_t limit, size_t size, int flags,
12013 processorid_t cpu)
12014 {
12015 dtrace_cpu_t *cp;
12016 dtrace_buffer_t *buf;
12017 size_t size_before_alloc = dtrace_buffer_memory_inuse;
12018
12019 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12020 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12021
12022 if (size > (size_t)dtrace_nonroot_maxsize &&
12023 !PRIV_POLICY_CHOICE(CRED(), PRIV_ALL, B_FALSE))
12024 return (EFBIG);
12025
12026 cp = cpu_list;
12027
12028 do {
12029 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12030 continue;
12031
12032 buf = &bufs[cp->cpu_id];
12033
12034 /*
12035 * If there is already a buffer allocated for this CPU, it
12036 * is only possible that this is a DR event. In this case,
12037 * the buffer size must match our specified size.
12038 */
12039 if (buf->dtb_tomax != NULL) {
12040 ASSERT(buf->dtb_size == size);
12041 continue;
12042 }
12043
12044 ASSERT(buf->dtb_xamot == NULL);
12045
12046 /* DTrace, please do not eat all the memory. */
12047 if (dtrace_buffer_canalloc(size) == B_FALSE)
12048 goto err;
12049 if ((buf->dtb_tomax = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12050 goto err;
12051 dtrace_buffer_memory_inuse += size;
12052
12053 /* Unsure that limit is always lower than size */
12054 limit = limit == size ? limit - 1 : limit;
12055 buf->dtb_cur_limit = limit;
12056 buf->dtb_limit = limit;
12057 buf->dtb_size = size;
12058 buf->dtb_flags = flags;
12059 buf->dtb_offset = 0;
12060 buf->dtb_drops = 0;
12061
12062 if (flags & DTRACEBUF_NOSWITCH)
12063 continue;
12064
12065 /* DTrace, please do not eat all the memory. */
12066 if (dtrace_buffer_canalloc(size) == B_FALSE)
12067 goto err;
12068 if ((buf->dtb_xamot = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
12069 goto err;
12070 dtrace_buffer_memory_inuse += size;
12071 } while ((cp = cp->cpu_next) != cpu_list);
12072
12073 ASSERT(dtrace_buffer_memory_inuse <= dtrace_buffer_memory_maxsize);
12074
12075 return (0);
12076
12077 err:
12078 cp = cpu_list;
12079
12080 do {
12081 if (cpu != DTRACE_CPUALL && cpu != cp->cpu_id)
12082 continue;
12083
12084 buf = &bufs[cp->cpu_id];
12085
12086 if (buf->dtb_xamot != NULL) {
12087 ASSERT(buf->dtb_tomax != NULL);
12088 ASSERT(buf->dtb_size == size);
12089 kmem_free(buf->dtb_xamot, size);
12090 }
12091
12092 if (buf->dtb_tomax != NULL) {
12093 ASSERT(buf->dtb_size == size);
12094 kmem_free(buf->dtb_tomax, size);
12095 }
12096
12097 buf->dtb_tomax = NULL;
12098 buf->dtb_xamot = NULL;
12099 buf->dtb_size = 0;
12100 } while ((cp = cp->cpu_next) != cpu_list);
12101
12102 /* Restore the size saved before allocating memory */
12103 dtrace_buffer_memory_inuse = size_before_alloc;
12104
12105 return (ENOMEM);
12106 }
12107
12108 /*
12109 * Note: called from probe context. This function just increments the drop
12110 * count on a buffer. It has been made a function to allow for the
12111 * possibility of understanding the source of mysterious drop counts. (A
12112 * problem for which one may be particularly disappointed that DTrace cannot
12113 * be used to understand DTrace.)
12114 */
12115 static void
dtrace_buffer_drop(dtrace_buffer_t * buf)12116 dtrace_buffer_drop(dtrace_buffer_t *buf)
12117 {
12118 buf->dtb_drops++;
12119 }
12120
12121 /*
12122 * Note: called from probe context. This function is called to reserve space
12123 * in a buffer. If mstate is non-NULL, sets the scratch base and size in the
12124 * mstate. Returns the new offset in the buffer, or a negative value if an
12125 * error has occurred.
12126 */
12127 static intptr_t
dtrace_buffer_reserve(dtrace_buffer_t * buf,size_t needed,size_t align,dtrace_state_t * state,dtrace_mstate_t * mstate)12128 dtrace_buffer_reserve(dtrace_buffer_t *buf, size_t needed, size_t align,
12129 dtrace_state_t *state, dtrace_mstate_t *mstate)
12130 {
12131 intptr_t offs = buf->dtb_offset, soffs;
12132 intptr_t woffs;
12133 caddr_t tomax;
12134 size_t total_off;
12135
12136 if (buf->dtb_flags & DTRACEBUF_INACTIVE)
12137 return (-1);
12138
12139 if ((tomax = buf->dtb_tomax) == NULL) {
12140 dtrace_buffer_drop(buf);
12141 return (-1);
12142 }
12143
12144 if (!(buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL))) {
12145 while (offs & (align - 1)) {
12146 /*
12147 * Assert that our alignment is off by a number which
12148 * is itself sizeof (uint32_t) aligned.
12149 */
12150 ASSERT(!((align - (offs & (align - 1))) &
12151 (sizeof (uint32_t) - 1)));
12152 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12153 offs += sizeof (uint32_t);
12154 }
12155
12156 if ((uint64_t)(soffs = offs + needed) > buf->dtb_cur_limit) {
12157 if (buf->dtb_cur_limit == buf->dtb_limit) {
12158 buf->dtb_cur_limit = buf->dtb_size;
12159
12160 os_atomic_inc(&state->dts_buf_over_limit, relaxed);
12161 /**
12162 * Set an AST on the current processor
12163 * so that we can wake up the process
12164 * outside of probe context, when we know
12165 * it is safe to do so
12166 */
12167 minor_t minor = getminor(state->dts_dev);
12168 ASSERT(minor < 32);
12169
12170 os_atomic_or(&dtrace_wake_clients, 1 << minor, relaxed);
12171 ast_dtrace_on();
12172 }
12173 if ((uint64_t)soffs > buf->dtb_size) {
12174 dtrace_buffer_drop(buf);
12175 return (-1);
12176 }
12177 }
12178
12179 if (mstate == NULL)
12180 return (offs);
12181
12182 mstate->dtms_scratch_base = (uintptr_t)tomax + soffs;
12183 mstate->dtms_scratch_size = buf->dtb_size - soffs;
12184 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12185
12186 return (offs);
12187 }
12188
12189 if (buf->dtb_flags & DTRACEBUF_FILL) {
12190 if (state->dts_activity != DTRACE_ACTIVITY_COOLDOWN &&
12191 (buf->dtb_flags & DTRACEBUF_FULL))
12192 return (-1);
12193 goto out;
12194 }
12195
12196 total_off = needed + (offs & (align - 1));
12197
12198 /*
12199 * For a ring buffer, life is quite a bit more complicated. Before
12200 * we can store any padding, we need to adjust our wrapping offset.
12201 * (If we've never before wrapped or we're not about to, no adjustment
12202 * is required.)
12203 */
12204 if ((buf->dtb_flags & DTRACEBUF_WRAPPED) ||
12205 offs + total_off > buf->dtb_size) {
12206 woffs = buf->dtb_xamot_offset;
12207
12208 if (offs + total_off > buf->dtb_size) {
12209 /*
12210 * We can't fit in the end of the buffer. First, a
12211 * sanity check that we can fit in the buffer at all.
12212 */
12213 if (total_off > buf->dtb_size) {
12214 dtrace_buffer_drop(buf);
12215 return (-1);
12216 }
12217
12218 /*
12219 * We're going to be storing at the top of the buffer,
12220 * so now we need to deal with the wrapped offset. We
12221 * only reset our wrapped offset to 0 if it is
12222 * currently greater than the current offset. If it
12223 * is less than the current offset, it is because a
12224 * previous allocation induced a wrap -- but the
12225 * allocation didn't subsequently take the space due
12226 * to an error or false predicate evaluation. In this
12227 * case, we'll just leave the wrapped offset alone: if
12228 * the wrapped offset hasn't been advanced far enough
12229 * for this allocation, it will be adjusted in the
12230 * lower loop.
12231 */
12232 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
12233 if (woffs >= offs)
12234 woffs = 0;
12235 } else {
12236 woffs = 0;
12237 }
12238
12239 /*
12240 * Now we know that we're going to be storing to the
12241 * top of the buffer and that there is room for us
12242 * there. We need to clear the buffer from the current
12243 * offset to the end (there may be old gunk there).
12244 */
12245 while ((uint64_t)offs < buf->dtb_size)
12246 tomax[offs++] = 0;
12247
12248 /*
12249 * We need to set our offset to zero. And because we
12250 * are wrapping, we need to set the bit indicating as
12251 * much. We can also adjust our needed space back
12252 * down to the space required by the ECB -- we know
12253 * that the top of the buffer is aligned.
12254 */
12255 offs = 0;
12256 total_off = needed;
12257 buf->dtb_flags |= DTRACEBUF_WRAPPED;
12258 } else {
12259 /*
12260 * There is room for us in the buffer, so we simply
12261 * need to check the wrapped offset.
12262 */
12263 if (woffs < offs) {
12264 /*
12265 * The wrapped offset is less than the offset.
12266 * This can happen if we allocated buffer space
12267 * that induced a wrap, but then we didn't
12268 * subsequently take the space due to an error
12269 * or false predicate evaluation. This is
12270 * okay; we know that _this_ allocation isn't
12271 * going to induce a wrap. We still can't
12272 * reset the wrapped offset to be zero,
12273 * however: the space may have been trashed in
12274 * the previous failed probe attempt. But at
12275 * least the wrapped offset doesn't need to
12276 * be adjusted at all...
12277 */
12278 goto out;
12279 }
12280 }
12281
12282 while (offs + total_off > (size_t)woffs) {
12283 dtrace_epid_t epid = *(uint32_t *)(tomax + woffs);
12284 size_t size;
12285
12286 if (epid == DTRACE_EPIDNONE) {
12287 size = sizeof (uint32_t);
12288 } else {
12289 ASSERT(epid <= (dtrace_epid_t)state->dts_necbs);
12290 ASSERT(state->dts_ecbs[epid - 1] != NULL);
12291
12292 size = state->dts_ecbs[epid - 1]->dte_size;
12293 }
12294
12295 ASSERT(woffs + size <= buf->dtb_size);
12296 ASSERT(size != 0);
12297
12298 if (woffs + size == buf->dtb_size) {
12299 /*
12300 * We've reached the end of the buffer; we want
12301 * to set the wrapped offset to 0 and break
12302 * out. However, if the offs is 0, then we're
12303 * in a strange edge-condition: the amount of
12304 * space that we want to reserve plus the size
12305 * of the record that we're overwriting is
12306 * greater than the size of the buffer. This
12307 * is problematic because if we reserve the
12308 * space but subsequently don't consume it (due
12309 * to a failed predicate or error) the wrapped
12310 * offset will be 0 -- yet the EPID at offset 0
12311 * will not be committed. This situation is
12312 * relatively easy to deal with: if we're in
12313 * this case, the buffer is indistinguishable
12314 * from one that hasn't wrapped; we need only
12315 * finish the job by clearing the wrapped bit,
12316 * explicitly setting the offset to be 0, and
12317 * zero'ing out the old data in the buffer.
12318 */
12319 if (offs == 0) {
12320 buf->dtb_flags &= ~DTRACEBUF_WRAPPED;
12321 buf->dtb_offset = 0;
12322 woffs = total_off;
12323
12324 while ((uint64_t)woffs < buf->dtb_size)
12325 tomax[woffs++] = 0;
12326 }
12327
12328 woffs = 0;
12329 break;
12330 }
12331
12332 woffs += size;
12333 }
12334
12335 /*
12336 * We have a wrapped offset. It may be that the wrapped offset
12337 * has become zero -- that's okay.
12338 */
12339 buf->dtb_xamot_offset = woffs;
12340 }
12341
12342 out:
12343 /*
12344 * Now we can plow the buffer with any necessary padding.
12345 */
12346 while (offs & (align - 1)) {
12347 /*
12348 * Assert that our alignment is off by a number which
12349 * is itself sizeof (uint32_t) aligned.
12350 */
12351 ASSERT(!((align - (offs & (align - 1))) &
12352 (sizeof (uint32_t) - 1)));
12353 DTRACE_STORE(uint32_t, tomax, offs, DTRACE_EPIDNONE);
12354 offs += sizeof (uint32_t);
12355 }
12356
12357 if (buf->dtb_flags & DTRACEBUF_FILL) {
12358 if (offs + needed > buf->dtb_size - state->dts_reserve) {
12359 buf->dtb_flags |= DTRACEBUF_FULL;
12360 return (-1);
12361 }
12362 }
12363
12364 if (mstate == NULL)
12365 return (offs);
12366
12367 /*
12368 * For ring buffers and fill buffers, the scratch space is always
12369 * the inactive buffer.
12370 */
12371 mstate->dtms_scratch_base = (uintptr_t)buf->dtb_xamot;
12372 mstate->dtms_scratch_size = buf->dtb_size;
12373 mstate->dtms_scratch_ptr = mstate->dtms_scratch_base;
12374
12375 return (offs);
12376 }
12377
12378 static void
dtrace_buffer_polish(dtrace_buffer_t * buf)12379 dtrace_buffer_polish(dtrace_buffer_t *buf)
12380 {
12381 ASSERT(buf->dtb_flags & DTRACEBUF_RING);
12382 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12383
12384 if (!(buf->dtb_flags & DTRACEBUF_WRAPPED))
12385 return;
12386
12387 /*
12388 * We need to polish the ring buffer. There are three cases:
12389 *
12390 * - The first (and presumably most common) is that there is no gap
12391 * between the buffer offset and the wrapped offset. In this case,
12392 * there is nothing in the buffer that isn't valid data; we can
12393 * mark the buffer as polished and return.
12394 *
12395 * - The second (less common than the first but still more common
12396 * than the third) is that there is a gap between the buffer offset
12397 * and the wrapped offset, and the wrapped offset is larger than the
12398 * buffer offset. This can happen because of an alignment issue, or
12399 * can happen because of a call to dtrace_buffer_reserve() that
12400 * didn't subsequently consume the buffer space. In this case,
12401 * we need to zero the data from the buffer offset to the wrapped
12402 * offset.
12403 *
12404 * - The third (and least common) is that there is a gap between the
12405 * buffer offset and the wrapped offset, but the wrapped offset is
12406 * _less_ than the buffer offset. This can only happen because a
12407 * call to dtrace_buffer_reserve() induced a wrap, but the space
12408 * was not subsequently consumed. In this case, we need to zero the
12409 * space from the offset to the end of the buffer _and_ from the
12410 * top of the buffer to the wrapped offset.
12411 */
12412 if (buf->dtb_offset < buf->dtb_xamot_offset) {
12413 bzero(buf->dtb_tomax + buf->dtb_offset,
12414 buf->dtb_xamot_offset - buf->dtb_offset);
12415 }
12416
12417 if (buf->dtb_offset > buf->dtb_xamot_offset) {
12418 bzero(buf->dtb_tomax + buf->dtb_offset,
12419 buf->dtb_size - buf->dtb_offset);
12420 bzero(buf->dtb_tomax, buf->dtb_xamot_offset);
12421 }
12422 }
12423
12424 static void
dtrace_buffer_free(dtrace_buffer_t * bufs)12425 dtrace_buffer_free(dtrace_buffer_t *bufs)
12426 {
12427 int i;
12428
12429 for (i = 0; i < (int)NCPU; i++) {
12430 dtrace_buffer_t *buf = &bufs[i];
12431
12432 if (buf->dtb_tomax == NULL) {
12433 ASSERT(buf->dtb_xamot == NULL);
12434 ASSERT(buf->dtb_size == 0);
12435 continue;
12436 }
12437
12438 if (buf->dtb_xamot != NULL) {
12439 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
12440 kmem_free(buf->dtb_xamot, buf->dtb_size);
12441
12442 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12443 dtrace_buffer_memory_inuse -= buf->dtb_size;
12444 }
12445
12446 kmem_free(buf->dtb_tomax, buf->dtb_size);
12447 ASSERT(dtrace_buffer_memory_inuse >= buf->dtb_size);
12448 dtrace_buffer_memory_inuse -= buf->dtb_size;
12449
12450 buf->dtb_size = 0;
12451 buf->dtb_tomax = NULL;
12452 buf->dtb_xamot = NULL;
12453 }
12454 }
12455
12456 /*
12457 * DTrace Enabling Functions
12458 */
12459 static dtrace_enabling_t *
dtrace_enabling_create(dtrace_vstate_t * vstate)12460 dtrace_enabling_create(dtrace_vstate_t *vstate)
12461 {
12462 dtrace_enabling_t *enab;
12463
12464 enab = kmem_zalloc(sizeof (dtrace_enabling_t), KM_SLEEP);
12465 enab->dten_vstate = vstate;
12466
12467 return (enab);
12468 }
12469
12470 static void
dtrace_enabling_add(dtrace_enabling_t * enab,dtrace_ecbdesc_t * ecb)12471 dtrace_enabling_add(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb)
12472 {
12473 dtrace_ecbdesc_t **ndesc;
12474 size_t osize, nsize;
12475
12476 /*
12477 * We can't add to enablings after we've enabled them, or after we've
12478 * retained them.
12479 */
12480 ASSERT(enab->dten_probegen == 0);
12481 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12482
12483 /* APPLE NOTE: this protects against gcc 4.0 botch on x86 */
12484 if (ecb == NULL) return;
12485
12486 if (enab->dten_ndesc < enab->dten_maxdesc) {
12487 enab->dten_desc[enab->dten_ndesc++] = ecb;
12488 return;
12489 }
12490
12491 osize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12492
12493 if (enab->dten_maxdesc == 0) {
12494 enab->dten_maxdesc = 1;
12495 } else {
12496 enab->dten_maxdesc <<= 1;
12497 }
12498
12499 ASSERT(enab->dten_ndesc < enab->dten_maxdesc);
12500
12501 nsize = enab->dten_maxdesc * sizeof (dtrace_enabling_t *);
12502 ndesc = kmem_zalloc(nsize, KM_SLEEP);
12503 bcopy(enab->dten_desc, ndesc, osize);
12504 kmem_free(enab->dten_desc, osize);
12505
12506 enab->dten_desc = ndesc;
12507 enab->dten_desc[enab->dten_ndesc++] = ecb;
12508 }
12509
12510 static void
dtrace_enabling_addlike(dtrace_enabling_t * enab,dtrace_ecbdesc_t * ecb,dtrace_probedesc_t * pd)12511 dtrace_enabling_addlike(dtrace_enabling_t *enab, dtrace_ecbdesc_t *ecb,
12512 dtrace_probedesc_t *pd)
12513 {
12514 dtrace_ecbdesc_t *new;
12515 dtrace_predicate_t *pred;
12516 dtrace_actdesc_t *act;
12517
12518 /*
12519 * We're going to create a new ECB description that matches the
12520 * specified ECB in every way, but has the specified probe description.
12521 */
12522 new = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
12523
12524 if ((pred = ecb->dted_pred.dtpdd_predicate) != NULL)
12525 dtrace_predicate_hold(pred);
12526
12527 for (act = ecb->dted_action; act != NULL; act = act->dtad_next)
12528 dtrace_actdesc_hold(act);
12529
12530 new->dted_action = ecb->dted_action;
12531 new->dted_pred = ecb->dted_pred;
12532 new->dted_probe = *pd;
12533 new->dted_uarg = ecb->dted_uarg;
12534
12535 dtrace_enabling_add(enab, new);
12536 }
12537
12538 static void
dtrace_enabling_dump(dtrace_enabling_t * enab)12539 dtrace_enabling_dump(dtrace_enabling_t *enab)
12540 {
12541 int i;
12542
12543 for (i = 0; i < enab->dten_ndesc; i++) {
12544 dtrace_probedesc_t *desc = &enab->dten_desc[i]->dted_probe;
12545
12546 cmn_err(CE_NOTE, "enabling probe %d (%s:%s:%s:%s)", i,
12547 desc->dtpd_provider, desc->dtpd_mod,
12548 desc->dtpd_func, desc->dtpd_name);
12549 }
12550 }
12551
12552 static void
dtrace_enabling_destroy(dtrace_enabling_t * enab)12553 dtrace_enabling_destroy(dtrace_enabling_t *enab)
12554 {
12555 int i;
12556 dtrace_ecbdesc_t *ep;
12557 dtrace_vstate_t *vstate = enab->dten_vstate;
12558
12559 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12560
12561 for (i = 0; i < enab->dten_ndesc; i++) {
12562 dtrace_actdesc_t *act, *next;
12563 dtrace_predicate_t *pred;
12564
12565 ep = enab->dten_desc[i];
12566
12567 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL)
12568 dtrace_predicate_release(pred, vstate);
12569
12570 for (act = ep->dted_action; act != NULL; act = next) {
12571 next = act->dtad_next;
12572 dtrace_actdesc_release(act, vstate);
12573 }
12574
12575 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
12576 }
12577
12578 kmem_free(enab->dten_desc,
12579 enab->dten_maxdesc * sizeof (dtrace_enabling_t *));
12580
12581 /*
12582 * If this was a retained enabling, decrement the dts_nretained count
12583 * and take it off of the dtrace_retained list.
12584 */
12585 if (enab->dten_prev != NULL || enab->dten_next != NULL ||
12586 dtrace_retained == enab) {
12587 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12588 ASSERT(enab->dten_vstate->dtvs_state->dts_nretained > 0);
12589 enab->dten_vstate->dtvs_state->dts_nretained--;
12590 dtrace_retained_gen++;
12591 }
12592
12593 if (enab->dten_prev == NULL) {
12594 if (dtrace_retained == enab) {
12595 dtrace_retained = enab->dten_next;
12596
12597 if (dtrace_retained != NULL)
12598 dtrace_retained->dten_prev = NULL;
12599 }
12600 } else {
12601 ASSERT(enab != dtrace_retained);
12602 ASSERT(dtrace_retained != NULL);
12603 enab->dten_prev->dten_next = enab->dten_next;
12604 }
12605
12606 if (enab->dten_next != NULL) {
12607 ASSERT(dtrace_retained != NULL);
12608 enab->dten_next->dten_prev = enab->dten_prev;
12609 }
12610
12611 kmem_free(enab, sizeof (dtrace_enabling_t));
12612 }
12613
12614 static int
dtrace_enabling_retain(dtrace_enabling_t * enab)12615 dtrace_enabling_retain(dtrace_enabling_t *enab)
12616 {
12617 dtrace_state_t *state;
12618
12619 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12620 ASSERT(enab->dten_next == NULL && enab->dten_prev == NULL);
12621 ASSERT(enab->dten_vstate != NULL);
12622
12623 state = enab->dten_vstate->dtvs_state;
12624 ASSERT(state != NULL);
12625
12626 /*
12627 * We only allow each state to retain dtrace_retain_max enablings.
12628 */
12629 if (state->dts_nretained >= dtrace_retain_max)
12630 return (ENOSPC);
12631
12632 state->dts_nretained++;
12633 dtrace_retained_gen++;
12634
12635 if (dtrace_retained == NULL) {
12636 dtrace_retained = enab;
12637 return (0);
12638 }
12639
12640 enab->dten_next = dtrace_retained;
12641 dtrace_retained->dten_prev = enab;
12642 dtrace_retained = enab;
12643
12644 return (0);
12645 }
12646
12647 static int
dtrace_enabling_replicate(dtrace_state_t * state,dtrace_probedesc_t * match,dtrace_probedesc_t * create)12648 dtrace_enabling_replicate(dtrace_state_t *state, dtrace_probedesc_t *match,
12649 dtrace_probedesc_t *create)
12650 {
12651 dtrace_enabling_t *new, *enab;
12652 int found = 0, err = ENOENT;
12653
12654 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12655 ASSERT(strlen(match->dtpd_provider) < DTRACE_PROVNAMELEN);
12656 ASSERT(strlen(match->dtpd_mod) < DTRACE_MODNAMELEN);
12657 ASSERT(strlen(match->dtpd_func) < DTRACE_FUNCNAMELEN);
12658 ASSERT(strlen(match->dtpd_name) < DTRACE_NAMELEN);
12659
12660 new = dtrace_enabling_create(&state->dts_vstate);
12661
12662 /*
12663 * Iterate over all retained enablings, looking for enablings that
12664 * match the specified state.
12665 */
12666 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12667 int i;
12668
12669 /*
12670 * dtvs_state can only be NULL for helper enablings -- and
12671 * helper enablings can't be retained.
12672 */
12673 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12674
12675 if (enab->dten_vstate->dtvs_state != state)
12676 continue;
12677
12678 /*
12679 * Now iterate over each probe description; we're looking for
12680 * an exact match to the specified probe description.
12681 */
12682 for (i = 0; i < enab->dten_ndesc; i++) {
12683 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12684 dtrace_probedesc_t *pd = &ep->dted_probe;
12685
12686 /* APPLE NOTE: Darwin employs size bounded string operation. */
12687 if (strncmp(pd->dtpd_provider, match->dtpd_provider, DTRACE_PROVNAMELEN))
12688 continue;
12689
12690 if (strncmp(pd->dtpd_mod, match->dtpd_mod, DTRACE_MODNAMELEN))
12691 continue;
12692
12693 if (strncmp(pd->dtpd_func, match->dtpd_func, DTRACE_FUNCNAMELEN))
12694 continue;
12695
12696 if (strncmp(pd->dtpd_name, match->dtpd_name, DTRACE_NAMELEN))
12697 continue;
12698
12699 /*
12700 * We have a winning probe! Add it to our growing
12701 * enabling.
12702 */
12703 found = 1;
12704 dtrace_enabling_addlike(new, ep, create);
12705 }
12706 }
12707
12708 if (!found || (err = dtrace_enabling_retain(new)) != 0) {
12709 dtrace_enabling_destroy(new);
12710 return (err);
12711 }
12712
12713 return (0);
12714 }
12715
12716 static void
dtrace_enabling_retract(dtrace_state_t * state)12717 dtrace_enabling_retract(dtrace_state_t *state)
12718 {
12719 dtrace_enabling_t *enab, *next;
12720
12721 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12722
12723 /*
12724 * Iterate over all retained enablings, destroy the enablings retained
12725 * for the specified state.
12726 */
12727 for (enab = dtrace_retained; enab != NULL; enab = next) {
12728 next = enab->dten_next;
12729
12730 /*
12731 * dtvs_state can only be NULL for helper enablings -- and
12732 * helper enablings can't be retained.
12733 */
12734 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12735
12736 if (enab->dten_vstate->dtvs_state == state) {
12737 ASSERT(state->dts_nretained > 0);
12738 dtrace_enabling_destroy(enab);
12739 }
12740 }
12741
12742 ASSERT(state->dts_nretained == 0);
12743 }
12744
12745 static int
dtrace_enabling_match(dtrace_enabling_t * enab,int * nmatched,dtrace_match_cond_t * cond)12746 dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched, dtrace_match_cond_t *cond)
12747 {
12748 int i = 0;
12749 int total_matched = 0, matched = 0;
12750
12751 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
12752 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12753
12754 for (i = 0; i < enab->dten_ndesc; i++) {
12755 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
12756
12757 enab->dten_current = ep;
12758 enab->dten_error = 0;
12759
12760 /**
12761 * Before doing a dtrace_probe_enable, which is really
12762 * expensive, check that this enabling matches the matching precondition
12763 * if we have one
12764 */
12765 if (cond && (cond->dmc_func(&ep->dted_probe, cond->dmc_data) == 0)) {
12766 continue;
12767 }
12768 /*
12769 * If a provider failed to enable a probe then get out and
12770 * let the consumer know we failed.
12771 */
12772 if ((matched = dtrace_probe_enable(&ep->dted_probe, enab, ep)) < 0)
12773 return (EBUSY);
12774
12775 total_matched += matched;
12776
12777 if (enab->dten_error != 0) {
12778 /*
12779 * If we get an error half-way through enabling the
12780 * probes, we kick out -- perhaps with some number of
12781 * them enabled. Leaving enabled probes enabled may
12782 * be slightly confusing for user-level, but we expect
12783 * that no one will attempt to actually drive on in
12784 * the face of such errors. If this is an anonymous
12785 * enabling (indicated with a NULL nmatched pointer),
12786 * we cmn_err() a message. We aren't expecting to
12787 * get such an error -- such as it can exist at all,
12788 * it would be a result of corrupted DOF in the driver
12789 * properties.
12790 */
12791 if (nmatched == NULL) {
12792 cmn_err(CE_WARN, "dtrace_enabling_match() "
12793 "error on %p: %d", (void *)ep,
12794 enab->dten_error);
12795 }
12796
12797 return (enab->dten_error);
12798 }
12799
12800 ep->dted_probegen = dtrace_probegen;
12801 }
12802
12803 if (nmatched != NULL)
12804 *nmatched = total_matched;
12805
12806 return (0);
12807 }
12808
12809 static void
dtrace_enabling_matchall_with_cond(dtrace_match_cond_t * cond)12810 dtrace_enabling_matchall_with_cond(dtrace_match_cond_t *cond)
12811 {
12812 dtrace_enabling_t *enab;
12813
12814 lck_mtx_lock(&cpu_lock);
12815 lck_mtx_lock(&dtrace_lock);
12816
12817 /*
12818 * Iterate over all retained enablings to see if any probes match
12819 * against them. We only perform this operation on enablings for which
12820 * we have sufficient permissions by virtue of being in the global zone
12821 * or in the same zone as the DTrace client. Because we can be called
12822 * after dtrace_detach() has been called, we cannot assert that there
12823 * are retained enablings. We can safely load from dtrace_retained,
12824 * however: the taskq_destroy() at the end of dtrace_detach() will
12825 * block pending our completion.
12826 */
12827
12828 /*
12829 * Darwin doesn't do zones.
12830 * Behave as if always in "global" zone."
12831 */
12832 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12833 (void) dtrace_enabling_match(enab, NULL, cond);
12834 }
12835
12836 lck_mtx_unlock(&dtrace_lock);
12837 lck_mtx_unlock(&cpu_lock);
12838
12839 }
12840
12841 static void
dtrace_enabling_matchall(void)12842 dtrace_enabling_matchall(void)
12843 {
12844 dtrace_enabling_matchall_with_cond(NULL);
12845 }
12846
12847
12848
12849 /*
12850 * If an enabling is to be enabled without having matched probes (that is, if
12851 * dtrace_state_go() is to be called on the underlying dtrace_state_t), the
12852 * enabling must be _primed_ by creating an ECB for every ECB description.
12853 * This must be done to assure that we know the number of speculations, the
12854 * number of aggregations, the minimum buffer size needed, etc. before we
12855 * transition out of DTRACE_ACTIVITY_INACTIVE. To do this without actually
12856 * enabling any probes, we create ECBs for every ECB decription, but with a
12857 * NULL probe -- which is exactly what this function does.
12858 */
12859 static void
dtrace_enabling_prime(dtrace_state_t * state)12860 dtrace_enabling_prime(dtrace_state_t *state)
12861 {
12862 dtrace_enabling_t *enab;
12863 int i;
12864
12865 for (enab = dtrace_retained; enab != NULL; enab = enab->dten_next) {
12866 ASSERT(enab->dten_vstate->dtvs_state != NULL);
12867
12868 if (enab->dten_vstate->dtvs_state != state)
12869 continue;
12870
12871 /*
12872 * We don't want to prime an enabling more than once, lest
12873 * we allow a malicious user to induce resource exhaustion.
12874 * (The ECBs that result from priming an enabling aren't
12875 * leaked -- but they also aren't deallocated until the
12876 * consumer state is destroyed.)
12877 */
12878 if (enab->dten_primed)
12879 continue;
12880
12881 for (i = 0; i < enab->dten_ndesc; i++) {
12882 enab->dten_current = enab->dten_desc[i];
12883 (void) dtrace_probe_enable(NULL, enab, NULL);
12884 }
12885
12886 enab->dten_primed = 1;
12887 }
12888 }
12889
12890 /*
12891 * Called to indicate that probes should be provided due to retained
12892 * enablings. This is implemented in terms of dtrace_probe_provide(), but it
12893 * must take an initial lap through the enabling calling the dtps_provide()
12894 * entry point explicitly to allow for autocreated probes.
12895 */
12896 static void
dtrace_enabling_provide(dtrace_provider_t * prv)12897 dtrace_enabling_provide(dtrace_provider_t *prv)
12898 {
12899 int i, all = 0;
12900 dtrace_probedesc_t desc;
12901 dtrace_genid_t gen;
12902
12903 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12904 LCK_MTX_ASSERT(&dtrace_provider_lock, LCK_MTX_ASSERT_OWNED);
12905
12906 if (prv == NULL) {
12907 all = 1;
12908 prv = dtrace_provider;
12909 }
12910
12911 do {
12912 dtrace_enabling_t *enab;
12913 void *parg = prv->dtpv_arg;
12914
12915 retry:
12916 gen = dtrace_retained_gen;
12917 for (enab = dtrace_retained; enab != NULL;
12918 enab = enab->dten_next) {
12919 for (i = 0; i < enab->dten_ndesc; i++) {
12920 desc = enab->dten_desc[i]->dted_probe;
12921 lck_mtx_unlock(&dtrace_lock);
12922 prv->dtpv_pops.dtps_provide(parg, &desc);
12923 lck_mtx_lock(&dtrace_lock);
12924 /*
12925 * Process the retained enablings again if
12926 * they have changed while we weren't holding
12927 * dtrace_lock.
12928 */
12929 if (gen != dtrace_retained_gen)
12930 goto retry;
12931 }
12932 }
12933 } while (all && (prv = prv->dtpv_next) != NULL);
12934
12935 lck_mtx_unlock(&dtrace_lock);
12936 dtrace_probe_provide(NULL, all ? NULL : prv);
12937 lck_mtx_lock(&dtrace_lock);
12938 }
12939
12940 /*
12941 * DTrace DOF Functions
12942 */
12943 /*ARGSUSED*/
12944 static void
dtrace_dof_error(dof_hdr_t * dof,const char * str)12945 dtrace_dof_error(dof_hdr_t *dof, const char *str)
12946 {
12947 #pragma unused(dof) /* __APPLE__ */
12948 if (dtrace_err_verbose)
12949 cmn_err(CE_WARN, "failed to process DOF: %s", str);
12950
12951 #ifdef DTRACE_ERRDEBUG
12952 dtrace_errdebug(str);
12953 #endif
12954 }
12955
12956 /*
12957 * Create DOF out of a currently enabled state. Right now, we only create
12958 * DOF containing the run-time options -- but this could be expanded to create
12959 * complete DOF representing the enabled state.
12960 */
12961 static dof_hdr_t *
dtrace_dof_create(dtrace_state_t * state)12962 dtrace_dof_create(dtrace_state_t *state)
12963 {
12964 dof_hdr_t *dof;
12965 dof_sec_t *sec;
12966 dof_optdesc_t *opt;
12967 int i, len = sizeof (dof_hdr_t) +
12968 roundup(sizeof (dof_sec_t), sizeof (uint64_t)) +
12969 sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
12970
12971 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
12972
12973 dof = kmem_zalloc_aligned(len, 8, KM_SLEEP);
12974 dof->dofh_ident[DOF_ID_MAG0] = DOF_MAG_MAG0;
12975 dof->dofh_ident[DOF_ID_MAG1] = DOF_MAG_MAG1;
12976 dof->dofh_ident[DOF_ID_MAG2] = DOF_MAG_MAG2;
12977 dof->dofh_ident[DOF_ID_MAG3] = DOF_MAG_MAG3;
12978
12979 dof->dofh_ident[DOF_ID_MODEL] = DOF_MODEL_NATIVE;
12980 dof->dofh_ident[DOF_ID_ENCODING] = DOF_ENCODE_NATIVE;
12981 dof->dofh_ident[DOF_ID_VERSION] = DOF_VERSION;
12982 dof->dofh_ident[DOF_ID_DIFVERS] = DIF_VERSION;
12983 dof->dofh_ident[DOF_ID_DIFIREG] = DIF_DIR_NREGS;
12984 dof->dofh_ident[DOF_ID_DIFTREG] = DIF_DTR_NREGS;
12985
12986 dof->dofh_flags = 0;
12987 dof->dofh_hdrsize = sizeof (dof_hdr_t);
12988 dof->dofh_secsize = sizeof (dof_sec_t);
12989 dof->dofh_secnum = 1; /* only DOF_SECT_OPTDESC */
12990 dof->dofh_secoff = sizeof (dof_hdr_t);
12991 dof->dofh_loadsz = len;
12992 dof->dofh_filesz = len;
12993 dof->dofh_pad = 0;
12994
12995 /*
12996 * Fill in the option section header...
12997 */
12998 sec = (dof_sec_t *)((uintptr_t)dof + sizeof (dof_hdr_t));
12999 sec->dofs_type = DOF_SECT_OPTDESC;
13000 sec->dofs_align = sizeof (uint64_t);
13001 sec->dofs_flags = DOF_SECF_LOAD;
13002 sec->dofs_entsize = sizeof (dof_optdesc_t);
13003
13004 opt = (dof_optdesc_t *)((uintptr_t)sec +
13005 roundup(sizeof (dof_sec_t), sizeof (uint64_t)));
13006
13007 sec->dofs_offset = (uintptr_t)opt - (uintptr_t)dof;
13008 sec->dofs_size = sizeof (dof_optdesc_t) * DTRACEOPT_MAX;
13009
13010 for (i = 0; i < DTRACEOPT_MAX; i++) {
13011 opt[i].dofo_option = i;
13012 opt[i].dofo_strtab = DOF_SECIDX_NONE;
13013 opt[i].dofo_value = state->dts_options[i];
13014 }
13015
13016 return (dof);
13017 }
13018
13019 static dof_hdr_t *
dtrace_dof_copyin(user_addr_t uarg,int * errp)13020 dtrace_dof_copyin(user_addr_t uarg, int *errp)
13021 {
13022 dof_hdr_t hdr, *dof;
13023
13024 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13025
13026 /*
13027 * First, we're going to copyin() the sizeof (dof_hdr_t).
13028 */
13029 if (copyin(uarg, &hdr, sizeof (hdr)) != 0) {
13030 dtrace_dof_error(NULL, "failed to copyin DOF header");
13031 *errp = EFAULT;
13032 return (NULL);
13033 }
13034
13035 /*
13036 * Now we'll allocate the entire DOF and copy it in -- provided
13037 * that the length isn't outrageous.
13038 */
13039 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13040 dtrace_dof_error(&hdr, "load size exceeds maximum");
13041 *errp = E2BIG;
13042 return (NULL);
13043 }
13044
13045 if (hdr.dofh_loadsz < sizeof (hdr)) {
13046 dtrace_dof_error(&hdr, "invalid load size");
13047 *errp = EINVAL;
13048 return (NULL);
13049 }
13050
13051 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13052
13053 if (copyin(uarg, dof, hdr.dofh_loadsz) != 0 ||
13054 dof->dofh_loadsz != hdr.dofh_loadsz) {
13055 kmem_free_aligned(dof, hdr.dofh_loadsz);
13056 *errp = EFAULT;
13057 return (NULL);
13058 }
13059
13060 return (dof);
13061 }
13062
13063 static dof_hdr_t *
dtrace_dof_copyin_from_proc(proc_t * p,user_addr_t uarg,int * errp)13064 dtrace_dof_copyin_from_proc(proc_t* p, user_addr_t uarg, int *errp)
13065 {
13066 dof_hdr_t hdr, *dof;
13067
13068 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
13069
13070 /*
13071 * First, we're going to copyin() the sizeof (dof_hdr_t).
13072 */
13073 if (uread(p, &hdr, sizeof(hdr), uarg) != KERN_SUCCESS) {
13074 dtrace_dof_error(NULL, "failed to copyin DOF header");
13075 *errp = EFAULT;
13076 return (NULL);
13077 }
13078
13079 /*
13080 * Now we'll allocate the entire DOF and copy it in -- provided
13081 * that the length isn't outrageous.
13082 */
13083 if (hdr.dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13084 dtrace_dof_error(&hdr, "load size exceeds maximum");
13085 *errp = E2BIG;
13086 return (NULL);
13087 }
13088
13089 if (hdr.dofh_loadsz < sizeof (hdr)) {
13090 dtrace_dof_error(&hdr, "invalid load size");
13091 *errp = EINVAL;
13092 return (NULL);
13093 }
13094
13095 dof = kmem_alloc_aligned(hdr.dofh_loadsz, 8, KM_SLEEP);
13096
13097 if (uread(p, dof, hdr.dofh_loadsz, uarg) != KERN_SUCCESS) {
13098 kmem_free_aligned(dof, hdr.dofh_loadsz);
13099 *errp = EFAULT;
13100 return (NULL);
13101 }
13102
13103 return (dof);
13104 }
13105
13106 static void
dtrace_dof_destroy(dof_hdr_t * dof)13107 dtrace_dof_destroy(dof_hdr_t *dof)
13108 {
13109 kmem_free_aligned(dof, dof->dofh_loadsz);
13110 }
13111
13112 static dof_hdr_t *
dtrace_dof_property(const char * name)13113 dtrace_dof_property(const char *name)
13114 {
13115 unsigned int len = 0;
13116 dof_hdr_t *dof;
13117
13118 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
13119 return NULL;
13120 }
13121
13122 if (!PEReadNVRAMProperty(name, NULL, &len)) {
13123 return NULL;
13124 }
13125
13126 dof = kmem_alloc_aligned(len, 8, KM_SLEEP);
13127
13128 if (!PEReadNVRAMProperty(name, dof, &len)) {
13129 dtrace_dof_destroy(dof);
13130 dtrace_dof_error(NULL, "unreadable DOF");
13131 return NULL;
13132 }
13133
13134 if (len < sizeof (dof_hdr_t)) {
13135 dtrace_dof_destroy(dof);
13136 dtrace_dof_error(NULL, "truncated header");
13137 return (NULL);
13138 }
13139
13140 if (len < dof->dofh_loadsz) {
13141 dtrace_dof_destroy(dof);
13142 dtrace_dof_error(NULL, "truncated DOF");
13143 return (NULL);
13144 }
13145
13146 if (len != dof->dofh_loadsz) {
13147 dtrace_dof_destroy(dof);
13148 dtrace_dof_error(NULL, "invalid DOF size");
13149 return (NULL);
13150 }
13151
13152 if (dof->dofh_loadsz >= (uint64_t)dtrace_dof_maxsize) {
13153 dtrace_dof_destroy(dof);
13154 dtrace_dof_error(NULL, "oversized DOF");
13155 return (NULL);
13156 }
13157
13158 return (dof);
13159 }
13160
13161 /*
13162 * Return the dof_sec_t pointer corresponding to a given section index. If the
13163 * index is not valid, dtrace_dof_error() is called and NULL is returned. If
13164 * a type other than DOF_SECT_NONE is specified, the header is checked against
13165 * this type and NULL is returned if the types do not match.
13166 */
13167 static dof_sec_t *
dtrace_dof_sect(dof_hdr_t * dof,uint32_t type,dof_secidx_t i)13168 dtrace_dof_sect(dof_hdr_t *dof, uint32_t type, dof_secidx_t i)
13169 {
13170 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)
13171 ((uintptr_t)dof + dof->dofh_secoff + i * dof->dofh_secsize);
13172
13173 if (i >= dof->dofh_secnum) {
13174 dtrace_dof_error(dof, "referenced section index is invalid");
13175 return (NULL);
13176 }
13177
13178 if (!(sec->dofs_flags & DOF_SECF_LOAD)) {
13179 dtrace_dof_error(dof, "referenced section is not loadable");
13180 return (NULL);
13181 }
13182
13183 if (type != DOF_SECT_NONE && type != sec->dofs_type) {
13184 dtrace_dof_error(dof, "referenced section is the wrong type");
13185 return (NULL);
13186 }
13187
13188 return (sec);
13189 }
13190
13191 static dtrace_probedesc_t *
dtrace_dof_probedesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_probedesc_t * desc)13192 dtrace_dof_probedesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_probedesc_t *desc)
13193 {
13194 dof_probedesc_t *probe;
13195 dof_sec_t *strtab;
13196 uintptr_t daddr = (uintptr_t)dof;
13197 uintptr_t str;
13198 size_t size;
13199
13200 if (sec->dofs_type != DOF_SECT_PROBEDESC) {
13201 dtrace_dof_error(dof, "invalid probe section");
13202 return (NULL);
13203 }
13204
13205 if (sec->dofs_align != sizeof (dof_secidx_t)) {
13206 dtrace_dof_error(dof, "bad alignment in probe description");
13207 return (NULL);
13208 }
13209
13210 if (sec->dofs_offset + sizeof (dof_probedesc_t) > dof->dofh_loadsz) {
13211 dtrace_dof_error(dof, "truncated probe description");
13212 return (NULL);
13213 }
13214
13215 probe = (dof_probedesc_t *)(uintptr_t)(daddr + sec->dofs_offset);
13216 strtab = dtrace_dof_sect(dof, DOF_SECT_STRTAB, probe->dofp_strtab);
13217
13218 if (strtab == NULL)
13219 return (NULL);
13220
13221 str = daddr + strtab->dofs_offset;
13222 size = strtab->dofs_size;
13223
13224 if (probe->dofp_provider >= strtab->dofs_size) {
13225 dtrace_dof_error(dof, "corrupt probe provider");
13226 return (NULL);
13227 }
13228
13229 (void) strncpy(desc->dtpd_provider,
13230 (char *)(str + probe->dofp_provider),
13231 MIN(DTRACE_PROVNAMELEN - 1, size - probe->dofp_provider));
13232
13233 /* APPLE NOTE: Darwin employs size bounded string operation. */
13234 desc->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
13235
13236 if (probe->dofp_mod >= strtab->dofs_size) {
13237 dtrace_dof_error(dof, "corrupt probe module");
13238 return (NULL);
13239 }
13240
13241 (void) strncpy(desc->dtpd_mod, (char *)(str + probe->dofp_mod),
13242 MIN(DTRACE_MODNAMELEN - 1, size - probe->dofp_mod));
13243
13244 /* APPLE NOTE: Darwin employs size bounded string operation. */
13245 desc->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
13246
13247 if (probe->dofp_func >= strtab->dofs_size) {
13248 dtrace_dof_error(dof, "corrupt probe function");
13249 return (NULL);
13250 }
13251
13252 (void) strncpy(desc->dtpd_func, (char *)(str + probe->dofp_func),
13253 MIN(DTRACE_FUNCNAMELEN - 1, size - probe->dofp_func));
13254
13255 /* APPLE NOTE: Darwin employs size bounded string operation. */
13256 desc->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
13257
13258 if (probe->dofp_name >= strtab->dofs_size) {
13259 dtrace_dof_error(dof, "corrupt probe name");
13260 return (NULL);
13261 }
13262
13263 (void) strncpy(desc->dtpd_name, (char *)(str + probe->dofp_name),
13264 MIN(DTRACE_NAMELEN - 1, size - probe->dofp_name));
13265
13266 /* APPLE NOTE: Darwin employs size bounded string operation. */
13267 desc->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
13268
13269 return (desc);
13270 }
13271
/*
 * Build a dtrace_difo_t from the DOF_SECT_DIFOHDR section 'sec'.
 *
 * The DIFO header names a list of sub-sections (dofd_links).  Each recognized
 * sub-section (DIF text, integer table, string table, variable table) is
 * copied into a freshly allocated buffer hung off the new DIF object; at most
 * one of each kind is allowed and the combined size of all linked sections
 * may not exceed dtrace_difo_maxsize.  The result is then checked with
 * dtrace_difo_validate() and initialized with dtrace_difo_init().
 *
 * Returns the new DIF object, or NULL on any failure (in which case
 * everything allocated here has been freed again).
 */
static dtrace_difo_t *
dtrace_dof_difo(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
    cred_t *cr)
{
	dtrace_difo_t *dp;
	size_t ttl = 0;		/* running total of linked section sizes */
	dof_difohdr_t *dofd;
	uintptr_t daddr = (uintptr_t)dof;
	size_t max_size = dtrace_difo_maxsize;
	uint_t i;
	int l, n;


	/*
	 * Table of the sub-section types we know how to load.  For each:
	 * where in dtrace_difo_t the buffer pointer and the length live, the
	 * entry size and alignment the section must declare, and the error
	 * to report if a second section of that type shows up.  The table is
	 * terminated by a DOF_SECT_NONE row.
	 */
	static const struct {
		int section;
		int bufoffs;
		int lenoffs;
		int entsize;
		int align;
		const char *msg;
	} difo[] = {
		{ DOF_SECT_DIF, offsetof(dtrace_difo_t, dtdo_buf),
		offsetof(dtrace_difo_t, dtdo_len), sizeof (dif_instr_t),
		sizeof (dif_instr_t), "multiple DIF sections" },

		{ DOF_SECT_INTTAB, offsetof(dtrace_difo_t, dtdo_inttab),
		offsetof(dtrace_difo_t, dtdo_intlen), sizeof (uint64_t),
		sizeof (uint64_t), "multiple integer tables" },

		{ DOF_SECT_STRTAB, offsetof(dtrace_difo_t, dtdo_strtab),
		offsetof(dtrace_difo_t, dtdo_strlen), 0,
		sizeof (char), "multiple string tables" },

		{ DOF_SECT_VARTAB, offsetof(dtrace_difo_t, dtdo_vartab),
		offsetof(dtrace_difo_t, dtdo_varlen), sizeof (dtrace_difv_t),
		sizeof (uint_t), "multiple variable tables" },

		{ DOF_SECT_NONE, 0, 0, 0, 0, NULL }
	};

	if (sec->dofs_type != DOF_SECT_DIFOHDR) {
		dtrace_dof_error(dof, "invalid DIFO header section");
		return (NULL);
	}

	if (sec->dofs_align != sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad alignment in DIFO header");
		return (NULL);
	}

	if (sec->dofs_size < sizeof (dof_difohdr_t) ||
	    sec->dofs_size % sizeof (dof_secidx_t)) {
		dtrace_dof_error(dof, "bad size in DIFO header");
		return (NULL);
	}

	dofd = (dof_difohdr_t *)(uintptr_t)(daddr + sec->dofs_offset);
	/*
	 * Number of section links: whatever extends beyond the fixed header
	 * plus one (presumably the single link slot that is part of
	 * dof_difohdr_t itself -- confirm against its declaration).
	 */
	n = (sec->dofs_size - sizeof (*dofd)) / sizeof (dof_secidx_t) + 1;

	/* Zeroed, so every buffer pointer and length starts out NULL/0. */
	dp = kmem_zalloc(sizeof (dtrace_difo_t), KM_SLEEP);
	dp->dtdo_rtype = dofd->dofd_rtype;

	for (l = 0; l < n; l++) {
		dof_sec_t *subsec;
		void **bufp;
		uint32_t *lenp;

		if ((subsec = dtrace_dof_sect(dof, DOF_SECT_NONE,
		    dofd->dofd_links[l])) == NULL)
			goto err; /* invalid section link */

		if (ttl + subsec->dofs_size > max_size) {
			dtrace_dof_error(dof, "exceeds maximum size");
			goto err;
		}

		ttl += subsec->dofs_size;

		/*
		 * Find the table row for this section type; when none
		 * matches, the loop ends with 'i' on the terminator row.
		 */
		for (i = 0; difo[i].section != DOF_SECT_NONE; i++) {

			if (subsec->dofs_type != (uint32_t)difo[i].section)
				continue;

			if (!(subsec->dofs_flags & DOF_SECF_LOAD)) {
				dtrace_dof_error(dof, "section not loaded");
				goto err;
			}

			if (subsec->dofs_align != (uint32_t)difo[i].align) {
				dtrace_dof_error(dof, "bad alignment");
				goto err;
			}

			bufp = (void **)((uintptr_t)dp + difo[i].bufoffs);
			lenp = (uint32_t *)((uintptr_t)dp + difo[i].lenoffs);

			/* A non-NULL buffer means this type was seen before. */
			if (*bufp != NULL) {
				dtrace_dof_error(dof, difo[i].msg);
				goto err;
			}

			if ((uint32_t)difo[i].entsize != subsec->dofs_entsize) {
				dtrace_dof_error(dof, "entry size mismatch");
				goto err;
			}

			if (subsec->dofs_entsize != 0 &&
			    (subsec->dofs_size % subsec->dofs_entsize) != 0) {
				dtrace_dof_error(dof, "corrupt entry size");
				goto err;
			}

			/*
			 * Copy the section contents.  The length is stored
			 * in bytes first and then converted to a count of
			 * entries for sections with a non-zero entry size.
			 */
			*lenp = subsec->dofs_size;
			*bufp = kmem_alloc(subsec->dofs_size, KM_SLEEP);
			bcopy((char *)(uintptr_t)(daddr + subsec->dofs_offset),
			    *bufp, subsec->dofs_size);

			if (subsec->dofs_entsize != 0)
				*lenp /= subsec->dofs_entsize;

			break;
		}

		/*
		 * If we encounter a loadable DIFO sub-section that is not
		 * known to us, assume this is a broken program and fail.
		 */
		if (difo[i].section == DOF_SECT_NONE &&
		    (subsec->dofs_flags & DOF_SECF_LOAD)) {
			dtrace_dof_error(dof, "unrecognized DIFO subsection");
			goto err;
		}
	}

	if (dp->dtdo_buf == NULL) {
		/*
		 * We can't have a DIF object without DIF text.
		 */
		dtrace_dof_error(dof, "missing DIF text");
		goto err;
	}

	/*
	 * Before we validate the DIF object, run through the variable table
	 * looking for the strings -- if any of their size are under, we'll set
	 * their size to be the system-wide default string size.  Note that
	 * this should _not_ happen if the "strsize" option has been set --
	 * in this case, the compiler should have set the size to reflect the
	 * setting of the option.
	 */
	for (i = 0; i < dp->dtdo_varlen; i++) {
		dtrace_difv_t *v = &dp->dtdo_vartab[i];
		dtrace_diftype_t *t = &v->dtdv_type;

		/* Only variables with ids at or above DIF_VAR_OTHER_UBASE. */
		if (v->dtdv_id < DIF_VAR_OTHER_UBASE)
			continue;

		if (t->dtdt_kind == DIF_TYPE_STRING && t->dtdt_size == 0)
			t->dtdt_size = dtrace_strsize_default;
	}

	if (dtrace_difo_validate(dp, vstate, DIF_DIR_NREGS, cr) != 0)
		goto err;

	dtrace_difo_init(dp, vstate);
	return (dp);

err:
	/*
	 * Undo every copy made above.  Tables that were never loaded still
	 * have the NULL pointer and zero length left by kmem_zalloc().
	 */
	kmem_free(dp->dtdo_buf, dp->dtdo_len * sizeof (dif_instr_t));
	kmem_free(dp->dtdo_inttab, dp->dtdo_intlen * sizeof (uint64_t));
	kmem_free(dp->dtdo_strtab, dp->dtdo_strlen);
	kmem_free(dp->dtdo_vartab, dp->dtdo_varlen * sizeof (dtrace_difv_t));

	kmem_free(dp, sizeof (dtrace_difo_t));
	return (NULL);
}
13448
13449 static dtrace_predicate_t *
dtrace_dof_predicate(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13450 dtrace_dof_predicate(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13451 cred_t *cr)
13452 {
13453 dtrace_difo_t *dp;
13454
13455 if ((dp = dtrace_dof_difo(dof, sec, vstate, cr)) == NULL)
13456 return (NULL);
13457
13458 return (dtrace_predicate_create(dp));
13459 }
13460
13461 static dtrace_actdesc_t *
dtrace_dof_actdesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13462 dtrace_dof_actdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13463 cred_t *cr)
13464 {
13465 dtrace_actdesc_t *act, *first = NULL, *last = NULL, *next;
13466 dof_actdesc_t *desc;
13467 dof_sec_t *difosec;
13468 size_t offs;
13469 uintptr_t daddr = (uintptr_t)dof;
13470 uint64_t arg;
13471 dtrace_actkind_t kind;
13472
13473 if (sec->dofs_type != DOF_SECT_ACTDESC) {
13474 dtrace_dof_error(dof, "invalid action section");
13475 return (NULL);
13476 }
13477
13478 if (sec->dofs_offset + sizeof (dof_actdesc_t) > dof->dofh_loadsz) {
13479 dtrace_dof_error(dof, "truncated action description");
13480 return (NULL);
13481 }
13482
13483 if (sec->dofs_align != sizeof (uint64_t)) {
13484 dtrace_dof_error(dof, "bad alignment in action description");
13485 return (NULL);
13486 }
13487
13488 if (sec->dofs_size < sec->dofs_entsize) {
13489 dtrace_dof_error(dof, "section entry size exceeds total size");
13490 return (NULL);
13491 }
13492
13493 if (sec->dofs_entsize != sizeof (dof_actdesc_t)) {
13494 dtrace_dof_error(dof, "bad entry size in action description");
13495 return (NULL);
13496 }
13497
13498 if (sec->dofs_size / sec->dofs_entsize > dtrace_actions_max) {
13499 dtrace_dof_error(dof, "actions exceed dtrace_actions_max");
13500 return (NULL);
13501 }
13502
13503 for (offs = 0; offs < sec->dofs_size; offs += sec->dofs_entsize) {
13504 desc = (dof_actdesc_t *)(daddr +
13505 (uintptr_t)sec->dofs_offset + offs);
13506 kind = (dtrace_actkind_t)desc->dofa_kind;
13507
13508 if ((DTRACEACT_ISPRINTFLIKE(kind) &&
13509 (kind != DTRACEACT_PRINTA || desc->dofa_strtab != DOF_SECIDX_NONE)) ||
13510 (kind == DTRACEACT_DIFEXPR && desc->dofa_strtab != DOF_SECIDX_NONE))
13511 {
13512 dof_sec_t *strtab;
13513 char *str, *fmt;
13514 uint64_t i;
13515
13516 /*
13517 * The argument to these actions is an index into the
13518 * DOF string table. For printf()-like actions, this
13519 * is the format string. For print(), this is the
13520 * CTF type of the expression result.
13521 */
13522 if ((strtab = dtrace_dof_sect(dof,
13523 DOF_SECT_STRTAB, desc->dofa_strtab)) == NULL)
13524 goto err;
13525
13526 str = (char *)((uintptr_t)dof +
13527 (uintptr_t)strtab->dofs_offset);
13528
13529 for (i = desc->dofa_arg; i < strtab->dofs_size; i++) {
13530 if (str[i] == '\0')
13531 break;
13532 }
13533
13534 if (i >= strtab->dofs_size) {
13535 dtrace_dof_error(dof, "bogus format string");
13536 goto err;
13537 }
13538
13539 if (i == desc->dofa_arg) {
13540 dtrace_dof_error(dof, "empty format string");
13541 goto err;
13542 }
13543
13544 i -= desc->dofa_arg;
13545 fmt = kmem_alloc(i + 1, KM_SLEEP);
13546 bcopy(&str[desc->dofa_arg], fmt, i + 1);
13547 arg = (uint64_t)(uintptr_t)fmt;
13548 } else {
13549 if (kind == DTRACEACT_PRINTA) {
13550 ASSERT(desc->dofa_strtab == DOF_SECIDX_NONE);
13551 arg = 0;
13552 } else {
13553 arg = desc->dofa_arg;
13554 }
13555 }
13556
13557 act = dtrace_actdesc_create(kind, desc->dofa_ntuple,
13558 desc->dofa_uarg, arg);
13559
13560 if (last != NULL) {
13561 last->dtad_next = act;
13562 } else {
13563 first = act;
13564 }
13565
13566 last = act;
13567
13568 if (desc->dofa_difo == DOF_SECIDX_NONE)
13569 continue;
13570
13571 if ((difosec = dtrace_dof_sect(dof,
13572 DOF_SECT_DIFOHDR, desc->dofa_difo)) == NULL)
13573 goto err;
13574
13575 act->dtad_difo = dtrace_dof_difo(dof, difosec, vstate, cr);
13576
13577 if (act->dtad_difo == NULL)
13578 goto err;
13579 }
13580
13581 ASSERT(first != NULL);
13582 return (first);
13583
13584 err:
13585 for (act = first; act != NULL; act = next) {
13586 next = act->dtad_next;
13587 dtrace_actdesc_release(act, vstate);
13588 }
13589
13590 return (NULL);
13591 }
13592
13593 static dtrace_ecbdesc_t *
dtrace_dof_ecbdesc(dof_hdr_t * dof,dof_sec_t * sec,dtrace_vstate_t * vstate,cred_t * cr)13594 dtrace_dof_ecbdesc(dof_hdr_t *dof, dof_sec_t *sec, dtrace_vstate_t *vstate,
13595 cred_t *cr)
13596 {
13597 dtrace_ecbdesc_t *ep;
13598 dof_ecbdesc_t *ecb;
13599 dtrace_probedesc_t *desc;
13600 dtrace_predicate_t *pred = NULL;
13601
13602 if (sec->dofs_size < sizeof (dof_ecbdesc_t)) {
13603 dtrace_dof_error(dof, "truncated ECB description");
13604 return (NULL);
13605 }
13606
13607 if (sec->dofs_align != sizeof (uint64_t)) {
13608 dtrace_dof_error(dof, "bad alignment in ECB description");
13609 return (NULL);
13610 }
13611
13612 ecb = (dof_ecbdesc_t *)((uintptr_t)dof + (uintptr_t)sec->dofs_offset);
13613 sec = dtrace_dof_sect(dof, DOF_SECT_PROBEDESC, ecb->dofe_probes);
13614
13615 if (sec == NULL)
13616 return (NULL);
13617
13618 ep = kmem_zalloc(sizeof (dtrace_ecbdesc_t), KM_SLEEP);
13619 ep->dted_uarg = ecb->dofe_uarg;
13620 desc = &ep->dted_probe;
13621
13622 if (dtrace_dof_probedesc(dof, sec, desc) == NULL)
13623 goto err;
13624
13625 if (ecb->dofe_pred != DOF_SECIDX_NONE) {
13626 if ((sec = dtrace_dof_sect(dof,
13627 DOF_SECT_DIFOHDR, ecb->dofe_pred)) == NULL)
13628 goto err;
13629
13630 if ((pred = dtrace_dof_predicate(dof, sec, vstate, cr)) == NULL)
13631 goto err;
13632
13633 ep->dted_pred.dtpdd_predicate = pred;
13634 }
13635
13636 if (ecb->dofe_actions != DOF_SECIDX_NONE) {
13637 if ((sec = dtrace_dof_sect(dof,
13638 DOF_SECT_ACTDESC, ecb->dofe_actions)) == NULL)
13639 goto err;
13640
13641 ep->dted_action = dtrace_dof_actdesc(dof, sec, vstate, cr);
13642
13643 if (ep->dted_action == NULL)
13644 goto err;
13645 }
13646
13647 return (ep);
13648
13649 err:
13650 if (pred != NULL)
13651 dtrace_predicate_release(pred, vstate);
13652 kmem_free(ep, sizeof (dtrace_ecbdesc_t));
13653 return (NULL);
13654 }
13655
13656 /*
13657 * APPLE NOTE: dyld handles dof relocation.
13658 * Darwin does not need dtrace_dof_relocate()
13659 */
13660
13661 /*
13662 * The dof_hdr_t passed to dtrace_dof_slurp() should be a partially validated
13663 * header: it should be at the front of a memory region that is at least
13664 * sizeof (dof_hdr_t) in size -- and then at least dof_hdr.dofh_loadsz in
13665 * size. It need not be validated in any other way.
13666 */
13667 static int
dtrace_dof_slurp(dof_hdr_t * dof,dtrace_vstate_t * vstate,cred_t * cr,dtrace_enabling_t ** enabp,uint64_t ubase,int noprobes)13668 dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr,
13669 dtrace_enabling_t **enabp, uint64_t ubase, int noprobes)
13670 {
13671 #pragma unused(ubase) /* __APPLE__ */
13672 uint64_t len = dof->dofh_loadsz, seclen;
13673 uintptr_t daddr = (uintptr_t)dof;
13674 dtrace_ecbdesc_t *ep;
13675 dtrace_enabling_t *enab;
13676 uint_t i;
13677
13678 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13679 ASSERT(dof->dofh_loadsz >= sizeof (dof_hdr_t));
13680
13681 /*
13682 * Check the DOF header identification bytes. In addition to checking
13683 * valid settings, we also verify that unused bits/bytes are zeroed so
13684 * we can use them later without fear of regressing existing binaries.
13685 */
13686 if (bcmp(&dof->dofh_ident[DOF_ID_MAG0],
13687 DOF_MAG_STRING, DOF_MAG_STRLEN) != 0) {
13688 dtrace_dof_error(dof, "DOF magic string mismatch");
13689 return (-1);
13690 }
13691
13692 if (dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_ILP32 &&
13693 dof->dofh_ident[DOF_ID_MODEL] != DOF_MODEL_LP64) {
13694 dtrace_dof_error(dof, "DOF has invalid data model");
13695 return (-1);
13696 }
13697
13698 if (dof->dofh_ident[DOF_ID_ENCODING] != DOF_ENCODE_NATIVE) {
13699 dtrace_dof_error(dof, "DOF encoding mismatch");
13700 return (-1);
13701 }
13702
13703 /*
13704 * APPLE NOTE: Darwin only supports DOF_VERSION_3 for now.
13705 */
13706 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_3) {
13707 dtrace_dof_error(dof, "DOF version mismatch");
13708 return (-1);
13709 }
13710
13711 if (dof->dofh_ident[DOF_ID_DIFVERS] != DIF_VERSION_2) {
13712 dtrace_dof_error(dof, "DOF uses unsupported instruction set");
13713 return (-1);
13714 }
13715
13716 if (dof->dofh_ident[DOF_ID_DIFIREG] > DIF_DIR_NREGS) {
13717 dtrace_dof_error(dof, "DOF uses too many integer registers");
13718 return (-1);
13719 }
13720
13721 if (dof->dofh_ident[DOF_ID_DIFTREG] > DIF_DTR_NREGS) {
13722 dtrace_dof_error(dof, "DOF uses too many tuple registers");
13723 return (-1);
13724 }
13725
13726 for (i = DOF_ID_PAD; i < DOF_ID_SIZE; i++) {
13727 if (dof->dofh_ident[i] != 0) {
13728 dtrace_dof_error(dof, "DOF has invalid ident byte set");
13729 return (-1);
13730 }
13731 }
13732
13733 if (dof->dofh_flags & ~DOF_FL_VALID) {
13734 dtrace_dof_error(dof, "DOF has invalid flag bits set");
13735 return (-1);
13736 }
13737
13738 if (dof->dofh_secsize < sizeof(dof_sec_t)) {
13739 dtrace_dof_error(dof, "invalid section header size");
13740 return (-1);
13741 }
13742
13743 /*
13744 * Check that the section headers don't exceed the amount of DOF
13745 * data. Note that we cast the section size and number of sections
13746 * to uint64_t's to prevent possible overflow in the multiplication.
13747 */
13748 seclen = (uint64_t)dof->dofh_secnum * (uint64_t)dof->dofh_secsize;
13749
13750 if (dof->dofh_secoff > len || seclen > len ||
13751 dof->dofh_secoff + seclen > len) {
13752 dtrace_dof_error(dof, "truncated section headers");
13753 return (-1);
13754 }
13755
13756 if (!IS_P2ALIGNED(dof->dofh_secoff, sizeof (uint64_t))) {
13757 dtrace_dof_error(dof, "misaligned section headers");
13758 return (-1);
13759 }
13760
13761 if (!IS_P2ALIGNED(dof->dofh_secsize, sizeof (uint64_t))) {
13762 dtrace_dof_error(dof, "misaligned section size");
13763 return (-1);
13764 }
13765
13766 /*
13767 * Take an initial pass through the section headers to be sure that
13768 * the headers don't have stray offsets. If the 'noprobes' flag is
13769 * set, do not permit sections relating to providers, probes, or args.
13770 */
13771 for (i = 0; i < dof->dofh_secnum; i++) {
13772 dof_sec_t *sec = (dof_sec_t *)(daddr +
13773 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13774
13775 if (noprobes) {
13776 switch (sec->dofs_type) {
13777 case DOF_SECT_PROVIDER:
13778 case DOF_SECT_PROBES:
13779 case DOF_SECT_PRARGS:
13780 case DOF_SECT_PROFFS:
13781 dtrace_dof_error(dof, "illegal sections "
13782 "for enabling");
13783 return (-1);
13784 }
13785 }
13786
13787 if (sec->dofs_align & (sec->dofs_align - 1)) {
13788 dtrace_dof_error(dof, "bad section alignment");
13789 return (-1);
13790 }
13791
13792 if (sec->dofs_offset & (sec->dofs_align - 1)) {
13793 dtrace_dof_error(dof, "misaligned section");
13794 return (-1);
13795 }
13796
13797 if (sec->dofs_flags & DOF_SECF_LOAD) {
13798 len = dof->dofh_loadsz;
13799 } else {
13800 len = dof->dofh_filesz;
13801 }
13802
13803 if (sec->dofs_offset > len || sec->dofs_size > len ||
13804 sec->dofs_offset + sec->dofs_size > len) {
13805 dtrace_dof_error(dof, "corrupt section header");
13806 return (-1);
13807 }
13808
13809 if (sec->dofs_type == DOF_SECT_STRTAB && *((char *)daddr +
13810 sec->dofs_offset + sec->dofs_size - 1) != '\0') {
13811 dtrace_dof_error(dof, "non-terminating string table");
13812 return (-1);
13813 }
13814 }
13815
13816 /*
13817 * APPLE NOTE: We have no further relocation to perform.
13818 * All dof values are relative offsets.
13819 */
13820
13821 if ((enab = *enabp) == NULL)
13822 enab = *enabp = dtrace_enabling_create(vstate);
13823
13824 for (i = 0; i < dof->dofh_secnum; i++) {
13825 dof_sec_t *sec = (dof_sec_t *)(daddr +
13826 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13827
13828 if (sec->dofs_type != DOF_SECT_ECBDESC)
13829 continue;
13830
13831 /*
13832 * APPLE NOTE: Defend against gcc 4.0 botch on x86.
13833 * not all paths out of inlined dtrace_dof_ecbdesc
13834 * are checked for the NULL return value.
13835 * Check for NULL explicitly here.
13836 */
13837 ep = dtrace_dof_ecbdesc(dof, sec, vstate, cr);
13838 if (ep == NULL) {
13839 dtrace_enabling_destroy(enab);
13840 *enabp = NULL;
13841 return (-1);
13842 }
13843
13844 dtrace_enabling_add(enab, ep);
13845 }
13846
13847 return (0);
13848 }
13849
13850 /*
13851 * Process DOF for any options. This routine assumes that the DOF has been
13852 * at least processed by dtrace_dof_slurp().
13853 */
13854 static int
dtrace_dof_options(dof_hdr_t * dof,dtrace_state_t * state)13855 dtrace_dof_options(dof_hdr_t *dof, dtrace_state_t *state)
13856 {
13857 uint_t i;
13858 int rval;
13859 uint32_t entsize;
13860 size_t offs;
13861 dof_optdesc_t *desc;
13862
13863 for (i = 0; i < dof->dofh_secnum; i++) {
13864 dof_sec_t *sec = (dof_sec_t *)((uintptr_t)dof +
13865 (uintptr_t)dof->dofh_secoff + i * dof->dofh_secsize);
13866
13867 if (sec->dofs_type != DOF_SECT_OPTDESC)
13868 continue;
13869
13870 if (sec->dofs_align != sizeof (uint64_t)) {
13871 dtrace_dof_error(dof, "bad alignment in "
13872 "option description");
13873 return (EINVAL);
13874 }
13875
13876 if ((entsize = sec->dofs_entsize) == 0) {
13877 dtrace_dof_error(dof, "zeroed option entry size");
13878 return (EINVAL);
13879 }
13880
13881 if (entsize < sizeof (dof_optdesc_t)) {
13882 dtrace_dof_error(dof, "bad option entry size");
13883 return (EINVAL);
13884 }
13885
13886 for (offs = 0; offs < sec->dofs_size; offs += entsize) {
13887 desc = (dof_optdesc_t *)((uintptr_t)dof +
13888 (uintptr_t)sec->dofs_offset + offs);
13889
13890 if (desc->dofo_strtab != DOF_SECIDX_NONE) {
13891 dtrace_dof_error(dof, "non-zero option string");
13892 return (EINVAL);
13893 }
13894
13895 if (desc->dofo_value == (uint64_t)DTRACEOPT_UNSET) {
13896 dtrace_dof_error(dof, "unset option");
13897 return (EINVAL);
13898 }
13899
13900 if ((rval = dtrace_state_option(state,
13901 desc->dofo_option, desc->dofo_value)) != 0) {
13902 dtrace_dof_error(dof, "rejected option");
13903 return (rval);
13904 }
13905 }
13906 }
13907
13908 return (0);
13909 }
13910
13911 /*
13912 * DTrace Consumer State Functions
13913 */
13914 static int
dtrace_dstate_init(dtrace_dstate_t * dstate,size_t size)13915 dtrace_dstate_init(dtrace_dstate_t *dstate, size_t size)
13916 {
13917 size_t hashsize, maxper, min_size, chunksize = dstate->dtds_chunksize;
13918 void *base;
13919 uintptr_t limit;
13920 dtrace_dynvar_t *dvar, *next, *start;
13921
13922 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
13923 ASSERT(dstate->dtds_base == NULL && dstate->dtds_percpu == NULL);
13924
13925 bzero(dstate, sizeof (dtrace_dstate_t));
13926
13927 if ((dstate->dtds_chunksize = chunksize) == 0)
13928 dstate->dtds_chunksize = DTRACE_DYNVAR_CHUNKSIZE;
13929
13930 VERIFY(dstate->dtds_chunksize < (LONG_MAX - sizeof (dtrace_dynhash_t)));
13931
13932 if (size < (min_size = dstate->dtds_chunksize + sizeof (dtrace_dynhash_t)))
13933 size = min_size;
13934
13935 if ((base = kmem_zalloc(size, KM_NOSLEEP)) == NULL)
13936 return (ENOMEM);
13937
13938 dstate->dtds_size = size;
13939 dstate->dtds_base = base;
13940 dstate->dtds_percpu = zalloc_percpu(dtrace_state_pcpu_zone, Z_WAITOK | Z_ZERO);
13941
13942 hashsize = size / (dstate->dtds_chunksize + sizeof (dtrace_dynhash_t));
13943
13944 if (hashsize != 1 && (hashsize & 1))
13945 hashsize--;
13946
13947 dstate->dtds_hashsize = hashsize;
13948 dstate->dtds_hash = dstate->dtds_base;
13949
13950 /*
13951 * Set all of our hash buckets to point to the single sink, and (if
13952 * it hasn't already been set), set the sink's hash value to be the
13953 * sink sentinel value. The sink is needed for dynamic variable
13954 * lookups to know that they have iterated over an entire, valid hash
13955 * chain.
13956 */
13957 for (size_t i = 0; i < hashsize; i++)
13958 dstate->dtds_hash[i].dtdh_chain = &dtrace_dynhash_sink;
13959
13960 if (dtrace_dynhash_sink.dtdv_hashval != DTRACE_DYNHASH_SINK)
13961 dtrace_dynhash_sink.dtdv_hashval = DTRACE_DYNHASH_SINK;
13962
13963 /*
13964 * Determine number of active CPUs. Divide free list evenly among
13965 * active CPUs.
13966 */
13967 start = (dtrace_dynvar_t *)
13968 ((uintptr_t)base + hashsize * sizeof (dtrace_dynhash_t));
13969 limit = (uintptr_t)base + size;
13970
13971 VERIFY((uintptr_t)start < limit);
13972 VERIFY((uintptr_t)start >= (uintptr_t)base);
13973
13974 maxper = (limit - (uintptr_t)start) / (int)NCPU;
13975 maxper = (maxper / dstate->dtds_chunksize) * dstate->dtds_chunksize;
13976
13977 zpercpu_foreach_cpu(i) {
13978 dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
13979
13980 dcpu->dtdsc_free = dvar = start;
13981
13982 /*
13983 * If we don't even have enough chunks to make it once through
13984 * NCPUs, we're just going to allocate everything to the first
13985 * CPU. And if we're on the last CPU, we're going to allocate
13986 * whatever is left over. In either case, we set the limit to
13987 * be the limit of the dynamic variable space.
13988 */
13989 if (maxper == 0 || i == NCPU - 1) {
13990 limit = (uintptr_t)base + size;
13991 start = NULL;
13992 } else {
13993 limit = (uintptr_t)start + maxper;
13994 start = (dtrace_dynvar_t *)limit;
13995 }
13996
13997 VERIFY(limit <= (uintptr_t)base + size);
13998
13999 for (;;) {
14000 next = (dtrace_dynvar_t *)((uintptr_t)dvar +
14001 dstate->dtds_chunksize);
14002
14003 if ((uintptr_t)next + dstate->dtds_chunksize >= limit)
14004 break;
14005
14006 VERIFY((uintptr_t)dvar >= (uintptr_t)base &&
14007 (uintptr_t)dvar <= (uintptr_t)base + size);
14008 dvar->dtdv_next = next;
14009 dvar = next;
14010 }
14011
14012 if (maxper == 0)
14013 break;
14014 }
14015
14016 return (0);
14017 }
14018
14019 static void
dtrace_dstate_fini(dtrace_dstate_t * dstate)14020 dtrace_dstate_fini(dtrace_dstate_t *dstate)
14021 {
14022 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14023
14024 if (dstate->dtds_base == NULL)
14025 return;
14026
14027 kmem_free(dstate->dtds_base, dstate->dtds_size);
14028 zfree_percpu(dtrace_state_pcpu_zone, dstate->dtds_percpu);
14029 }
14030
14031 static void
dtrace_vstate_fini(dtrace_vstate_t * vstate)14032 dtrace_vstate_fini(dtrace_vstate_t *vstate)
14033 {
14034 /*
14035 * Logical XOR, where are you?
14036 */
14037 ASSERT((vstate->dtvs_nglobals == 0) ^ (vstate->dtvs_globals != NULL));
14038
14039 if (vstate->dtvs_nglobals > 0) {
14040 kmem_free(vstate->dtvs_globals, vstate->dtvs_nglobals *
14041 sizeof (dtrace_statvar_t *));
14042 }
14043
14044 if (vstate->dtvs_ntlocals > 0) {
14045 kmem_free(vstate->dtvs_tlocals, vstate->dtvs_ntlocals *
14046 sizeof (dtrace_difv_t));
14047 }
14048
14049 ASSERT((vstate->dtvs_nlocals == 0) ^ (vstate->dtvs_locals != NULL));
14050
14051 if (vstate->dtvs_nlocals > 0) {
14052 kmem_free(vstate->dtvs_locals, vstate->dtvs_nlocals *
14053 sizeof (dtrace_statvar_t *));
14054 }
14055 }
14056
14057 static void
dtrace_state_clean(dtrace_state_t * state)14058 dtrace_state_clean(dtrace_state_t *state)
14059 {
14060 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE)
14061 return;
14062
14063 dtrace_dynvar_clean(&state->dts_vstate.dtvs_dynvars);
14064 dtrace_speculation_clean(state);
14065 }
14066
14067 static void
dtrace_state_deadman(dtrace_state_t * state)14068 dtrace_state_deadman(dtrace_state_t *state)
14069 {
14070 hrtime_t now;
14071
14072 dtrace_sync();
14073
14074 now = dtrace_gethrtime();
14075
14076 if (state != dtrace_anon.dta_state &&
14077 now - state->dts_laststatus >= dtrace_deadman_user)
14078 return;
14079
14080 /*
14081 * We must be sure that dts_alive never appears to be less than the
14082 * value upon entry to dtrace_state_deadman(), and because we lack a
14083 * dtrace_cas64(), we cannot store to it atomically. We thus instead
14084 * store INT64_MAX to it, followed by a memory barrier, followed by
14085 * the new value. This assures that dts_alive never appears to be
14086 * less than its true value, regardless of the order in which the
14087 * stores to the underlying storage are issued.
14088 */
14089 state->dts_alive = INT64_MAX;
14090 dtrace_membar_producer();
14091 state->dts_alive = now;
14092 }
14093
14094 static int
dtrace_state_create(dev_t * devp,cred_t * cr,dtrace_state_t ** new_state)14095 dtrace_state_create(dev_t *devp, cred_t *cr, dtrace_state_t **new_state)
14096 {
14097 minor_t minor;
14098 major_t major;
14099 char c[30];
14100 dtrace_state_t *state;
14101 dtrace_optval_t *opt;
14102 int bufsize = (int)NCPU * sizeof (dtrace_buffer_t), i;
14103 unsigned int cpu_it;
14104
14105 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14106 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14107
14108 /* Cause restart */
14109 *new_state = NULL;
14110
14111 if (devp != NULL) {
14112 minor = getminor(*devp);
14113 }
14114 else {
14115 minor = DTRACE_NCLIENTS - 1;
14116 }
14117
14118 state = dtrace_state_allocate(minor);
14119 if (NULL == state) {
14120 printf("dtrace_open: couldn't acquire minor number %d. This usually means that too many DTrace clients are in use at the moment", minor);
14121 return (ERESTART); /* can't reacquire */
14122 }
14123
14124 state->dts_epid = DTRACE_EPIDNONE + 1;
14125
14126 (void) snprintf(c, sizeof (c), "dtrace_aggid_%d", minor);
14127 state->dts_aggid_arena = vmem_create(c, (void *)1, INT32_MAX, 1,
14128 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
14129
14130 if (devp != NULL) {
14131 major = getemajor(*devp);
14132 } else {
14133 major = ddi_driver_major(dtrace_devi);
14134 }
14135
14136 state->dts_dev = makedev(major, minor);
14137
14138 if (devp != NULL)
14139 *devp = state->dts_dev;
14140
14141 /*
14142 * We allocate NCPU buffers. On the one hand, this can be quite
14143 * a bit of memory per instance (nearly 36K on a Starcat). On the
14144 * other hand, it saves an additional memory reference in the probe
14145 * path.
14146 */
14147 state->dts_buffer = kmem_zalloc(bufsize, KM_SLEEP);
14148 state->dts_aggbuffer = kmem_zalloc(bufsize, KM_SLEEP);
14149 state->dts_buf_over_limit = 0;
14150
14151 /*
14152 * Allocate and initialise the per-process per-CPU random state.
14153 * SI_SUB_RANDOM < SI_SUB_DTRACE_ANON therefore entropy device is
14154 * assumed to be seeded at this point (if from Fortuna seed file).
14155 */
14156 state->dts_rstate = kmem_zalloc(NCPU * sizeof(uint64_t*), KM_SLEEP);
14157 state->dts_rstate[0] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14158 (void) read_random(state->dts_rstate[0], 2 * sizeof(uint64_t));
14159 for (cpu_it = 1; cpu_it < NCPU; cpu_it++) {
14160 state->dts_rstate[cpu_it] = kmem_zalloc(2 * sizeof(uint64_t), KM_SLEEP);
14161 /*
14162 * Each CPU is assigned a 2^64 period, non-overlapping
14163 * subsequence.
14164 */
14165 dtrace_xoroshiro128_plus_jump(state->dts_rstate[cpu_it-1],
14166 state->dts_rstate[cpu_it]);
14167 }
14168
14169 state->dts_cleaner = CYCLIC_NONE;
14170 state->dts_deadman = CYCLIC_NONE;
14171 state->dts_vstate.dtvs_state = state;
14172
14173 for (i = 0; i < DTRACEOPT_MAX; i++)
14174 state->dts_options[i] = DTRACEOPT_UNSET;
14175
14176 /*
14177 * Set the default options.
14178 */
14179 opt = state->dts_options;
14180 opt[DTRACEOPT_BUFPOLICY] = DTRACEOPT_BUFPOLICY_SWITCH;
14181 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_AUTO;
14182 opt[DTRACEOPT_NSPEC] = dtrace_nspec_default;
14183 opt[DTRACEOPT_SPECSIZE] = dtrace_specsize_default;
14184 opt[DTRACEOPT_CPU] = (dtrace_optval_t)DTRACE_CPUALL;
14185 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_default;
14186 opt[DTRACEOPT_STACKFRAMES] = dtrace_stackframes_default;
14187 opt[DTRACEOPT_USTACKFRAMES] = dtrace_ustackframes_default;
14188 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_default;
14189 opt[DTRACEOPT_AGGRATE] = dtrace_aggrate_default;
14190 opt[DTRACEOPT_SWITCHRATE] = dtrace_switchrate_default;
14191 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_default;
14192 opt[DTRACEOPT_JSTACKFRAMES] = dtrace_jstackframes_default;
14193 opt[DTRACEOPT_JSTACKSTRSIZE] = dtrace_jstackstrsize_default;
14194 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_default;
14195
14196 /*
14197 * Depending on the user credentials, we set flag bits which alter probe
14198 * visibility or the amount of destructiveness allowed. In the case of
14199 * actual anonymous tracing, or the possession of all privileges, all of
14200 * the normal checks are bypassed.
14201 */
14202 #if defined(__APPLE__)
14203 if (cr != NULL) {
14204 kauth_cred_ref(cr);
14205 state->dts_cred.dcr_cred = cr;
14206 }
14207 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14208 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14209 /*
14210 * Allow only proc credentials when DTrace is
14211 * restricted by the current security policy
14212 */
14213 state->dts_cred.dcr_visible = DTRACE_CRV_ALLPROC;
14214 state->dts_cred.dcr_action = DTRACE_CRA_PROC | DTRACE_CRA_PROC_CONTROL | DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14215 }
14216 else {
14217 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14218 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14219 }
14220 }
14221
14222 #else
14223 if (cr == NULL || PRIV_POLICY_ONLY(cr, PRIV_ALL, B_FALSE)) {
14224 state->dts_cred.dcr_visible = DTRACE_CRV_ALL;
14225 state->dts_cred.dcr_action = DTRACE_CRA_ALL;
14226 }
14227 else {
14228 /*
14229 * Set up the credentials for this instantiation. We take a
14230 * hold on the credential to prevent it from disappearing on
14231 * us; this in turn prevents the zone_t referenced by this
14232 * credential from disappearing. This means that we can
14233 * examine the credential and the zone from probe context.
14234 */
14235 crhold(cr);
14236 state->dts_cred.dcr_cred = cr;
14237
14238 /*
14239 * CRA_PROC means "we have *some* privilege for dtrace" and
14240 * unlocks the use of variables like pid, zonename, etc.
14241 */
14242 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE) ||
14243 PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14244 state->dts_cred.dcr_action |= DTRACE_CRA_PROC;
14245 }
14246
14247 /*
14248 * dtrace_user allows use of syscall and profile providers.
14249 * If the user also has proc_owner and/or proc_zone, we
14250 * extend the scope to include additional visibility and
14251 * destructive power.
14252 */
14253 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_USER, B_FALSE)) {
14254 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE)) {
14255 state->dts_cred.dcr_visible |=
14256 DTRACE_CRV_ALLPROC;
14257
14258 state->dts_cred.dcr_action |=
14259 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14260 }
14261
14262 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE)) {
14263 state->dts_cred.dcr_visible |=
14264 DTRACE_CRV_ALLZONE;
14265
14266 state->dts_cred.dcr_action |=
14267 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14268 }
14269
14270 /*
14271 * If we have all privs in whatever zone this is,
14272 * we can do destructive things to processes which
14273 * have altered credentials.
14274 *
14275 * APPLE NOTE: Darwin doesn't do zones.
14276 * Behave as if zone always has destructive privs.
14277 */
14278
14279 state->dts_cred.dcr_action |=
14280 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14281 }
14282
14283 /*
14284 * Holding the dtrace_kernel privilege also implies that
14285 * the user has the dtrace_user privilege from a visibility
14286 * perspective. But without further privileges, some
14287 * destructive actions are not available.
14288 */
14289 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_KERNEL, B_FALSE)) {
14290 /*
14291 * Make all probes in all zones visible. However,
14292 * this doesn't mean that all actions become available
14293 * to all zones.
14294 */
14295 state->dts_cred.dcr_visible |= DTRACE_CRV_KERNEL |
14296 DTRACE_CRV_ALLPROC | DTRACE_CRV_ALLZONE;
14297
14298 state->dts_cred.dcr_action |= DTRACE_CRA_KERNEL |
14299 DTRACE_CRA_PROC;
14300 /*
14301 * Holding proc_owner means that destructive actions
14302 * for *this* zone are allowed.
14303 */
14304 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14305 state->dts_cred.dcr_action |=
14306 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14307
14308 /*
14309 * Holding proc_zone means that destructive actions
14310 * for this user/group ID in all zones is allowed.
14311 */
14312 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14313 state->dts_cred.dcr_action |=
14314 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14315
14316 /*
14317 * If we have all privs in whatever zone this is,
14318 * we can do destructive things to processes which
14319 * have altered credentials.
14320 *
14321 * APPLE NOTE: Darwin doesn't do zones.
14322 * Behave as if zone always has destructive privs.
14323 */
14324 state->dts_cred.dcr_action |=
14325 DTRACE_CRA_PROC_DESTRUCTIVE_CREDCHG;
14326 }
14327
14328 /*
14329 * Holding the dtrace_proc privilege gives control over fasttrap
14330 * and pid providers. We need to grant wider destructive
14331 * privileges in the event that the user has proc_owner and/or
14332 * proc_zone.
14333 */
14334 if (PRIV_POLICY_ONLY(cr, PRIV_DTRACE_PROC, B_FALSE)) {
14335 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_OWNER, B_FALSE))
14336 state->dts_cred.dcr_action |=
14337 DTRACE_CRA_PROC_DESTRUCTIVE_ALLUSER;
14338
14339 if (PRIV_POLICY_ONLY(cr, PRIV_PROC_ZONE, B_FALSE))
14340 state->dts_cred.dcr_action |=
14341 DTRACE_CRA_PROC_DESTRUCTIVE_ALLZONE;
14342 }
14343 }
14344 #endif
14345
14346 *new_state = state;
14347 return(0); /* Success */
14348 }
14349
14350 static int
dtrace_state_buffer(dtrace_state_t * state,dtrace_buffer_t * buf,int which)14351 dtrace_state_buffer(dtrace_state_t *state, dtrace_buffer_t *buf, int which)
14352 {
14353 dtrace_optval_t *opt = state->dts_options, size;
14354 processorid_t cpu = 0;
14355 size_t limit = buf->dtb_size;
14356 int flags = 0, rval;
14357
14358 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14359 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14360 ASSERT(which < DTRACEOPT_MAX);
14361 ASSERT(state->dts_activity == DTRACE_ACTIVITY_INACTIVE ||
14362 (state == dtrace_anon.dta_state &&
14363 state->dts_activity == DTRACE_ACTIVITY_ACTIVE));
14364
14365 if (opt[which] == DTRACEOPT_UNSET || opt[which] == 0)
14366 return (0);
14367
14368 if (opt[DTRACEOPT_CPU] != DTRACEOPT_UNSET)
14369 cpu = opt[DTRACEOPT_CPU];
14370
14371 if (which == DTRACEOPT_SPECSIZE)
14372 flags |= DTRACEBUF_NOSWITCH;
14373
14374 if (which == DTRACEOPT_BUFSIZE) {
14375 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_RING)
14376 flags |= DTRACEBUF_RING;
14377
14378 if (opt[DTRACEOPT_BUFPOLICY] == DTRACEOPT_BUFPOLICY_FILL)
14379 flags |= DTRACEBUF_FILL;
14380
14381 if (state != dtrace_anon.dta_state ||
14382 state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
14383 flags |= DTRACEBUF_INACTIVE;
14384 }
14385
14386 for (size = opt[which]; (size_t)size >= sizeof (uint64_t); size >>= 1) {
14387 /*
14388 * The size must be 8-byte aligned. If the size is not 8-byte
14389 * aligned, drop it down by the difference.
14390 */
14391 if (size & (sizeof (uint64_t) - 1))
14392 size -= size & (sizeof (uint64_t) - 1);
14393
14394 if (size < state->dts_reserve) {
14395 /*
14396 * Buffers always must be large enough to accommodate
14397 * their prereserved space. We return E2BIG instead
14398 * of ENOMEM in this case to allow for user-level
14399 * software to differentiate the cases.
14400 */
14401 return (E2BIG);
14402 }
14403 limit = opt[DTRACEOPT_BUFLIMIT] * size / 100;
14404 rval = dtrace_buffer_alloc(buf, limit, size, flags, cpu);
14405
14406 if (rval != ENOMEM) {
14407 opt[which] = size;
14408 return (rval);
14409 }
14410
14411 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14412 return (rval);
14413 }
14414
14415 return (ENOMEM);
14416 }
14417
14418 static int
dtrace_state_buffers(dtrace_state_t * state)14419 dtrace_state_buffers(dtrace_state_t *state)
14420 {
14421 dtrace_speculation_t *spec = state->dts_speculations;
14422 int rval, i;
14423
14424 if ((rval = dtrace_state_buffer(state, state->dts_buffer,
14425 DTRACEOPT_BUFSIZE)) != 0)
14426 return (rval);
14427
14428 if ((rval = dtrace_state_buffer(state, state->dts_aggbuffer,
14429 DTRACEOPT_AGGSIZE)) != 0)
14430 return (rval);
14431
14432 for (i = 0; i < state->dts_nspeculations; i++) {
14433 if ((rval = dtrace_state_buffer(state,
14434 spec[i].dtsp_buffer, DTRACEOPT_SPECSIZE)) != 0)
14435 return (rval);
14436 }
14437
14438 return (0);
14439 }
14440
14441 static void
dtrace_state_prereserve(dtrace_state_t * state)14442 dtrace_state_prereserve(dtrace_state_t *state)
14443 {
14444 dtrace_ecb_t *ecb;
14445 dtrace_probe_t *probe;
14446
14447 state->dts_reserve = 0;
14448
14449 if (state->dts_options[DTRACEOPT_BUFPOLICY] != DTRACEOPT_BUFPOLICY_FILL)
14450 return;
14451
14452 /*
14453 * If our buffer policy is a "fill" buffer policy, we need to set the
14454 * prereserved space to be the space required by the END probes.
14455 */
14456 probe = dtrace_probes[dtrace_probeid_end - 1];
14457 ASSERT(probe != NULL);
14458
14459 for (ecb = probe->dtpr_ecb; ecb != NULL; ecb = ecb->dte_next) {
14460 if (ecb->dte_state != state)
14461 continue;
14462
14463 state->dts_reserve += ecb->dte_needed + ecb->dte_alignment;
14464 }
14465 }
14466
14467 static int
dtrace_state_go(dtrace_state_t * state,processorid_t * cpu)14468 dtrace_state_go(dtrace_state_t *state, processorid_t *cpu)
14469 {
14470 dtrace_optval_t *opt = state->dts_options, sz, nspec;
14471 dtrace_speculation_t *spec;
14472 dtrace_buffer_t *buf;
14473 cyc_handler_t hdlr;
14474 cyc_time_t when;
14475 int rval = 0, i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14476 dtrace_icookie_t cookie;
14477
14478 lck_mtx_lock(&cpu_lock);
14479 lck_mtx_lock(&dtrace_lock);
14480
14481 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
14482 rval = EBUSY;
14483 goto out;
14484 }
14485
14486 /*
14487 * Before we can perform any checks, we must prime all of the
14488 * retained enablings that correspond to this state.
14489 */
14490 dtrace_enabling_prime(state);
14491
14492 if (state->dts_destructive && !state->dts_cred.dcr_destructive) {
14493 rval = EACCES;
14494 goto out;
14495 }
14496
14497 dtrace_state_prereserve(state);
14498
14499 /*
14500 * Now we want to do is try to allocate our speculations.
14501 * We do not automatically resize the number of speculations; if
14502 * this fails, we will fail the operation.
14503 */
14504 nspec = opt[DTRACEOPT_NSPEC];
14505 ASSERT(nspec != DTRACEOPT_UNSET);
14506
14507 if (nspec > INT_MAX) {
14508 rval = ENOMEM;
14509 goto out;
14510 }
14511
14512 spec = kmem_zalloc(nspec * sizeof (dtrace_speculation_t), KM_NOSLEEP);
14513
14514 if (spec == NULL) {
14515 rval = ENOMEM;
14516 goto out;
14517 }
14518
14519 state->dts_speculations = spec;
14520 state->dts_nspeculations = (int)nspec;
14521
14522 for (i = 0; i < nspec; i++) {
14523 if ((buf = kmem_zalloc(bufsize, KM_NOSLEEP)) == NULL) {
14524 rval = ENOMEM;
14525 goto err;
14526 }
14527
14528 spec[i].dtsp_buffer = buf;
14529 }
14530
14531 if (opt[DTRACEOPT_GRABANON] != DTRACEOPT_UNSET) {
14532 if (dtrace_anon.dta_state == NULL) {
14533 rval = ENOENT;
14534 goto out;
14535 }
14536
14537 if (state->dts_necbs != 0) {
14538 rval = EALREADY;
14539 goto out;
14540 }
14541
14542 state->dts_anon = dtrace_anon_grab();
14543 ASSERT(state->dts_anon != NULL);
14544 state = state->dts_anon;
14545
14546 /*
14547 * We want "grabanon" to be set in the grabbed state, so we'll
14548 * copy that option value from the grabbing state into the
14549 * grabbed state.
14550 */
14551 state->dts_options[DTRACEOPT_GRABANON] =
14552 opt[DTRACEOPT_GRABANON];
14553
14554 *cpu = dtrace_anon.dta_beganon;
14555
14556 /*
14557 * If the anonymous state is active (as it almost certainly
14558 * is if the anonymous enabling ultimately matched anything),
14559 * we don't allow any further option processing -- but we
14560 * don't return failure.
14561 */
14562 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14563 goto out;
14564 }
14565
14566 if (opt[DTRACEOPT_AGGSIZE] != DTRACEOPT_UNSET &&
14567 opt[DTRACEOPT_AGGSIZE] != 0) {
14568 if (state->dts_aggregations == NULL) {
14569 /*
14570 * We're not going to create an aggregation buffer
14571 * because we don't have any ECBs that contain
14572 * aggregations -- set this option to 0.
14573 */
14574 opt[DTRACEOPT_AGGSIZE] = 0;
14575 } else {
14576 /*
14577 * If we have an aggregation buffer, we must also have
14578 * a buffer to use as scratch.
14579 */
14580 if (opt[DTRACEOPT_BUFSIZE] == DTRACEOPT_UNSET ||
14581 (size_t)opt[DTRACEOPT_BUFSIZE] < state->dts_needed) {
14582 opt[DTRACEOPT_BUFSIZE] = state->dts_needed;
14583 }
14584 }
14585 }
14586
14587 if (opt[DTRACEOPT_SPECSIZE] != DTRACEOPT_UNSET &&
14588 opt[DTRACEOPT_SPECSIZE] != 0) {
14589 if (!state->dts_speculates) {
14590 /*
14591 * We're not going to create speculation buffers
14592 * because we don't have any ECBs that actually
14593 * speculate -- set the speculation size to 0.
14594 */
14595 opt[DTRACEOPT_SPECSIZE] = 0;
14596 }
14597 }
14598
14599 /*
14600 * The bare minimum size for any buffer that we're actually going to
14601 * do anything to is sizeof (uint64_t).
14602 */
14603 sz = sizeof (uint64_t);
14604
14605 if ((state->dts_needed != 0 && opt[DTRACEOPT_BUFSIZE] < sz) ||
14606 (state->dts_speculates && opt[DTRACEOPT_SPECSIZE] < sz) ||
14607 (state->dts_aggregations != NULL && opt[DTRACEOPT_AGGSIZE] < sz)) {
14608 /*
14609 * A buffer size has been explicitly set to 0 (or to a size
14610 * that will be adjusted to 0) and we need the space -- we
14611 * need to return failure. We return ENOSPC to differentiate
14612 * it from failing to allocate a buffer due to failure to meet
14613 * the reserve (for which we return E2BIG).
14614 */
14615 rval = ENOSPC;
14616 goto out;
14617 }
14618
14619 if ((rval = dtrace_state_buffers(state)) != 0)
14620 goto err;
14621
14622 if ((sz = opt[DTRACEOPT_DYNVARSIZE]) == DTRACEOPT_UNSET)
14623 sz = dtrace_dstate_defsize;
14624
14625 do {
14626 rval = dtrace_dstate_init(&state->dts_vstate.dtvs_dynvars, sz);
14627
14628 if (rval == 0)
14629 break;
14630
14631 if (opt[DTRACEOPT_BUFRESIZE] == DTRACEOPT_BUFRESIZE_MANUAL)
14632 goto err;
14633 } while (sz >>= 1);
14634
14635 opt[DTRACEOPT_DYNVARSIZE] = sz;
14636
14637 if (rval != 0)
14638 goto err;
14639
14640 if (opt[DTRACEOPT_STATUSRATE] > dtrace_statusrate_max)
14641 opt[DTRACEOPT_STATUSRATE] = dtrace_statusrate_max;
14642
14643 if (opt[DTRACEOPT_CLEANRATE] == 0)
14644 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14645
14646 if (opt[DTRACEOPT_CLEANRATE] < dtrace_cleanrate_min)
14647 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_min;
14648
14649 if (opt[DTRACEOPT_CLEANRATE] > dtrace_cleanrate_max)
14650 opt[DTRACEOPT_CLEANRATE] = dtrace_cleanrate_max;
14651
14652 if (opt[DTRACEOPT_STRSIZE] > dtrace_strsize_max)
14653 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_max;
14654
14655 if (opt[DTRACEOPT_STRSIZE] < dtrace_strsize_min)
14656 opt[DTRACEOPT_STRSIZE] = dtrace_strsize_min;
14657
14658 if (opt[DTRACEOPT_BUFLIMIT] > dtrace_buflimit_max)
14659 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_max;
14660
14661 if (opt[DTRACEOPT_BUFLIMIT] < dtrace_buflimit_min)
14662 opt[DTRACEOPT_BUFLIMIT] = dtrace_buflimit_min;
14663
14664 hdlr.cyh_func = (cyc_func_t)dtrace_state_clean;
14665 hdlr.cyh_arg = state;
14666 hdlr.cyh_level = CY_LOW_LEVEL;
14667
14668 when.cyt_when = 0;
14669 when.cyt_interval = opt[DTRACEOPT_CLEANRATE];
14670
14671 state->dts_cleaner = cyclic_add(&hdlr, &when);
14672
14673 hdlr.cyh_func = (cyc_func_t)dtrace_state_deadman;
14674 hdlr.cyh_arg = state;
14675 hdlr.cyh_level = CY_LOW_LEVEL;
14676
14677 when.cyt_when = 0;
14678 when.cyt_interval = dtrace_deadman_interval;
14679
14680 state->dts_alive = state->dts_laststatus = dtrace_gethrtime();
14681 state->dts_deadman = cyclic_add(&hdlr, &when);
14682
14683 state->dts_activity = DTRACE_ACTIVITY_WARMUP;
14684
14685 /*
14686 * Now it's time to actually fire the BEGIN probe. We need to disable
14687 * interrupts here both to record the CPU on which we fired the BEGIN
14688 * probe (the data from this CPU will be processed first at user
14689 * level) and to manually activate the buffer for this CPU.
14690 */
14691 cookie = dtrace_interrupt_disable();
14692 *cpu = CPU->cpu_id;
14693 ASSERT(state->dts_buffer[*cpu].dtb_flags & DTRACEBUF_INACTIVE);
14694 state->dts_buffer[*cpu].dtb_flags &= ~DTRACEBUF_INACTIVE;
14695
14696 dtrace_probe(dtrace_probeid_begin,
14697 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14698 dtrace_interrupt_enable(cookie);
14699 /*
14700 * We may have had an exit action from a BEGIN probe; only change our
14701 * state to ACTIVE if we're still in WARMUP.
14702 */
14703 ASSERT(state->dts_activity == DTRACE_ACTIVITY_WARMUP ||
14704 state->dts_activity == DTRACE_ACTIVITY_DRAINING);
14705
14706 if (state->dts_activity == DTRACE_ACTIVITY_WARMUP)
14707 state->dts_activity = DTRACE_ACTIVITY_ACTIVE;
14708
14709 /*
14710 * Regardless of whether or not now we're in ACTIVE or DRAINING, we
14711 * want each CPU to transition its principal buffer out of the
14712 * INACTIVE state. Doing this assures that no CPU will suddenly begin
14713 * processing an ECB halfway down a probe's ECB chain; all CPUs will
14714 * atomically transition from processing none of a state's ECBs to
14715 * processing all of them.
14716 */
14717 dtrace_xcall(DTRACE_CPUALL,
14718 (dtrace_xcall_t)dtrace_buffer_activate, state);
14719 goto out;
14720
14721 err:
14722 dtrace_buffer_free(state->dts_buffer);
14723 dtrace_buffer_free(state->dts_aggbuffer);
14724
14725 if ((nspec = state->dts_nspeculations) == 0) {
14726 ASSERT(state->dts_speculations == NULL);
14727 goto out;
14728 }
14729
14730 spec = state->dts_speculations;
14731 ASSERT(spec != NULL);
14732
14733 for (i = 0; i < state->dts_nspeculations; i++) {
14734 if ((buf = spec[i].dtsp_buffer) == NULL)
14735 break;
14736
14737 dtrace_buffer_free(buf);
14738 kmem_free(buf, bufsize);
14739 }
14740
14741 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14742 state->dts_nspeculations = 0;
14743 state->dts_speculations = NULL;
14744
14745 out:
14746 lck_mtx_unlock(&dtrace_lock);
14747 lck_mtx_unlock(&cpu_lock);
14748
14749 return (rval);
14750 }
14751
14752 static int
dtrace_state_stop(dtrace_state_t * state,processorid_t * cpu)14753 dtrace_state_stop(dtrace_state_t *state, processorid_t *cpu)
14754 {
14755 dtrace_icookie_t cookie;
14756
14757 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14758
14759 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE &&
14760 state->dts_activity != DTRACE_ACTIVITY_DRAINING)
14761 return (EINVAL);
14762
14763 /*
14764 * We'll set the activity to DTRACE_ACTIVITY_DRAINING, and issue a sync
14765 * to be sure that every CPU has seen it. See below for the details
14766 * on why this is done.
14767 */
14768 state->dts_activity = DTRACE_ACTIVITY_DRAINING;
14769 dtrace_sync();
14770
14771 /*
14772 * By this point, it is impossible for any CPU to be still processing
14773 * with DTRACE_ACTIVITY_ACTIVE. We can thus set our activity to
14774 * DTRACE_ACTIVITY_COOLDOWN and know that we're not racing with any
14775 * other CPU in dtrace_buffer_reserve(). This allows dtrace_probe()
14776 * and callees to know that the activity is DTRACE_ACTIVITY_COOLDOWN
14777 * iff we're in the END probe.
14778 */
14779 state->dts_activity = DTRACE_ACTIVITY_COOLDOWN;
14780 dtrace_sync();
14781 ASSERT(state->dts_activity == DTRACE_ACTIVITY_COOLDOWN);
14782
14783 /*
14784 * Finally, we can release the reserve and call the END probe. We
14785 * disable interrupts across calling the END probe to allow us to
14786 * return the CPU on which we actually called the END probe. This
14787 * allows user-land to be sure that this CPU's principal buffer is
14788 * processed last.
14789 */
14790 state->dts_reserve = 0;
14791
14792 cookie = dtrace_interrupt_disable();
14793 *cpu = CPU->cpu_id;
14794 dtrace_probe(dtrace_probeid_end,
14795 (uint64_t)(uintptr_t)state, 0, 0, 0, 0);
14796 dtrace_interrupt_enable(cookie);
14797
14798 state->dts_activity = DTRACE_ACTIVITY_STOPPED;
14799 dtrace_sync();
14800
14801 return (0);
14802 }
14803
14804 static int
dtrace_state_option(dtrace_state_t * state,dtrace_optid_t option,dtrace_optval_t val)14805 dtrace_state_option(dtrace_state_t *state, dtrace_optid_t option,
14806 dtrace_optval_t val)
14807 {
14808 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14809
14810 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE)
14811 return (EBUSY);
14812
14813 if (option >= DTRACEOPT_MAX)
14814 return (EINVAL);
14815
14816 if (option != DTRACEOPT_CPU && val < 0)
14817 return (EINVAL);
14818
14819 switch (option) {
14820 case DTRACEOPT_DESTRUCTIVE:
14821 if (dtrace_destructive_disallow)
14822 return (EACCES);
14823
14824 state->dts_cred.dcr_destructive = 1;
14825 break;
14826
14827 case DTRACEOPT_BUFSIZE:
14828 case DTRACEOPT_DYNVARSIZE:
14829 case DTRACEOPT_AGGSIZE:
14830 case DTRACEOPT_SPECSIZE:
14831 case DTRACEOPT_STRSIZE:
14832 if (val < 0)
14833 return (EINVAL);
14834
14835 if (val >= LONG_MAX) {
14836 /*
14837 * If this is an otherwise negative value, set it to
14838 * the highest multiple of 128m less than LONG_MAX.
14839 * Technically, we're adjusting the size without
14840 * regard to the buffer resizing policy, but in fact,
14841 * this has no effect -- if we set the buffer size to
14842 * ~LONG_MAX and the buffer policy is ultimately set to
14843 * be "manual", the buffer allocation is guaranteed to
14844 * fail, if only because the allocation requires two
14845 * buffers. (We set the the size to the highest
14846 * multiple of 128m because it ensures that the size
14847 * will remain a multiple of a megabyte when
14848 * repeatedly halved -- all the way down to 15m.)
14849 */
14850 val = LONG_MAX - (1 << 27) + 1;
14851 }
14852 }
14853
14854 state->dts_options[option] = val;
14855
14856 return (0);
14857 }
14858
14859 static void
dtrace_state_destroy(dtrace_state_t * state)14860 dtrace_state_destroy(dtrace_state_t *state)
14861 {
14862 dtrace_ecb_t *ecb;
14863 dtrace_vstate_t *vstate = &state->dts_vstate;
14864 minor_t minor = getminor(state->dts_dev);
14865 int i, bufsize = (int)NCPU * sizeof (dtrace_buffer_t);
14866 dtrace_speculation_t *spec = state->dts_speculations;
14867 int nspec = state->dts_nspeculations;
14868 uint32_t match;
14869
14870 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
14871 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
14872
14873 /*
14874 * First, retract any retained enablings for this state.
14875 */
14876 dtrace_enabling_retract(state);
14877 ASSERT(state->dts_nretained == 0);
14878
14879 if (state->dts_activity == DTRACE_ACTIVITY_ACTIVE ||
14880 state->dts_activity == DTRACE_ACTIVITY_DRAINING) {
14881 /*
14882 * We have managed to come into dtrace_state_destroy() on a
14883 * hot enabling -- almost certainly because of a disorderly
14884 * shutdown of a consumer. (That is, a consumer that is
14885 * exiting without having called dtrace_stop().) In this case,
14886 * we're going to set our activity to be KILLED, and then
14887 * issue a sync to be sure that everyone is out of probe
14888 * context before we start blowing away ECBs.
14889 */
14890 state->dts_activity = DTRACE_ACTIVITY_KILLED;
14891 dtrace_sync();
14892 }
14893
14894 /*
14895 * Release the credential hold we took in dtrace_state_create().
14896 */
14897 if (state->dts_cred.dcr_cred != NULL)
14898 kauth_cred_unref(&state->dts_cred.dcr_cred);
14899
14900 /*
14901 * Now we can safely disable and destroy any enabled probes. Because
14902 * any DTRACE_PRIV_KERNEL probes may actually be slowing our progress
14903 * (especially if they're all enabled), we take two passes through the
14904 * ECBs: in the first, we disable just DTRACE_PRIV_KERNEL probes, and
14905 * in the second we disable whatever is left over.
14906 */
14907 for (match = DTRACE_PRIV_KERNEL; ; match = 0) {
14908 for (i = 0; i < state->dts_necbs; i++) {
14909 if ((ecb = state->dts_ecbs[i]) == NULL)
14910 continue;
14911
14912 if (match && ecb->dte_probe != NULL) {
14913 dtrace_probe_t *probe = ecb->dte_probe;
14914 dtrace_provider_t *prov = probe->dtpr_provider;
14915
14916 if (!(prov->dtpv_priv.dtpp_flags & match))
14917 continue;
14918 }
14919
14920 dtrace_ecb_disable(ecb);
14921 dtrace_ecb_destroy(ecb);
14922 }
14923
14924 if (!match)
14925 break;
14926 }
14927
14928 /*
14929 * Before we free the buffers, perform one more sync to assure that
14930 * every CPU is out of probe context.
14931 */
14932 dtrace_sync();
14933
14934 dtrace_buffer_free(state->dts_buffer);
14935 dtrace_buffer_free(state->dts_aggbuffer);
14936
14937 for (i = 0; i < (int)NCPU; i++) {
14938 kmem_free(state->dts_rstate[i], 2 * sizeof(uint64_t));
14939 }
14940 kmem_free(state->dts_rstate, NCPU * sizeof(uint64_t*));
14941
14942 for (i = 0; i < nspec; i++)
14943 dtrace_buffer_free(spec[i].dtsp_buffer);
14944
14945 if (state->dts_cleaner != CYCLIC_NONE)
14946 cyclic_remove(state->dts_cleaner);
14947
14948 if (state->dts_deadman != CYCLIC_NONE)
14949 cyclic_remove(state->dts_deadman);
14950
14951 dtrace_dstate_fini(&vstate->dtvs_dynvars);
14952 dtrace_vstate_fini(vstate);
14953 kmem_free(state->dts_ecbs, state->dts_necbs * sizeof (dtrace_ecb_t *));
14954
14955 if (state->dts_aggregations != NULL) {
14956 #if DEBUG
14957 for (i = 0; i < state->dts_naggregations; i++)
14958 ASSERT(state->dts_aggregations[i] == NULL);
14959 #endif
14960 ASSERT(state->dts_naggregations > 0);
14961 kmem_free(state->dts_aggregations,
14962 state->dts_naggregations * sizeof (dtrace_aggregation_t *));
14963 }
14964
14965 kmem_free(state->dts_buffer, bufsize);
14966 kmem_free(state->dts_aggbuffer, bufsize);
14967
14968 for (i = 0; i < nspec; i++)
14969 kmem_free(spec[i].dtsp_buffer, bufsize);
14970
14971 kmem_free(spec, nspec * sizeof (dtrace_speculation_t));
14972
14973 dtrace_format_destroy(state);
14974
14975 vmem_destroy(state->dts_aggid_arena);
14976 dtrace_state_free(minor);
14977 }
14978
14979 /*
14980 * DTrace Anonymous Enabling Functions
14981 */
14982
14983 int
dtrace_keep_kernel_symbols(void)14984 dtrace_keep_kernel_symbols(void)
14985 {
14986 if (dtrace_is_restricted() && !dtrace_are_restrictions_relaxed()) {
14987 return 0;
14988 }
14989
14990 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL)
14991 return 1;
14992
14993 return 0;
14994 }
14995
14996 static dtrace_state_t *
dtrace_anon_grab(void)14997 dtrace_anon_grab(void)
14998 {
14999 dtrace_state_t *state;
15000
15001 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15002
15003 if ((state = dtrace_anon.dta_state) == NULL) {
15004 ASSERT(dtrace_anon.dta_enabling == NULL);
15005 return (NULL);
15006 }
15007
15008 ASSERT(dtrace_anon.dta_enabling != NULL);
15009 ASSERT(dtrace_retained != NULL);
15010
15011 dtrace_enabling_destroy(dtrace_anon.dta_enabling);
15012 dtrace_anon.dta_enabling = NULL;
15013 dtrace_anon.dta_state = NULL;
15014
15015 return (state);
15016 }
15017
15018 static void
dtrace_anon_property(void)15019 dtrace_anon_property(void)
15020 {
15021 int i, rv;
15022 dtrace_state_t *state;
15023 dof_hdr_t *dof;
15024 char c[32]; /* enough for "dof-data-" + digits */
15025
15026 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15027 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
15028
15029 for (i = 0; ; i++) {
15030 (void) snprintf(c, sizeof (c), "dof-data-%d", i);
15031
15032 dtrace_err_verbose = 1;
15033
15034 if ((dof = dtrace_dof_property(c)) == NULL) {
15035 dtrace_err_verbose = 0;
15036 break;
15037 }
15038
15039 #ifdef illumos
15040 /*
15041 * We want to create anonymous state, so we need to transition
15042 * the kernel debugger to indicate that DTrace is active. If
15043 * this fails (e.g. because the debugger has modified text in
15044 * some way), we won't continue with the processing.
15045 */
15046 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
15047 cmn_err(CE_NOTE, "kernel debugger active; anonymous "
15048 "enabling ignored.");
15049 dtrace_dof_destroy(dof);
15050 break;
15051 }
15052 #endif
15053
15054 /*
15055 * If we haven't allocated an anonymous state, we'll do so now.
15056 */
15057 if ((state = dtrace_anon.dta_state) == NULL) {
15058 rv = dtrace_state_create(NULL, NULL, &state);
15059 dtrace_anon.dta_state = state;
15060 if (rv != 0 || state == NULL) {
15061 /*
15062 * This basically shouldn't happen: the only
15063 * failure mode from dtrace_state_create() is a
15064 * failure of ddi_soft_state_zalloc() that
15065 * itself should never happen. Still, the
15066 * interface allows for a failure mode, and
15067 * we want to fail as gracefully as possible:
15068 * we'll emit an error message and cease
15069 * processing anonymous state in this case.
15070 */
15071 cmn_err(CE_WARN, "failed to create "
15072 "anonymous state");
15073 dtrace_dof_destroy(dof);
15074 break;
15075 }
15076 }
15077
15078 rv = dtrace_dof_slurp(dof, &state->dts_vstate, CRED(),
15079 &dtrace_anon.dta_enabling, 0, B_TRUE);
15080
15081 if (rv == 0)
15082 rv = dtrace_dof_options(dof, state);
15083
15084 dtrace_err_verbose = 0;
15085 dtrace_dof_destroy(dof);
15086
15087 if (rv != 0) {
15088 /*
15089 * This is malformed DOF; chuck any anonymous state
15090 * that we created.
15091 */
15092 ASSERT(dtrace_anon.dta_enabling == NULL);
15093 dtrace_state_destroy(state);
15094 dtrace_anon.dta_state = NULL;
15095 break;
15096 }
15097
15098 ASSERT(dtrace_anon.dta_enabling != NULL);
15099 }
15100
15101 if (dtrace_anon.dta_enabling != NULL) {
15102 int rval;
15103
15104 /*
15105 * dtrace_enabling_retain() can only fail because we are
15106 * trying to retain more enablings than are allowed -- but
15107 * we only have one anonymous enabling, and we are guaranteed
15108 * to be allowed at least one retained enabling; we assert
15109 * that dtrace_enabling_retain() returns success.
15110 */
15111 rval = dtrace_enabling_retain(dtrace_anon.dta_enabling);
15112 ASSERT(rval == 0);
15113
15114 dtrace_enabling_dump(dtrace_anon.dta_enabling);
15115 }
15116 }
15117
15118 /*
15119 * DTrace Helper Functions
15120 */
15121 static void
dtrace_helper_trace(dtrace_helper_action_t * helper,dtrace_mstate_t * mstate,dtrace_vstate_t * vstate,int where)15122 dtrace_helper_trace(dtrace_helper_action_t *helper,
15123 dtrace_mstate_t *mstate, dtrace_vstate_t *vstate, int where)
15124 {
15125 uint32_t size, next, nnext;
15126 int i;
15127 dtrace_helptrace_t *ent;
15128 uint16_t flags = cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15129
15130 if (!dtrace_helptrace_enabled)
15131 return;
15132
15133 ASSERT((uint32_t)vstate->dtvs_nlocals <= dtrace_helptrace_nlocals);
15134
15135 /*
15136 * What would a tracing framework be without its own tracing
15137 * framework? (Well, a hell of a lot simpler, for starters...)
15138 */
15139 size = sizeof (dtrace_helptrace_t) + dtrace_helptrace_nlocals *
15140 sizeof (uint64_t) - sizeof (uint64_t);
15141
15142 /*
15143 * Iterate until we can allocate a slot in the trace buffer.
15144 */
15145 do {
15146 next = dtrace_helptrace_next;
15147
15148 if (next + size < dtrace_helptrace_bufsize) {
15149 nnext = next + size;
15150 } else {
15151 nnext = size;
15152 }
15153 } while (dtrace_cas32(&dtrace_helptrace_next, next, nnext) != next);
15154
15155 /*
15156 * We have our slot; fill it in.
15157 */
15158 if (nnext == size)
15159 next = 0;
15160
15161 ent = (dtrace_helptrace_t *)&dtrace_helptrace_buffer[next];
15162 ent->dtht_helper = helper;
15163 ent->dtht_where = where;
15164 ent->dtht_nlocals = vstate->dtvs_nlocals;
15165
15166 ent->dtht_fltoffs = (mstate->dtms_present & DTRACE_MSTATE_FLTOFFS) ?
15167 mstate->dtms_fltoffs : -1;
15168 ent->dtht_fault = DTRACE_FLAGS2FLT(flags);
15169 ent->dtht_illval = cpu_core[CPU->cpu_id].cpuc_dtrace_illval;
15170
15171 for (i = 0; i < vstate->dtvs_nlocals; i++) {
15172 dtrace_statvar_t *svar;
15173
15174 if ((svar = vstate->dtvs_locals[i]) == NULL)
15175 continue;
15176
15177 ASSERT(svar->dtsv_size >= (int)NCPU * sizeof (uint64_t));
15178 ent->dtht_locals[i] =
15179 ((uint64_t *)(uintptr_t)svar->dtsv_data)[CPU->cpu_id];
15180 }
15181 }
15182
15183 __attribute__((noinline))
15184 static uint64_t
dtrace_helper(int which,dtrace_mstate_t * mstate,dtrace_state_t * state,uint64_t arg0,uint64_t arg1)15185 dtrace_helper(int which, dtrace_mstate_t *mstate,
15186 dtrace_state_t *state, uint64_t arg0, uint64_t arg1)
15187 {
15188 uint16_t *flags = &cpu_core[CPU->cpu_id].cpuc_dtrace_flags;
15189 uint64_t sarg0 = mstate->dtms_arg[0];
15190 uint64_t sarg1 = mstate->dtms_arg[1];
15191 uint64_t rval = 0;
15192 dtrace_helpers_t *helpers = curproc->p_dtrace_helpers;
15193 dtrace_helper_action_t *helper;
15194 dtrace_vstate_t *vstate;
15195 dtrace_difo_t *pred;
15196 int i, trace = dtrace_helptrace_enabled;
15197
15198 ASSERT(which >= 0 && which < DTRACE_NHELPER_ACTIONS);
15199
15200 if (helpers == NULL)
15201 return (0);
15202
15203 if ((helper = helpers->dthps_actions[which]) == NULL)
15204 return (0);
15205
15206 vstate = &helpers->dthps_vstate;
15207 mstate->dtms_arg[0] = arg0;
15208 mstate->dtms_arg[1] = arg1;
15209
15210 /*
15211 * Now iterate over each helper. If its predicate evaluates to 'true',
15212 * we'll call the corresponding actions. Note that the below calls
15213 * to dtrace_dif_emulate() may set faults in machine state. This is
15214 * okay: our caller (the outer dtrace_dif_emulate()) will simply plow
15215 * the stored DIF offset with its own (which is the desired behavior).
15216 * Also, note the calls to dtrace_dif_emulate() may allocate scratch
15217 * from machine state; this is okay, too.
15218 */
15219 for (; helper != NULL; helper = helper->dtha_next) {
15220 if ((pred = helper->dtha_predicate) != NULL) {
15221 if (trace)
15222 dtrace_helper_trace(helper, mstate, vstate, 0);
15223
15224 if (!dtrace_dif_emulate(pred, mstate, vstate, state))
15225 goto next;
15226
15227 if (*flags & CPU_DTRACE_FAULT)
15228 goto err;
15229 }
15230
15231 for (i = 0; i < helper->dtha_nactions; i++) {
15232 if (trace)
15233 dtrace_helper_trace(helper,
15234 mstate, vstate, i + 1);
15235
15236 rval = dtrace_dif_emulate(helper->dtha_actions[i],
15237 mstate, vstate, state);
15238
15239 if (*flags & CPU_DTRACE_FAULT)
15240 goto err;
15241 }
15242
15243 next:
15244 if (trace)
15245 dtrace_helper_trace(helper, mstate, vstate,
15246 DTRACE_HELPTRACE_NEXT);
15247 }
15248
15249 if (trace)
15250 dtrace_helper_trace(helper, mstate, vstate,
15251 DTRACE_HELPTRACE_DONE);
15252
15253 /*
15254 * Restore the arg0 that we saved upon entry.
15255 */
15256 mstate->dtms_arg[0] = sarg0;
15257 mstate->dtms_arg[1] = sarg1;
15258
15259 return (rval);
15260
15261 err:
15262 if (trace)
15263 dtrace_helper_trace(helper, mstate, vstate,
15264 DTRACE_HELPTRACE_ERR);
15265
15266 /*
15267 * Restore the arg0 that we saved upon entry.
15268 */
15269 mstate->dtms_arg[0] = sarg0;
15270 mstate->dtms_arg[1] = sarg1;
15271
15272 return (0);
15273 }
15274
15275 static void
dtrace_helper_action_destroy(dtrace_helper_action_t * helper,dtrace_vstate_t * vstate)15276 dtrace_helper_action_destroy(dtrace_helper_action_t *helper,
15277 dtrace_vstate_t *vstate)
15278 {
15279 int i;
15280
15281 if (helper->dtha_predicate != NULL)
15282 dtrace_difo_release(helper->dtha_predicate, vstate);
15283
15284 for (i = 0; i < helper->dtha_nactions; i++) {
15285 ASSERT(helper->dtha_actions[i] != NULL);
15286 dtrace_difo_release(helper->dtha_actions[i], vstate);
15287 }
15288
15289 kmem_free(helper->dtha_actions,
15290 helper->dtha_nactions * sizeof (dtrace_difo_t *));
15291 kmem_free(helper, sizeof (dtrace_helper_action_t));
15292 }
15293
15294 static int
dtrace_helper_destroygen(proc_t * p,int gen)15295 dtrace_helper_destroygen(proc_t* p, int gen)
15296 {
15297 dtrace_helpers_t *help = p->p_dtrace_helpers;
15298 dtrace_vstate_t *vstate;
15299 uint_t i;
15300
15301 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15302 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15303
15304 if (help == NULL || gen > help->dthps_generation)
15305 return (EINVAL);
15306
15307 vstate = &help->dthps_vstate;
15308
15309 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
15310 dtrace_helper_action_t *last = NULL, *h, *next;
15311
15312 for (h = help->dthps_actions[i]; h != NULL; h = next) {
15313 next = h->dtha_next;
15314
15315 if (h->dtha_generation == gen) {
15316 if (last != NULL) {
15317 last->dtha_next = next;
15318 } else {
15319 help->dthps_actions[i] = next;
15320 }
15321
15322 dtrace_helper_action_destroy(h, vstate);
15323 } else {
15324 last = h;
15325 }
15326 }
15327 }
15328
15329 /*
15330 * Interate until we've cleared out all helper providers with the
15331 * given generation number.
15332 */
15333 for (;;) {
15334 dtrace_helper_provider_t *prov = NULL;
15335
15336 /*
15337 * Look for a helper provider with the right generation. We
15338 * have to start back at the beginning of the list each time
15339 * because we drop dtrace_lock. It's unlikely that we'll make
15340 * more than two passes.
15341 */
15342 for (i = 0; i < help->dthps_nprovs; i++) {
15343 prov = help->dthps_provs[i];
15344
15345 if (prov->dthp_generation == gen)
15346 break;
15347 }
15348
15349 /*
15350 * If there were no matches, we're done.
15351 */
15352 if (i == help->dthps_nprovs)
15353 break;
15354
15355 /*
15356 * Move the last helper provider into this slot.
15357 */
15358 help->dthps_nprovs--;
15359 help->dthps_provs[i] = help->dthps_provs[help->dthps_nprovs];
15360 help->dthps_provs[help->dthps_nprovs] = NULL;
15361
15362 lck_mtx_unlock(&dtrace_lock);
15363
15364 /*
15365 * If we have a meta provider, remove this helper provider.
15366 */
15367 if (dtrace_meta_pid != NULL) {
15368 ASSERT(dtrace_deferred_pid == NULL);
15369 dtrace_helper_provider_remove(&prov->dthp_prov,
15370 p);
15371 }
15372
15373 dtrace_helper_provider_destroy(prov);
15374
15375 lck_mtx_lock(&dtrace_lock);
15376 }
15377
15378 return (0);
15379 }
15380
15381 static int
dtrace_helper_validate(dtrace_helper_action_t * helper)15382 dtrace_helper_validate(dtrace_helper_action_t *helper)
15383 {
15384 int err = 0, i;
15385 dtrace_difo_t *dp;
15386
15387 if ((dp = helper->dtha_predicate) != NULL)
15388 err += dtrace_difo_validate_helper(dp);
15389
15390 for (i = 0; i < helper->dtha_nactions; i++)
15391 err += dtrace_difo_validate_helper(helper->dtha_actions[i]);
15392
15393 return (err == 0);
15394 }
15395
15396 static int
dtrace_helper_action_add(proc_t * p,int which,dtrace_ecbdesc_t * ep)15397 dtrace_helper_action_add(proc_t* p, int which, dtrace_ecbdesc_t *ep)
15398 {
15399 dtrace_helpers_t *help;
15400 dtrace_helper_action_t *helper, *last;
15401 dtrace_actdesc_t *act;
15402 dtrace_vstate_t *vstate;
15403 dtrace_predicate_t *pred;
15404 int count = 0, nactions = 0, i;
15405
15406 if (which < 0 || which >= DTRACE_NHELPER_ACTIONS)
15407 return (EINVAL);
15408
15409 help = p->p_dtrace_helpers;
15410 last = help->dthps_actions[which];
15411 vstate = &help->dthps_vstate;
15412
15413 for (count = 0; last != NULL; last = last->dtha_next) {
15414 count++;
15415 if (last->dtha_next == NULL)
15416 break;
15417 }
15418
15419 /*
15420 * If we already have dtrace_helper_actions_max helper actions for this
15421 * helper action type, we'll refuse to add a new one.
15422 */
15423 if (count >= dtrace_helper_actions_max)
15424 return (ENOSPC);
15425
15426 helper = kmem_zalloc(sizeof (dtrace_helper_action_t), KM_SLEEP);
15427 helper->dtha_generation = help->dthps_generation;
15428
15429 if ((pred = ep->dted_pred.dtpdd_predicate) != NULL) {
15430 ASSERT(pred->dtp_difo != NULL);
15431 dtrace_difo_hold(pred->dtp_difo);
15432 helper->dtha_predicate = pred->dtp_difo;
15433 }
15434
15435 for (act = ep->dted_action; act != NULL; act = act->dtad_next) {
15436 if (act->dtad_kind != DTRACEACT_DIFEXPR)
15437 goto err;
15438
15439 if (act->dtad_difo == NULL)
15440 goto err;
15441
15442 nactions++;
15443 }
15444
15445 helper->dtha_actions = kmem_zalloc(sizeof (dtrace_difo_t *) *
15446 (helper->dtha_nactions = nactions), KM_SLEEP);
15447
15448 for (act = ep->dted_action, i = 0; act != NULL; act = act->dtad_next) {
15449 dtrace_difo_hold(act->dtad_difo);
15450 helper->dtha_actions[i++] = act->dtad_difo;
15451 }
15452
15453 if (!dtrace_helper_validate(helper))
15454 goto err;
15455
15456 if (last == NULL) {
15457 help->dthps_actions[which] = helper;
15458 } else {
15459 last->dtha_next = helper;
15460 }
15461
15462 if ((uint32_t)vstate->dtvs_nlocals > dtrace_helptrace_nlocals) {
15463 dtrace_helptrace_nlocals = vstate->dtvs_nlocals;
15464 dtrace_helptrace_next = 0;
15465 }
15466
15467 return (0);
15468 err:
15469 dtrace_helper_action_destroy(helper, vstate);
15470 return (EINVAL);
15471 }
15472
15473 static void
dtrace_helper_provider_register(proc_t * p,dtrace_helpers_t * help,dof_helper_t * dofhp)15474 dtrace_helper_provider_register(proc_t *p, dtrace_helpers_t *help,
15475 dof_helper_t *dofhp)
15476 {
15477 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15478 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
15479
15480 lck_mtx_lock(&dtrace_lock);
15481
15482 if (!dtrace_attached() || dtrace_meta_pid == NULL) {
15483 /*
15484 * If the dtrace module is loaded but not attached, or if
15485 * there aren't isn't a meta provider registered to deal with
15486 * these provider descriptions, we need to postpone creating
15487 * the actual providers until later.
15488 */
15489
15490 if (help->dthps_next == NULL && help->dthps_prev == NULL &&
15491 dtrace_deferred_pid != help) {
15492 help->dthps_deferred = 1;
15493 help->dthps_pid = proc_getpid(p);
15494 help->dthps_next = dtrace_deferred_pid;
15495 help->dthps_prev = NULL;
15496 if (dtrace_deferred_pid != NULL)
15497 dtrace_deferred_pid->dthps_prev = help;
15498 dtrace_deferred_pid = help;
15499 }
15500
15501 lck_mtx_unlock(&dtrace_lock);
15502
15503 } else if (dofhp != NULL) {
15504 /*
15505 * If the dtrace module is loaded and we have a particular
15506 * helper provider description, pass that off to the
15507 * meta provider.
15508 */
15509
15510 lck_mtx_unlock(&dtrace_lock);
15511
15512 dtrace_helper_provide(dofhp, p);
15513
15514 } else {
15515 /*
15516 * Otherwise, just pass all the helper provider descriptions
15517 * off to the meta provider.
15518 */
15519
15520 uint_t i;
15521 lck_mtx_unlock(&dtrace_lock);
15522
15523 for (i = 0; i < help->dthps_nprovs; i++) {
15524 dtrace_helper_provide(&help->dthps_provs[i]->dthp_prov,
15525 p);
15526 }
15527 }
15528 }
15529
15530 static int
dtrace_helper_provider_add(proc_t * p,dof_helper_t * dofhp,int gen)15531 dtrace_helper_provider_add(proc_t* p, dof_helper_t *dofhp, int gen)
15532 {
15533 dtrace_helpers_t *help;
15534 dtrace_helper_provider_t *hprov, **tmp_provs;
15535 uint_t tmp_maxprovs, i;
15536
15537 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15538 help = p->p_dtrace_helpers;
15539 ASSERT(help != NULL);
15540
15541 /*
15542 * If we already have dtrace_helper_providers_max helper providers,
15543 * we're refuse to add a new one.
15544 */
15545 if (help->dthps_nprovs >= dtrace_helper_providers_max)
15546 return (ENOSPC);
15547
15548 /*
15549 * Check to make sure this isn't a duplicate.
15550 */
15551 for (i = 0; i < help->dthps_nprovs; i++) {
15552 if (dofhp->dofhp_addr ==
15553 help->dthps_provs[i]->dthp_prov.dofhp_addr)
15554 return (EALREADY);
15555 }
15556
15557 hprov = kmem_zalloc(sizeof (dtrace_helper_provider_t), KM_SLEEP);
15558 hprov->dthp_prov = *dofhp;
15559 hprov->dthp_ref = 1;
15560 hprov->dthp_generation = gen;
15561
15562 /*
15563 * Allocate a bigger table for helper providers if it's already full.
15564 */
15565 if (help->dthps_maxprovs == help->dthps_nprovs) {
15566 tmp_maxprovs = help->dthps_maxprovs;
15567 tmp_provs = help->dthps_provs;
15568
15569 if (help->dthps_maxprovs == 0)
15570 help->dthps_maxprovs = 2;
15571 else
15572 help->dthps_maxprovs *= 2;
15573 if (help->dthps_maxprovs > dtrace_helper_providers_max)
15574 help->dthps_maxprovs = dtrace_helper_providers_max;
15575
15576 ASSERT(tmp_maxprovs < help->dthps_maxprovs);
15577
15578 help->dthps_provs = kmem_zalloc(help->dthps_maxprovs *
15579 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
15580
15581 if (tmp_provs != NULL) {
15582 bcopy(tmp_provs, help->dthps_provs, tmp_maxprovs *
15583 sizeof (dtrace_helper_provider_t *));
15584 kmem_free(tmp_provs, tmp_maxprovs *
15585 sizeof (dtrace_helper_provider_t *));
15586 }
15587 }
15588
15589 help->dthps_provs[help->dthps_nprovs] = hprov;
15590 help->dthps_nprovs++;
15591
15592 return (0);
15593 }
15594
15595 static void
dtrace_helper_provider_destroy(dtrace_helper_provider_t * hprov)15596 dtrace_helper_provider_destroy(dtrace_helper_provider_t *hprov)
15597 {
15598 lck_mtx_lock(&dtrace_lock);
15599
15600 if (--hprov->dthp_ref == 0) {
15601 dof_hdr_t *dof;
15602 lck_mtx_unlock(&dtrace_lock);
15603 dof = (dof_hdr_t *)(uintptr_t)hprov->dthp_prov.dofhp_dof;
15604 dtrace_dof_destroy(dof);
15605 kmem_free(hprov, sizeof (dtrace_helper_provider_t));
15606 } else {
15607 lck_mtx_unlock(&dtrace_lock);
15608 }
15609 }
15610
15611 static int
dtrace_helper_provider_validate(dof_hdr_t * dof,dof_sec_t * sec)15612 dtrace_helper_provider_validate(dof_hdr_t *dof, dof_sec_t *sec)
15613 {
15614 uintptr_t daddr = (uintptr_t)dof;
15615 dof_sec_t *str_sec, *prb_sec, *arg_sec, *off_sec, *enoff_sec;
15616 dof_provider_t *provider;
15617 dof_probe_t *probe;
15618 uint8_t *arg;
15619 char *strtab, *typestr;
15620 dof_stridx_t typeidx;
15621 size_t typesz;
15622 uint_t nprobes, j, k;
15623
15624 ASSERT(sec->dofs_type == DOF_SECT_PROVIDER);
15625
15626 if (sec->dofs_offset & (sizeof (uint_t) - 1)) {
15627 dtrace_dof_error(dof, "misaligned section offset");
15628 return (-1);
15629 }
15630
15631 /*
15632 * The section needs to be large enough to contain the DOF provider
15633 * structure appropriate for the given version.
15634 */
15635 if (sec->dofs_size <
15636 ((dof->dofh_ident[DOF_ID_VERSION] == DOF_VERSION_1) ?
15637 offsetof(dof_provider_t, dofpv_prenoffs) :
15638 sizeof (dof_provider_t))) {
15639 dtrace_dof_error(dof, "provider section too small");
15640 return (-1);
15641 }
15642
15643 provider = (dof_provider_t *)(uintptr_t)(daddr + sec->dofs_offset);
15644 str_sec = dtrace_dof_sect(dof, DOF_SECT_STRTAB, provider->dofpv_strtab);
15645 prb_sec = dtrace_dof_sect(dof, DOF_SECT_PROBES, provider->dofpv_probes);
15646 arg_sec = dtrace_dof_sect(dof, DOF_SECT_PRARGS, provider->dofpv_prargs);
15647 off_sec = dtrace_dof_sect(dof, DOF_SECT_PROFFS, provider->dofpv_proffs);
15648
15649 if (str_sec == NULL || prb_sec == NULL ||
15650 arg_sec == NULL || off_sec == NULL)
15651 return (-1);
15652
15653 enoff_sec = NULL;
15654
15655 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1 &&
15656 provider->dofpv_prenoffs != DOF_SECT_NONE &&
15657 (enoff_sec = dtrace_dof_sect(dof, DOF_SECT_PRENOFFS,
15658 provider->dofpv_prenoffs)) == NULL)
15659 return (-1);
15660
15661 strtab = (char *)(uintptr_t)(daddr + str_sec->dofs_offset);
15662
15663 if (provider->dofpv_name >= str_sec->dofs_size ||
15664 strlen(strtab + provider->dofpv_name) >= DTRACE_PROVNAMELEN) {
15665 dtrace_dof_error(dof, "invalid provider name");
15666 return (-1);
15667 }
15668
15669 if (prb_sec->dofs_entsize == 0 ||
15670 prb_sec->dofs_entsize > prb_sec->dofs_size) {
15671 dtrace_dof_error(dof, "invalid entry size");
15672 return (-1);
15673 }
15674
15675 if (prb_sec->dofs_entsize & (sizeof (uintptr_t) - 1)) {
15676 dtrace_dof_error(dof, "misaligned entry size");
15677 return (-1);
15678 }
15679
15680 if (off_sec->dofs_entsize != sizeof (uint32_t)) {
15681 dtrace_dof_error(dof, "invalid entry size");
15682 return (-1);
15683 }
15684
15685 if (off_sec->dofs_offset & (sizeof (uint32_t) - 1)) {
15686 dtrace_dof_error(dof, "misaligned section offset");
15687 return (-1);
15688 }
15689
15690 if (arg_sec->dofs_entsize != sizeof (uint8_t)) {
15691 dtrace_dof_error(dof, "invalid entry size");
15692 return (-1);
15693 }
15694
15695 arg = (uint8_t *)(uintptr_t)(daddr + arg_sec->dofs_offset);
15696
15697 nprobes = prb_sec->dofs_size / prb_sec->dofs_entsize;
15698
15699 /*
15700 * Take a pass through the probes to check for errors.
15701 */
15702 for (j = 0; j < nprobes; j++) {
15703 probe = (dof_probe_t *)(uintptr_t)(daddr +
15704 prb_sec->dofs_offset + j * prb_sec->dofs_entsize);
15705
15706 if (probe->dofpr_func >= str_sec->dofs_size) {
15707 dtrace_dof_error(dof, "invalid function name");
15708 return (-1);
15709 }
15710
15711 if (strlen(strtab + probe->dofpr_func) >= DTRACE_FUNCNAMELEN) {
15712 dtrace_dof_error(dof, "function name too long");
15713 return (-1);
15714 }
15715
15716 if (probe->dofpr_name >= str_sec->dofs_size ||
15717 strlen(strtab + probe->dofpr_name) >= DTRACE_NAMELEN) {
15718 dtrace_dof_error(dof, "invalid probe name");
15719 return (-1);
15720 }
15721
15722 /*
15723 * The offset count must not wrap the index, and the offsets
15724 * must also not overflow the section's data.
15725 */
15726 if (probe->dofpr_offidx + probe->dofpr_noffs <
15727 probe->dofpr_offidx ||
15728 (probe->dofpr_offidx + probe->dofpr_noffs) *
15729 off_sec->dofs_entsize > off_sec->dofs_size) {
15730 dtrace_dof_error(dof, "invalid probe offset");
15731 return (-1);
15732 }
15733
15734 if (dof->dofh_ident[DOF_ID_VERSION] != DOF_VERSION_1) {
15735 /*
15736 * If there's no is-enabled offset section, make sure
15737 * there aren't any is-enabled offsets. Otherwise
15738 * perform the same checks as for probe offsets
15739 * (immediately above).
15740 */
15741 if (enoff_sec == NULL) {
15742 if (probe->dofpr_enoffidx != 0 ||
15743 probe->dofpr_nenoffs != 0) {
15744 dtrace_dof_error(dof, "is-enabled "
15745 "offsets with null section");
15746 return (-1);
15747 }
15748 } else if (probe->dofpr_enoffidx +
15749 probe->dofpr_nenoffs < probe->dofpr_enoffidx ||
15750 (probe->dofpr_enoffidx + probe->dofpr_nenoffs) *
15751 enoff_sec->dofs_entsize > enoff_sec->dofs_size) {
15752 dtrace_dof_error(dof, "invalid is-enabled "
15753 "offset");
15754 return (-1);
15755 }
15756
15757 if (probe->dofpr_noffs + probe->dofpr_nenoffs == 0) {
15758 dtrace_dof_error(dof, "zero probe and "
15759 "is-enabled offsets");
15760 return (-1);
15761 }
15762 } else if (probe->dofpr_noffs == 0) {
15763 dtrace_dof_error(dof, "zero probe offsets");
15764 return (-1);
15765 }
15766
15767 if (probe->dofpr_argidx + probe->dofpr_xargc <
15768 probe->dofpr_argidx ||
15769 (probe->dofpr_argidx + probe->dofpr_xargc) *
15770 arg_sec->dofs_entsize > arg_sec->dofs_size) {
15771 dtrace_dof_error(dof, "invalid args");
15772 return (-1);
15773 }
15774
15775 typeidx = probe->dofpr_nargv;
15776 typestr = strtab + probe->dofpr_nargv;
15777 for (k = 0; k < probe->dofpr_nargc; k++) {
15778 if (typeidx >= str_sec->dofs_size) {
15779 dtrace_dof_error(dof, "bad "
15780 "native argument type");
15781 return (-1);
15782 }
15783
15784 typesz = strlen(typestr) + 1;
15785 if (typesz > DTRACE_ARGTYPELEN) {
15786 dtrace_dof_error(dof, "native "
15787 "argument type too long");
15788 return (-1);
15789 }
15790 typeidx += typesz;
15791 typestr += typesz;
15792 }
15793
15794 typeidx = probe->dofpr_xargv;
15795 typestr = strtab + probe->dofpr_xargv;
15796 for (k = 0; k < probe->dofpr_xargc; k++) {
15797 if (arg[probe->dofpr_argidx + k] > probe->dofpr_nargc) {
15798 dtrace_dof_error(dof, "bad "
15799 "native argument index");
15800 return (-1);
15801 }
15802
15803 if (typeidx >= str_sec->dofs_size) {
15804 dtrace_dof_error(dof, "bad "
15805 "translated argument type");
15806 return (-1);
15807 }
15808
15809 typesz = strlen(typestr) + 1;
15810 if (typesz > DTRACE_ARGTYPELEN) {
15811 dtrace_dof_error(dof, "translated argument "
15812 "type too long");
15813 return (-1);
15814 }
15815
15816 typeidx += typesz;
15817 typestr += typesz;
15818 }
15819 }
15820
15821 return (0);
15822 }
15823
15824 static int
dtrace_helper_slurp(proc_t * p,dof_hdr_t * dof,dof_helper_t * dhp)15825 dtrace_helper_slurp(proc_t* p, dof_hdr_t *dof, dof_helper_t *dhp)
15826 {
15827 dtrace_helpers_t *help;
15828 dtrace_vstate_t *vstate;
15829 dtrace_enabling_t *enab = NULL;
15830 int i, gen, rv, nhelpers = 0, nprovs = 0, destroy = 1;
15831 uintptr_t daddr = (uintptr_t)dof;
15832
15833 LCK_MTX_ASSERT(&dtrace_meta_lock, LCK_MTX_ASSERT_OWNED);
15834 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
15835
15836 if ((help = p->p_dtrace_helpers) == NULL)
15837 help = dtrace_helpers_create(p);
15838
15839 vstate = &help->dthps_vstate;
15840
15841 if ((rv = dtrace_dof_slurp(dof, vstate, NULL, &enab,
15842 dhp != NULL ? dhp->dofhp_addr : 0, B_FALSE)) != 0) {
15843 dtrace_dof_destroy(dof);
15844 return (rv);
15845 }
15846
15847 /*
15848 * Look for helper providers and validate their descriptions.
15849 */
15850 if (dhp != NULL) {
15851 for (i = 0; (uint32_t)i < dof->dofh_secnum; i++) {
15852 dof_sec_t *sec = (dof_sec_t *)(uintptr_t)(daddr +
15853 dof->dofh_secoff + i * dof->dofh_secsize);
15854
15855 if (sec->dofs_type != DOF_SECT_PROVIDER)
15856 continue;
15857
15858 if (dtrace_helper_provider_validate(dof, sec) != 0) {
15859 dtrace_enabling_destroy(enab);
15860 dtrace_dof_destroy(dof);
15861 return (-1);
15862 }
15863
15864 nprovs++;
15865 }
15866 }
15867
15868 /*
15869 * Now we need to walk through the ECB descriptions in the enabling.
15870 */
15871 for (i = 0; i < enab->dten_ndesc; i++) {
15872 dtrace_ecbdesc_t *ep = enab->dten_desc[i];
15873 dtrace_probedesc_t *desc = &ep->dted_probe;
15874
15875 /* APPLE NOTE: Darwin employs size bounded string operation. */
15876 if (!LIT_STRNEQL(desc->dtpd_provider, "dtrace"))
15877 continue;
15878
15879 if (!LIT_STRNEQL(desc->dtpd_mod, "helper"))
15880 continue;
15881
15882 if (!LIT_STRNEQL(desc->dtpd_func, "ustack"))
15883 continue;
15884
15885 if ((rv = dtrace_helper_action_add(p, DTRACE_HELPER_ACTION_USTACK,
15886 ep)) != 0) {
15887 /*
15888 * Adding this helper action failed -- we are now going
15889 * to rip out the entire generation and return failure.
15890 */
15891 (void) dtrace_helper_destroygen(p, help->dthps_generation);
15892 dtrace_enabling_destroy(enab);
15893 dtrace_dof_destroy(dof);
15894 return (-1);
15895 }
15896
15897 nhelpers++;
15898 }
15899
15900 if (nhelpers < enab->dten_ndesc)
15901 dtrace_dof_error(dof, "unmatched helpers");
15902
15903 gen = help->dthps_generation++;
15904 dtrace_enabling_destroy(enab);
15905
15906 if (dhp != NULL && nprovs > 0) {
15907 dhp->dofhp_dof = (uint64_t)(uintptr_t)dof;
15908 if (dtrace_helper_provider_add(p, dhp, gen) == 0) {
15909 lck_mtx_unlock(&dtrace_lock);
15910 dtrace_helper_provider_register(p, help, dhp);
15911 lck_mtx_lock(&dtrace_lock);
15912
15913 destroy = 0;
15914 }
15915 }
15916
15917 if (destroy)
15918 dtrace_dof_destroy(dof);
15919
15920 return (gen);
15921 }
15922
15923 /*
15924 * APPLE NOTE: DTrace lazy dof implementation
15925 *
 * DTrace user static probes (USDT probes) and helper actions are loaded
 * in a process by processing dof sections. The dof sections are passed
 * into the kernel by dyld, in a dof_ioctl_data_t block. It is rather
 * expensive to process dof for a process that will never use it. There
 * is a memory cost (allocating the providers/probes), and a cpu cost
 * (creating the providers/probes).
15932 *
 * To reduce this cost, we use "lazy dof". The normal procedure for
 * dof processing is to copyin the dof(s) pointed to by the dof_ioctl_data_t
 * block, and invoke dtrace_helper_slurp() on them. When "lazy dof" is
 * used, each process retains the dof_ioctl_data_t block, instead of
 * copying in the data it points to.
15938 *
15939 * The dof_ioctl_data_t blocks are managed as if they were the actual
15940 * processed dof; on fork the block is copied to the child, on exec and
15941 * exit the block is freed.
15942 *
15943 * If the process loads library(s) containing additional dof, the
15944 * new dof_ioctl_data_t is merged with the existing block.
15945 *
15946 * There are a few catches that make this slightly more difficult.
15947 * When dyld registers dof_ioctl_data_t blocks, it expects a unique
15948 * identifier value for each dof in the block. In non-lazy dof terms,
15949 * this is the generation that dof was loaded in. If we hand back
15950 * a UID for a lazy dof, that same UID must be able to unload the
15951 * dof once it has become non-lazy. To meet this requirement, the
15952 * code that loads lazy dof requires that the UID's for dof(s) in
15953 * the lazy dof be sorted, and in ascending order. It is okay to skip
15954 * UID's, I.E., 1 -> 5 -> 6 is legal.
15955 *
15956 * Once a process has become non-lazy, it will stay non-lazy. All
15957 * future dof operations for that process will be non-lazy, even
15958 * if the dof mode transitions back to lazy.
15959 *
15960 * Always do lazy dof checks before non-lazy (I.E. In fork, exit, exec.).
15961 * That way if the lazy check fails due to transitioning to non-lazy, the
15962 * right thing is done with the newly faulted in dof.
15963 */
15964
15965 /*
15966 * This method is a bit squicky. It must handle:
15967 *
15968 * dof should not be lazy.
15969 * dof should have been handled lazily, but there was an error
15970 * dof was handled lazily, and needs to be freed.
15971 * dof was handled lazily, and must not be freed.
15972 *
15973 *
15974 * Returns EACCESS if dof should be handled non-lazily.
15975 *
15976 * KERN_SUCCESS and all other return codes indicate lazy handling of dof.
15977 *
15978 * If the dofs data is claimed by this method, dofs_claimed will be set.
15979 * Callers should not free claimed dofs.
15980 */
15981 static int
dtrace_lazy_dofs_add(proc_t * p,dof_ioctl_data_t * incoming_dofs,int * dofs_claimed)15982 dtrace_lazy_dofs_add(proc_t *p, dof_ioctl_data_t* incoming_dofs, int *dofs_claimed)
15983 {
15984 ASSERT(p);
15985 ASSERT(incoming_dofs && incoming_dofs->dofiod_count > 0);
15986
15987 int rval = 0;
15988 *dofs_claimed = 0;
15989
15990 lck_rw_lock_shared(&dtrace_dof_mode_lock);
15991
15992 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
15993 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
15994
15995 /*
15996 * Any existing helpers force non-lazy behavior.
15997 */
15998 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
15999 dtrace_sprlock(p);
16000
16001 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16002 unsigned int existing_dofs_count = (existing_dofs) ? existing_dofs->dofiod_count : 0;
16003 unsigned int i, merged_dofs_count = incoming_dofs->dofiod_count + existing_dofs_count;
16004
16005 /*
16006 * Range check...
16007 */
16008 if (merged_dofs_count == 0 || merged_dofs_count > 1024) {
16009 dtrace_dof_error(NULL, "lazy_dofs_add merged_dofs_count out of range");
16010 rval = EINVAL;
16011 goto unlock;
16012 }
16013
16014 /*
16015 * Each dof being added must be assigned a unique generation.
16016 */
16017 uint64_t generation = (existing_dofs) ? existing_dofs->dofiod_helpers[existing_dofs_count - 1].dofhp_dof + 1 : 1;
16018 for (i=0; i<incoming_dofs->dofiod_count; i++) {
16019 /*
16020 * We rely on these being the same so we can overwrite dofhp_dof and not lose info.
16021 */
16022 ASSERT(incoming_dofs->dofiod_helpers[i].dofhp_dof == incoming_dofs->dofiod_helpers[i].dofhp_addr);
16023 incoming_dofs->dofiod_helpers[i].dofhp_dof = generation++;
16024 }
16025
16026
16027 if (existing_dofs) {
16028 /*
16029 * Merge the existing and incoming dofs
16030 */
16031 size_t merged_dofs_size = DOF_IOCTL_DATA_T_SIZE(merged_dofs_count);
16032 dof_ioctl_data_t* merged_dofs = kmem_alloc(merged_dofs_size, KM_SLEEP);
16033
16034 bcopy(&existing_dofs->dofiod_helpers[0],
16035 &merged_dofs->dofiod_helpers[0],
16036 sizeof(dof_helper_t) * existing_dofs_count);
16037 bcopy(&incoming_dofs->dofiod_helpers[0],
16038 &merged_dofs->dofiod_helpers[existing_dofs_count],
16039 sizeof(dof_helper_t) * incoming_dofs->dofiod_count);
16040
16041 merged_dofs->dofiod_count = merged_dofs_count;
16042
16043 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16044
16045 p->p_dtrace_lazy_dofs = merged_dofs;
16046 } else {
16047 /*
16048 * Claim the incoming dofs
16049 */
16050 *dofs_claimed = 1;
16051 p->p_dtrace_lazy_dofs = incoming_dofs;
16052 }
16053
16054 #if DEBUG
16055 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16056 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16057 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16058 }
16059 #endif /* DEBUG */
16060
16061 unlock:
16062 dtrace_sprunlock(p);
16063 } else {
16064 rval = EACCES;
16065 }
16066
16067 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16068
16069 return rval;
16070 }
16071
16072 /*
16073 * Returns:
16074 *
16075 * EINVAL: lazy dof is enabled, but the requested generation was not found.
16076 * EACCES: This removal needs to be handled non-lazily.
16077 */
16078 static int
dtrace_lazy_dofs_remove(proc_t * p,int generation)16079 dtrace_lazy_dofs_remove(proc_t *p, int generation)
16080 {
16081 int rval = EINVAL;
16082
16083 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16084
16085 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16086 ASSERT(dtrace_dof_mode != DTRACE_DOF_MODE_NEVER);
16087
16088 /*
16089 * Any existing helpers force non-lazy behavior.
16090 */
16091 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON && (p->p_dtrace_helpers == NULL)) {
16092 dtrace_sprlock(p);
16093
16094 dof_ioctl_data_t* existing_dofs = p->p_dtrace_lazy_dofs;
16095
16096 if (existing_dofs) {
16097 int index, existing_dofs_count = existing_dofs->dofiod_count;
16098 for (index=0; index<existing_dofs_count; index++) {
16099 if ((int)existing_dofs->dofiod_helpers[index].dofhp_dof == generation) {
16100 dof_ioctl_data_t* removed_dofs = NULL;
16101
16102 /*
16103 * If there is only 1 dof, we'll delete it and swap in NULL.
16104 */
16105 if (existing_dofs_count > 1) {
16106 int removed_dofs_count = existing_dofs_count - 1;
16107 size_t removed_dofs_size = DOF_IOCTL_DATA_T_SIZE(removed_dofs_count);
16108
16109 removed_dofs = kmem_alloc(removed_dofs_size, KM_SLEEP);
16110 removed_dofs->dofiod_count = removed_dofs_count;
16111
16112 /*
16113 * copy the remaining data.
16114 */
16115 if (index > 0) {
16116 bcopy(&existing_dofs->dofiod_helpers[0],
16117 &removed_dofs->dofiod_helpers[0],
16118 index * sizeof(dof_helper_t));
16119 }
16120
16121 if (index < existing_dofs_count-1) {
16122 bcopy(&existing_dofs->dofiod_helpers[index+1],
16123 &removed_dofs->dofiod_helpers[index],
16124 (existing_dofs_count - index - 1) * sizeof(dof_helper_t));
16125 }
16126 }
16127
16128 kmem_free(existing_dofs, DOF_IOCTL_DATA_T_SIZE(existing_dofs_count));
16129
16130 p->p_dtrace_lazy_dofs = removed_dofs;
16131
16132 rval = KERN_SUCCESS;
16133
16134 break;
16135 }
16136 }
16137
16138 #if DEBUG
16139 dof_ioctl_data_t* all_dofs = p->p_dtrace_lazy_dofs;
16140 if (all_dofs) {
16141 unsigned int i;
16142 for (i=0; i<all_dofs->dofiod_count-1; i++) {
16143 ASSERT(all_dofs->dofiod_helpers[i].dofhp_dof < all_dofs->dofiod_helpers[i+1].dofhp_dof);
16144 }
16145 }
16146 #endif
16147
16148 }
16149 dtrace_sprunlock(p);
16150 } else {
16151 rval = EACCES;
16152 }
16153
16154 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16155
16156 return rval;
16157 }
16158
16159 void
dtrace_lazy_dofs_destroy(proc_t * p)16160 dtrace_lazy_dofs_destroy(proc_t *p)
16161 {
16162 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16163 dtrace_sprlock(p);
16164
16165 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16166
16167 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16168 p->p_dtrace_lazy_dofs = NULL;
16169
16170 dtrace_sprunlock(p);
16171 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16172
16173 if (lazy_dofs) {
16174 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16175 }
16176 }
16177
16178 static int
dtrace_lazy_dofs_proc_iterate_filter(proc_t * p,void * ignored)16179 dtrace_lazy_dofs_proc_iterate_filter(proc_t *p, void* ignored)
16180 {
16181 #pragma unused(ignored)
16182 /*
16183 * Okay to NULL test without taking the sprlock.
16184 */
16185 return p->p_dtrace_lazy_dofs != NULL;
16186 }
16187
16188 static void
dtrace_lazy_dofs_process(proc_t * p)16189 dtrace_lazy_dofs_process(proc_t *p) {
16190 /*
16191 * It is possible this process may exit during our attempt to
16192 * fault in the dof. We could fix this by holding locks longer,
16193 * but the errors are benign.
16194 */
16195 dtrace_sprlock(p);
16196
16197
16198 ASSERT(p->p_dtrace_lazy_dofs == NULL || p->p_dtrace_helpers == NULL);
16199 ASSERT(dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF);
16200
16201 dof_ioctl_data_t* lazy_dofs = p->p_dtrace_lazy_dofs;
16202 p->p_dtrace_lazy_dofs = NULL;
16203
16204 dtrace_sprunlock(p);
16205 lck_mtx_lock(&dtrace_meta_lock);
16206 /*
16207 * Process each dof_helper_t
16208 */
16209 if (lazy_dofs != NULL) {
16210 unsigned int i;
16211 int rval;
16212
16213 for (i=0; i<lazy_dofs->dofiod_count; i++) {
16214 /*
16215 * When loading lazy dof, we depend on the generations being sorted in ascending order.
16216 */
16217 ASSERT(i >= (lazy_dofs->dofiod_count - 1) || lazy_dofs->dofiod_helpers[i].dofhp_dof < lazy_dofs->dofiod_helpers[i+1].dofhp_dof);
16218
16219 dof_helper_t *dhp = &lazy_dofs->dofiod_helpers[i];
16220
16221 /*
16222 * We stored the generation in dofhp_dof. Save it, and restore the original value.
16223 */
16224 int generation = dhp->dofhp_dof;
16225 dhp->dofhp_dof = dhp->dofhp_addr;
16226
16227 dof_hdr_t *dof = dtrace_dof_copyin_from_proc(p, dhp->dofhp_dof, &rval);
16228
16229 if (dof != NULL) {
16230 dtrace_helpers_t *help;
16231
16232 lck_mtx_lock(&dtrace_lock);
16233
16234 /*
16235 * This must be done with the dtrace_lock held
16236 */
16237 if ((help = p->p_dtrace_helpers) == NULL)
16238 help = dtrace_helpers_create(p);
16239
16240 /*
16241 * If the generation value has been bumped, someone snuck in
16242 * when we released the dtrace lock. We have to dump this generation,
16243 * there is no safe way to load it.
16244 */
16245 if (help->dthps_generation <= generation) {
16246 help->dthps_generation = generation;
16247
16248 /*
16249 * dtrace_helper_slurp() takes responsibility for the dof --
16250 * it may free it now or it may save it and free it later.
16251 */
16252 if ((rval = dtrace_helper_slurp(p, dof, dhp)) != generation) {
16253 dtrace_dof_error(NULL, "returned value did not match expected generation");
16254 }
16255 }
16256
16257 lck_mtx_unlock(&dtrace_lock);
16258 }
16259 }
16260 lck_mtx_unlock(&dtrace_meta_lock);
16261 kmem_free(lazy_dofs, DOF_IOCTL_DATA_T_SIZE(lazy_dofs->dofiod_count));
16262 } else {
16263 lck_mtx_unlock(&dtrace_meta_lock);
16264 }
16265 }
16266
16267 static int
dtrace_lazy_dofs_proc_iterate_doit(proc_t * p,void * ignored)16268 dtrace_lazy_dofs_proc_iterate_doit(proc_t *p, void* ignored)
16269 {
16270 #pragma unused(ignored)
16271
16272 dtrace_lazy_dofs_process(p);
16273
16274 return PROC_RETURNED;
16275 }
16276
16277 #define DTRACE_LAZY_DOFS_DUPLICATED 1
16278
16279 static int
dtrace_lazy_dofs_duplicate(proc_t * parent,proc_t * child)16280 dtrace_lazy_dofs_duplicate(proc_t *parent, proc_t *child)
16281 {
16282 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_NOTOWNED);
16283 LCK_MTX_ASSERT(&parent->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16284 LCK_MTX_ASSERT(&child->p_dtrace_sprlock, LCK_MTX_ASSERT_NOTOWNED);
16285
16286 lck_rw_lock_shared(&dtrace_dof_mode_lock);
16287 dtrace_sprlock(parent);
16288
16289 /*
16290 * We need to make sure that the transition to lazy dofs -> helpers
16291 * was atomic for our parent
16292 */
16293 ASSERT(parent->p_dtrace_lazy_dofs == NULL || parent->p_dtrace_helpers == NULL);
16294 /*
16295 * In theory we should hold the child sprlock, but this is safe...
16296 */
16297 ASSERT(child->p_dtrace_lazy_dofs == NULL && child->p_dtrace_helpers == NULL);
16298
16299 dof_ioctl_data_t* parent_dofs = parent->p_dtrace_lazy_dofs;
16300 dof_ioctl_data_t* child_dofs = NULL;
16301 if (parent_dofs) {
16302 size_t parent_dofs_size = DOF_IOCTL_DATA_T_SIZE(parent_dofs->dofiod_count);
16303 child_dofs = kmem_alloc(parent_dofs_size, KM_SLEEP);
16304 bcopy(parent_dofs, child_dofs, parent_dofs_size);
16305 }
16306
16307 dtrace_sprunlock(parent);
16308
16309 if (child_dofs) {
16310 dtrace_sprlock(child);
16311 child->p_dtrace_lazy_dofs = child_dofs;
16312 dtrace_sprunlock(child);
16313 /**
16314 * We process the DOF at this point if the mode is set to
16315 * LAZY_OFF. This can happen if DTrace is still processing the
16316 * DOF of other process (which can happen because the
16317 * protected pager can have a huge latency)
16318 * but has not processed our parent yet
16319 */
16320 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
16321 dtrace_lazy_dofs_process(child);
16322 }
16323 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16324
16325 return DTRACE_LAZY_DOFS_DUPLICATED;
16326 }
16327 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
16328
16329 return 0;
16330 }
16331
16332 static dtrace_helpers_t *
dtrace_helpers_create(proc_t * p)16333 dtrace_helpers_create(proc_t *p)
16334 {
16335 dtrace_helpers_t *help;
16336
16337 LCK_MTX_ASSERT(&dtrace_lock, LCK_MTX_ASSERT_OWNED);
16338 ASSERT(p->p_dtrace_helpers == NULL);
16339
16340 help = kmem_zalloc(sizeof (dtrace_helpers_t), KM_SLEEP);
16341 help->dthps_actions = kmem_zalloc(sizeof (dtrace_helper_action_t *) *
16342 DTRACE_NHELPER_ACTIONS, KM_SLEEP);
16343
16344 p->p_dtrace_helpers = help;
16345 dtrace_helpers++;
16346
16347 return (help);
16348 }
16349
16350 static void
dtrace_helpers_destroy(proc_t * p)16351 dtrace_helpers_destroy(proc_t* p)
16352 {
16353 dtrace_helpers_t *help;
16354 dtrace_vstate_t *vstate;
16355 uint_t i;
16356
16357 lck_mtx_lock(&dtrace_meta_lock);
16358 lck_mtx_lock(&dtrace_lock);
16359
16360 ASSERT(p->p_dtrace_helpers != NULL);
16361 ASSERT(dtrace_helpers > 0);
16362
16363 help = p->p_dtrace_helpers;
16364 vstate = &help->dthps_vstate;
16365
16366 /*
16367 * We're now going to lose the help from this process.
16368 */
16369 p->p_dtrace_helpers = NULL;
16370 dtrace_sync();
16371
16372 /*
16373 * Destory the helper actions.
16374 */
16375 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16376 dtrace_helper_action_t *h, *next;
16377
16378 for (h = help->dthps_actions[i]; h != NULL; h = next) {
16379 next = h->dtha_next;
16380 dtrace_helper_action_destroy(h, vstate);
16381 h = next;
16382 }
16383 }
16384
16385 lck_mtx_unlock(&dtrace_lock);
16386
16387 /*
16388 * Destroy the helper providers.
16389 */
16390 if (help->dthps_maxprovs > 0) {
16391 if (dtrace_meta_pid != NULL) {
16392 ASSERT(dtrace_deferred_pid == NULL);
16393
16394 for (i = 0; i < help->dthps_nprovs; i++) {
16395 dtrace_helper_provider_remove(
16396 &help->dthps_provs[i]->dthp_prov, p);
16397 }
16398 } else {
16399 lck_mtx_lock(&dtrace_lock);
16400 ASSERT(help->dthps_deferred == 0 ||
16401 help->dthps_next != NULL ||
16402 help->dthps_prev != NULL ||
16403 help == dtrace_deferred_pid);
16404
16405 /*
16406 * Remove the helper from the deferred list.
16407 */
16408 if (help->dthps_next != NULL)
16409 help->dthps_next->dthps_prev = help->dthps_prev;
16410 if (help->dthps_prev != NULL)
16411 help->dthps_prev->dthps_next = help->dthps_next;
16412 if (dtrace_deferred_pid == help) {
16413 dtrace_deferred_pid = help->dthps_next;
16414 ASSERT(help->dthps_prev == NULL);
16415 }
16416
16417 lck_mtx_unlock(&dtrace_lock);
16418 }
16419
16420
16421 for (i = 0; i < help->dthps_nprovs; i++) {
16422 dtrace_helper_provider_destroy(help->dthps_provs[i]);
16423 }
16424
16425 kmem_free(help->dthps_provs, help->dthps_maxprovs *
16426 sizeof (dtrace_helper_provider_t *));
16427 }
16428
16429 lck_mtx_lock(&dtrace_lock);
16430
16431 dtrace_vstate_fini(&help->dthps_vstate);
16432 kmem_free(help->dthps_actions,
16433 sizeof (dtrace_helper_action_t *) * DTRACE_NHELPER_ACTIONS);
16434 kmem_free(help, sizeof (dtrace_helpers_t));
16435
16436 --dtrace_helpers;
16437 lck_mtx_unlock(&dtrace_lock);
16438 lck_mtx_unlock(&dtrace_meta_lock);
16439 }
16440
16441 static void
dtrace_helpers_duplicate(proc_t * from,proc_t * to)16442 dtrace_helpers_duplicate(proc_t *from, proc_t *to)
16443 {
16444 dtrace_helpers_t *help, *newhelp;
16445 dtrace_helper_action_t *helper, *new, *last;
16446 dtrace_difo_t *dp;
16447 dtrace_vstate_t *vstate;
16448 uint_t i;
16449 int j, sz, hasprovs = 0;
16450
16451 lck_mtx_lock(&dtrace_meta_lock);
16452 lck_mtx_lock(&dtrace_lock);
16453 ASSERT(from->p_dtrace_helpers != NULL);
16454 ASSERT(dtrace_helpers > 0);
16455
16456 help = from->p_dtrace_helpers;
16457 newhelp = dtrace_helpers_create(to);
16458 ASSERT(to->p_dtrace_helpers != NULL);
16459
16460 newhelp->dthps_generation = help->dthps_generation;
16461 vstate = &newhelp->dthps_vstate;
16462
16463 /*
16464 * Duplicate the helper actions.
16465 */
16466 for (i = 0; i < DTRACE_NHELPER_ACTIONS; i++) {
16467 if ((helper = help->dthps_actions[i]) == NULL)
16468 continue;
16469
16470 for (last = NULL; helper != NULL; helper = helper->dtha_next) {
16471 new = kmem_zalloc(sizeof (dtrace_helper_action_t),
16472 KM_SLEEP);
16473 new->dtha_generation = helper->dtha_generation;
16474
16475 if ((dp = helper->dtha_predicate) != NULL) {
16476 dp = dtrace_difo_duplicate(dp, vstate);
16477 new->dtha_predicate = dp;
16478 }
16479
16480 new->dtha_nactions = helper->dtha_nactions;
16481 sz = sizeof (dtrace_difo_t *) * new->dtha_nactions;
16482 new->dtha_actions = kmem_alloc(sz, KM_SLEEP);
16483
16484 for (j = 0; j < new->dtha_nactions; j++) {
16485 dtrace_difo_t *dpj = helper->dtha_actions[j];
16486
16487 ASSERT(dpj != NULL);
16488 dpj = dtrace_difo_duplicate(dpj, vstate);
16489 new->dtha_actions[j] = dpj;
16490 }
16491
16492 if (last != NULL) {
16493 last->dtha_next = new;
16494 } else {
16495 newhelp->dthps_actions[i] = new;
16496 }
16497
16498 last = new;
16499 }
16500 }
16501
16502 /*
16503 * Duplicate the helper providers and register them with the
16504 * DTrace framework.
16505 */
16506 if (help->dthps_nprovs > 0) {
16507 newhelp->dthps_nprovs = help->dthps_nprovs;
16508 newhelp->dthps_maxprovs = help->dthps_nprovs;
16509 newhelp->dthps_provs = kmem_alloc(newhelp->dthps_nprovs *
16510 sizeof (dtrace_helper_provider_t *), KM_SLEEP);
16511 for (i = 0; i < newhelp->dthps_nprovs; i++) {
16512 newhelp->dthps_provs[i] = help->dthps_provs[i];
16513 newhelp->dthps_provs[i]->dthp_ref++;
16514 }
16515
16516 hasprovs = 1;
16517 }
16518
16519 lck_mtx_unlock(&dtrace_lock);
16520
16521 if (hasprovs)
16522 dtrace_helper_provider_register(to, newhelp, NULL);
16523
16524 lck_mtx_unlock(&dtrace_meta_lock);
16525 }
16526
16527 /**
16528 * DTrace Process functions
16529 */
16530
16531 void
dtrace_proc_fork(proc_t * parent_proc,proc_t * child_proc,int spawn)16532 dtrace_proc_fork(proc_t *parent_proc, proc_t *child_proc, int spawn)
16533 {
16534 /*
16535 * This code applies to new processes who are copying the task
16536 * and thread state and address spaces of their parent process.
16537 */
16538 if (!spawn) {
16539 /*
16540 * APPLE NOTE: Solaris does a sprlock() and drops the
16541 * proc_lock here. We're cheating a bit and only taking
16542 * the p_dtrace_sprlock lock. A full sprlock would
16543 * task_suspend the parent.
16544 */
16545 dtrace_sprlock(parent_proc);
16546
16547 /*
16548 * Remove all DTrace tracepoints from the child process. We
16549 * need to do this _before_ duplicating USDT providers since
16550 * any associated probes may be immediately enabled.
16551 */
16552 if (parent_proc->p_dtrace_count > 0) {
16553 dtrace_fasttrap_fork(parent_proc, child_proc);
16554 }
16555
16556 dtrace_sprunlock(parent_proc);
16557
16558 /*
16559 * Duplicate any lazy dof(s). This must be done while NOT
16560 * holding the parent sprlock! Lock ordering is
16561 * dtrace_dof_mode_lock, then sprlock. It is imperative we
16562 * always call dtrace_lazy_dofs_duplicate, rather than null
16563 * check and call if !NULL. If we NULL test, during lazy dof
16564 * faulting we can race with the faulting code and proceed
16565 * from here to beyond the helpers copy. The lazy dof
16566 * faulting will then fail to copy the helpers to the child
16567 * process. We return if we duplicated lazy dofs as a process
16568 * can only have one at the same time to avoid a race between
16569 * a dtrace client and dtrace_proc_fork where a process would
16570 * end up with both lazy dofs and helpers.
16571 */
16572 if (dtrace_lazy_dofs_duplicate(parent_proc, child_proc) == DTRACE_LAZY_DOFS_DUPLICATED) {
16573 return;
16574 }
16575
16576 /*
16577 * Duplicate any helper actions and providers if they haven't
16578 * already.
16579 */
16580 #if !defined(__APPLE__)
16581 /*
16582 * The SFORKING
16583 * we set above informs the code to enable USDT probes that
16584 * sprlock() may fail because the child is being forked.
16585 */
16586 #endif
16587 /*
16588 * APPLE NOTE: As best I can tell, Apple's sprlock() equivalent
16589 * never fails to find the child. We do not set SFORKING.
16590 */
16591 if (parent_proc->p_dtrace_helpers != NULL && dtrace_helpers_fork) {
16592 (*dtrace_helpers_fork)(parent_proc, child_proc);
16593 }
16594 }
16595 }
16596
16597 void
dtrace_proc_exec(proc_t * p)16598 dtrace_proc_exec(proc_t *p)
16599 {
16600 /*
16601 * Invalidate any predicate evaluation already cached for this thread by DTrace.
16602 * That's because we've just stored to p_comm and DTrace refers to that when it
16603 * evaluates the "execname" special variable. uid and gid may have changed as well.
16604 */
16605 dtrace_set_thread_predcache(current_thread(), 0);
16606
16607 /*
16608 * Free any outstanding lazy dof entries. It is imperative we
16609 * always call dtrace_lazy_dofs_destroy, rather than null check
16610 * and call if !NULL. If we NULL test, during lazy dof faulting
16611 * we can race with the faulting code and proceed from here to
16612 * beyond the helpers cleanup. The lazy dof faulting will then
16613 * install new helpers which no longer belong to this process!
16614 */
16615 dtrace_lazy_dofs_destroy(p);
16616
16617
16618 /*
16619 * Clean up any DTrace helpers for the process.
16620 */
16621 if (p->p_dtrace_helpers != NULL && dtrace_helpers_cleanup) {
16622 (*dtrace_helpers_cleanup)(p);
16623 }
16624
16625 /*
16626 * Cleanup the DTrace provider associated with this process.
16627 */
16628 proc_lock(p);
16629 if (p->p_dtrace_probes && dtrace_fasttrap_exec_ptr) {
16630 (*dtrace_fasttrap_exec_ptr)(p);
16631 }
16632 proc_unlock(p);
16633 }
16634
16635 void
dtrace_proc_exit(proc_t * p)16636 dtrace_proc_exit(proc_t *p)
16637 {
16638 /*
16639 * Free any outstanding lazy dof entries. It is imperative we
16640 * always call dtrace_lazy_dofs_destroy, rather than null check
16641 * and call if !NULL. If we NULL test, during lazy dof faulting
16642 * we can race with the faulting code and proceed from here to
16643 * beyond the helpers cleanup. The lazy dof faulting will then
16644 * install new helpers which will never be cleaned up, and leak.
16645 */
16646 dtrace_lazy_dofs_destroy(p);
16647
16648 /*
16649 * Clean up any DTrace helper actions or probes for the process.
16650 */
16651 if (p->p_dtrace_helpers != NULL) {
16652 (*dtrace_helpers_cleanup)(p);
16653 }
16654
16655 /*
16656 * Clean up any DTrace probes associated with this process.
16657 */
16658 /*
16659 * APPLE NOTE: We release ptss pages/entries in dtrace_fasttrap_exit_ptr(),
16660 * call this after dtrace_helpers_cleanup()
16661 */
16662 proc_lock(p);
16663 if (p->p_dtrace_probes && dtrace_fasttrap_exit_ptr) {
16664 (*dtrace_fasttrap_exit_ptr)(p);
16665 }
16666 proc_unlock(p);
16667 }
16668
16669 /*
16670 * DTrace Hook Functions
16671 */
16672
16673 /*
16674 * APPLE NOTE: dtrace_modctl_* routines for kext support.
16675 * Used to manipulate the modctl list within dtrace xnu.
16676 */
16677
16678 modctl_t *dtrace_modctl_list;
16679
16680 static void
dtrace_modctl_add(struct modctl * newctl)16681 dtrace_modctl_add(struct modctl * newctl)
16682 {
16683 struct modctl *nextp, *prevp;
16684
16685 ASSERT(newctl != NULL);
16686 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16687
16688 // Insert new module at the front of the list,
16689
16690 newctl->mod_next = dtrace_modctl_list;
16691 dtrace_modctl_list = newctl;
16692
16693 /*
16694 * If a module exists with the same name, then that module
16695 * must have been unloaded with enabled probes. We will move
16696 * the unloaded module to the new module's stale chain and
16697 * then stop traversing the list.
16698 */
16699
16700 prevp = newctl;
16701 nextp = newctl->mod_next;
16702
16703 while (nextp != NULL) {
16704 if (nextp->mod_loaded) {
16705 /* This is a loaded module. Keep traversing. */
16706 prevp = nextp;
16707 nextp = nextp->mod_next;
16708 continue;
16709 }
16710 else {
16711 /* Found an unloaded module */
16712 if (strncmp (newctl->mod_modname, nextp->mod_modname, KMOD_MAX_NAME)) {
16713 /* Names don't match. Keep traversing. */
16714 prevp = nextp;
16715 nextp = nextp->mod_next;
16716 continue;
16717 }
16718 else {
16719 /* We found a stale entry, move it. We're done. */
16720 prevp->mod_next = nextp->mod_next;
16721 newctl->mod_stale = nextp;
16722 nextp->mod_next = NULL;
16723 break;
16724 }
16725 }
16726 }
16727 }
16728
16729 static modctl_t *
dtrace_modctl_lookup(struct kmod_info * kmod)16730 dtrace_modctl_lookup(struct kmod_info * kmod)
16731 {
16732 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16733
16734 struct modctl * ctl;
16735
16736 for (ctl = dtrace_modctl_list; ctl; ctl=ctl->mod_next) {
16737 if (ctl->mod_id == kmod->id)
16738 return(ctl);
16739 }
16740 return (NULL);
16741 }
16742
16743 /*
16744 * This routine is called from dtrace_module_unloaded().
16745 * It removes a modctl structure and its stale chain
16746 * from the kext shadow list.
16747 */
16748 static void
dtrace_modctl_remove(struct modctl * ctl)16749 dtrace_modctl_remove(struct modctl * ctl)
16750 {
16751 ASSERT(ctl != NULL);
16752 LCK_MTX_ASSERT(&mod_lock, LCK_MTX_ASSERT_OWNED);
16753 modctl_t *prevp, *nextp, *curp;
16754
16755 // Remove stale chain first
16756 for (curp=ctl->mod_stale; curp != NULL; curp=nextp) {
16757 nextp = curp->mod_stale;
16758 /* There should NEVER be user symbols allocated at this point */
16759 ASSERT(curp->mod_user_symbols == NULL);
16760 kmem_free(curp, sizeof(modctl_t));
16761 }
16762
16763 prevp = NULL;
16764 curp = dtrace_modctl_list;
16765
16766 while (curp != ctl) {
16767 prevp = curp;
16768 curp = curp->mod_next;
16769 }
16770
16771 if (prevp != NULL) {
16772 prevp->mod_next = ctl->mod_next;
16773 }
16774 else {
16775 dtrace_modctl_list = ctl->mod_next;
16776 }
16777
16778 /* There should NEVER be user symbols allocated at this point */
16779 ASSERT(ctl->mod_user_symbols == NULL);
16780
16781 kmem_free (ctl, sizeof(modctl_t));
16782 }
16783
16784 /*
16785 * APPLE NOTE: The kext loader will call dtrace_module_loaded
16786 * when the kext is loaded in memory, but before calling the
16787 * kext's start routine.
16788 *
16789 * Return 0 on success
16790 * Return -1 on failure
16791 */
16792
16793 static int
dtrace_module_loaded(struct kmod_info * kmod,uint32_t flag)16794 dtrace_module_loaded(struct kmod_info *kmod, uint32_t flag)
16795 {
16796 dtrace_provider_t *prv;
16797
16798 /*
16799 * If kernel symbols have been disabled, return immediately
16800 * DTRACE_KERNEL_SYMBOLS_NEVER is a permanent mode, it is safe to test without holding locks
16801 */
16802 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER)
16803 return 0;
16804
16805 struct modctl *ctl = NULL;
16806 if (!kmod || kmod->address == 0 || kmod->size == 0)
16807 return(-1);
16808
16809 lck_mtx_lock(&dtrace_provider_lock);
16810 lck_mtx_lock(&mod_lock);
16811
16812 /*
16813 * Have we seen this kext before?
16814 */
16815
16816 ctl = dtrace_modctl_lookup(kmod);
16817
16818 if (ctl != NULL) {
16819 /* bail... we already have this kext in the modctl list */
16820 lck_mtx_unlock(&mod_lock);
16821 lck_mtx_unlock(&dtrace_provider_lock);
16822 if (dtrace_err_verbose)
16823 cmn_err(CE_WARN, "dtrace load module already exists '%s %u' is failing against '%s %u'", kmod->name, (uint_t)kmod->id, ctl->mod_modname, ctl->mod_id);
16824 return(-1);
16825 }
16826 else {
16827 ctl = kmem_alloc(sizeof(struct modctl), KM_SLEEP);
16828 if (ctl == NULL) {
16829 if (dtrace_err_verbose)
16830 cmn_err(CE_WARN, "dtrace module load '%s %u' is failing ", kmod->name, (uint_t)kmod->id);
16831 lck_mtx_unlock(&mod_lock);
16832 lck_mtx_unlock(&dtrace_provider_lock);
16833 return (-1);
16834 }
16835 ctl->mod_next = NULL;
16836 ctl->mod_stale = NULL;
16837 strlcpy (ctl->mod_modname, kmod->name, sizeof(ctl->mod_modname));
16838 ctl->mod_loadcnt = kmod->id;
16839 ctl->mod_nenabled = 0;
16840 ctl->mod_address = kmod->address;
16841 ctl->mod_size = kmod->size;
16842 ctl->mod_id = kmod->id;
16843 ctl->mod_loaded = 1;
16844 ctl->mod_flags = 0;
16845 ctl->mod_user_symbols = NULL;
16846 ctl->mod_sdtprobecnt = 0;
16847 ctl->mod_sdtdesc = NULL;
16848
16849 /*
16850 * Find the UUID for this module, if it has one
16851 */
16852 kernel_mach_header_t* header = (kernel_mach_header_t *)ctl->mod_address;
16853 struct load_command* load_cmd = (struct load_command *)&header[1];
16854 uint32_t i;
16855 for (i = 0; i < header->ncmds; i++) {
16856 if (load_cmd->cmd == LC_UUID) {
16857 struct uuid_command* uuid_cmd = (struct uuid_command *)load_cmd;
16858 memcpy(ctl->mod_uuid, uuid_cmd->uuid, sizeof(uuid_cmd->uuid));
16859 ctl->mod_flags |= MODCTL_HAS_UUID;
16860 break;
16861 }
16862 load_cmd = (struct load_command *)((caddr_t)load_cmd + load_cmd->cmdsize);
16863 }
16864
16865 if (ctl->mod_address == g_kernel_kmod_info.address) {
16866 ctl->mod_flags |= MODCTL_IS_MACH_KERNEL;
16867 memcpy(dtrace_kerneluuid, ctl->mod_uuid, sizeof(dtrace_kerneluuid));
16868 }
16869 /*
16870 * Static kexts have a UUID that is not used for symbolication, as all their
16871 * symbols are in kernel
16872 */
16873 else if ((flag & KMOD_DTRACE_STATIC_KEXT) == KMOD_DTRACE_STATIC_KEXT) {
16874 memcpy(ctl->mod_uuid, dtrace_kerneluuid, sizeof(dtrace_kerneluuid));
16875 ctl->mod_flags |= MODCTL_IS_STATIC_KEXT;
16876 }
16877 }
16878 dtrace_modctl_add(ctl);
16879
16880 /*
16881 * We must hold the dtrace_lock to safely test non permanent dtrace_fbt_symbol_mode(s)
16882 */
16883 lck_mtx_lock(&dtrace_lock);
16884
16885 /*
16886 * DTrace must decide if it will instrument modules lazily via
16887 * userspace symbols (default mode), or instrument immediately via
16888 * kernel symbols (non-default mode)
16889 *
16890 * When in default/lazy mode, DTrace will only support modules
16891 * built with a valid UUID.
16892 *
16893 * Overriding the default can be done explicitly in one of
16894 * the following two ways.
16895 *
16896 * A module can force symbols from kernel space using the plist key,
16897 * OSBundleForceDTraceInit (see kmod.h). If this per kext state is set,
16898 * we fall through and instrument this module now.
16899 *
16900 * Or, the boot-arg, dtrace_kernel_symbol_mode, can be set to force symbols
16901 * from kernel space (see dtrace_impl.h). If this system state is set
16902 * to a non-userspace mode, we fall through and instrument the module now.
16903 */
16904
16905 if ((dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) &&
16906 (!(flag & KMOD_DTRACE_FORCE_INIT)))
16907 {
16908 /* Load SDT section for module. Symbol related data will be handled lazily. */
16909 sdt_load_machsect(ctl);
16910
16911 /* We will instrument the module lazily -- this is the default */
16912 lck_mtx_unlock(&dtrace_lock);
16913 lck_mtx_unlock(&mod_lock);
16914 lck_mtx_unlock(&dtrace_provider_lock);
16915 return 0;
16916 }
16917
16918 /* We will instrument the module immediately using kernel symbols */
16919 if (!(flag & KMOD_DTRACE_NO_KERNEL_SYMS)) {
16920 ctl->mod_flags |= MODCTL_HAS_KERNEL_SYMBOLS;
16921 }
16922
16923 /* Load SDT section for module. Symbol related data will be handled lazily. */
16924 sdt_load_machsect(ctl);
16925
16926 lck_mtx_unlock(&dtrace_lock);
16927
16928 /*
16929 * We're going to call each providers per-module provide operation
16930 * specifying only this module.
16931 */
16932 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
16933 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
16934
16935 /*
16936 * APPLE NOTE: The contract with the kext loader is that once this function
16937 * has completed, it may delete kernel symbols at will.
16938 * We must set this while still holding the mod_lock.
16939 */
16940 ctl->mod_flags &= ~MODCTL_HAS_KERNEL_SYMBOLS;
16941
16942 lck_mtx_unlock(&mod_lock);
16943 lck_mtx_unlock(&dtrace_provider_lock);
16944
16945 /*
16946 * If we have any retained enablings, we need to match against them.
16947 * Enabling probes requires that cpu_lock be held, and we cannot hold
16948 * cpu_lock here -- it is legal for cpu_lock to be held when loading a
16949 * module. (In particular, this happens when loading scheduling
16950 * classes.) So if we have any retained enablings, we need to dispatch
16951 * our task queue to do the match for us.
16952 */
16953 lck_mtx_lock(&dtrace_lock);
16954
16955 if (dtrace_retained == NULL) {
16956 lck_mtx_unlock(&dtrace_lock);
16957 return 0;
16958 }
16959
16960 /* APPLE NOTE!
16961 *
16962 * The cpu_lock mentioned above is only held by dtrace code, Apple's xnu never actually
16963 * holds it for any reason. Thus the comment above is invalid, we can directly invoke
16964 * dtrace_enabling_matchall without jumping through all the hoops, and we can avoid
16965 * the delay call as well.
16966 */
16967 lck_mtx_unlock(&dtrace_lock);
16968
16969 dtrace_enabling_matchall();
16970
16971 return 0;
16972 }
16973
16974 /*
16975 * Return 0 on success
16976 * Return -1 on failure
16977 */
16978 static int
dtrace_module_unloaded(struct kmod_info * kmod)16979 dtrace_module_unloaded(struct kmod_info *kmod)
16980 {
16981 dtrace_probe_t template, *probe, *first, *next;
16982 dtrace_provider_t *prov;
16983 struct modctl *ctl = NULL;
16984 struct modctl *syncctl = NULL;
16985 struct modctl *nextsyncctl = NULL;
16986 int syncmode = 0;
16987
16988 lck_mtx_lock(&dtrace_provider_lock);
16989 lck_mtx_lock(&mod_lock);
16990 lck_mtx_lock(&dtrace_lock);
16991
16992 if (kmod == NULL) {
16993 syncmode = 1;
16994 }
16995 else {
16996 ctl = dtrace_modctl_lookup(kmod);
16997 if (ctl == NULL)
16998 {
16999 lck_mtx_unlock(&dtrace_lock);
17000 lck_mtx_unlock(&mod_lock);
17001 lck_mtx_unlock(&dtrace_provider_lock);
17002 return (-1);
17003 }
17004 ctl->mod_loaded = 0;
17005 ctl->mod_address = 0;
17006 ctl->mod_size = 0;
17007 }
17008
17009 if (dtrace_bymod == NULL) {
17010 /*
17011 * The DTrace module is loaded (obviously) but not attached;
17012 * we don't have any work to do.
17013 */
17014 if (ctl != NULL)
17015 (void)dtrace_modctl_remove(ctl);
17016 lck_mtx_unlock(&dtrace_lock);
17017 lck_mtx_unlock(&mod_lock);
17018 lck_mtx_unlock(&dtrace_provider_lock);
17019 return(0);
17020 }
17021
17022 /* Syncmode set means we target and traverse entire modctl list. */
17023 if (syncmode)
17024 nextsyncctl = dtrace_modctl_list;
17025
17026 syncloop:
17027 if (syncmode)
17028 {
17029 /* find a stale modctl struct */
17030 for (syncctl = nextsyncctl; syncctl != NULL; syncctl=syncctl->mod_next) {
17031 if (syncctl->mod_address == 0)
17032 break;
17033 }
17034 if (syncctl==NULL)
17035 {
17036 /* We have no more work to do */
17037 lck_mtx_unlock(&dtrace_lock);
17038 lck_mtx_unlock(&mod_lock);
17039 lck_mtx_unlock(&dtrace_provider_lock);
17040 return(0);
17041 }
17042 else {
17043 /* keep track of next syncctl in case this one is removed */
17044 nextsyncctl = syncctl->mod_next;
17045 ctl = syncctl;
17046 }
17047 }
17048
17049 template.dtpr_mod = ctl->mod_modname;
17050
17051 for (probe = first = dtrace_hash_lookup(dtrace_bymod, &template);
17052 probe != NULL; probe = probe->dtpr_nextmod) {
17053 if (probe->dtpr_ecb != NULL) {
17054 /*
17055 * This shouldn't _actually_ be possible -- we're
17056 * unloading a module that has an enabled probe in it.
17057 * (It's normally up to the provider to make sure that
17058 * this can't happen.) However, because dtps_enable()
17059 * doesn't have a failure mode, there can be an
17060 * enable/unload race. Upshot: we don't want to
17061 * assert, but we're not going to disable the
17062 * probe, either.
17063 */
17064
17065
17066 if (syncmode) {
17067 /* We're syncing, let's look at next in list */
17068 goto syncloop;
17069 }
17070
17071 lck_mtx_unlock(&dtrace_lock);
17072 lck_mtx_unlock(&mod_lock);
17073 lck_mtx_unlock(&dtrace_provider_lock);
17074
17075 if (dtrace_err_verbose) {
17076 cmn_err(CE_WARN, "unloaded module '%s' had "
17077 "enabled probes", ctl->mod_modname);
17078 }
17079 return(-1);
17080 }
17081 }
17082
17083 probe = first;
17084
17085 for (first = NULL; probe != NULL; probe = next) {
17086 ASSERT(dtrace_probes[probe->dtpr_id - 1] == probe);
17087
17088 dtrace_probes[probe->dtpr_id - 1] = NULL;
17089 probe->dtpr_provider->dtpv_probe_count--;
17090
17091 next = probe->dtpr_nextmod;
17092 dtrace_hash_remove(dtrace_byprov, probe);
17093 dtrace_hash_remove(dtrace_bymod, probe);
17094 dtrace_hash_remove(dtrace_byfunc, probe);
17095 dtrace_hash_remove(dtrace_byname, probe);
17096
17097 if (first == NULL) {
17098 first = probe;
17099 probe->dtpr_nextmod = NULL;
17100 } else {
17101 probe->dtpr_nextmod = first;
17102 first = probe;
17103 }
17104 }
17105
17106 /*
17107 * We've removed all of the module's probes from the hash chains and
17108 * from the probe array. Now issue a dtrace_sync() to be sure that
17109 * everyone has cleared out from any probe array processing.
17110 */
17111 dtrace_sync();
17112
17113 for (probe = first; probe != NULL; probe = first) {
17114 first = probe->dtpr_nextmod;
17115 prov = probe->dtpr_provider;
17116 prov->dtpv_pops.dtps_destroy(prov->dtpv_arg, probe->dtpr_id,
17117 probe->dtpr_arg);
17118 dtrace_strunref(probe->dtpr_mod);
17119 dtrace_strunref(probe->dtpr_func);
17120 dtrace_strunref(probe->dtpr_name);
17121 vmem_free(dtrace_arena, (void *)(uintptr_t)probe->dtpr_id, 1);
17122
17123 zfree(dtrace_probe_t_zone, probe);
17124 }
17125
17126 dtrace_modctl_remove(ctl);
17127
17128 if (syncmode)
17129 goto syncloop;
17130
17131 lck_mtx_unlock(&dtrace_lock);
17132 lck_mtx_unlock(&mod_lock);
17133 lck_mtx_unlock(&dtrace_provider_lock);
17134
17135 return(0);
17136 }
17137
17138 void
dtrace_suspend(void)17139 dtrace_suspend(void)
17140 {
17141 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_suspend));
17142 }
17143
17144 void
dtrace_resume(void)17145 dtrace_resume(void)
17146 {
17147 dtrace_probe_foreach(offsetof(dtrace_pops_t, dtps_resume));
17148 }
17149
17150 static int
dtrace_cpu_setup(cpu_setup_t what,processorid_t cpu)17151 dtrace_cpu_setup(cpu_setup_t what, processorid_t cpu)
17152 {
17153 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17154 lck_mtx_lock(&dtrace_lock);
17155
17156 switch (what) {
17157 case CPU_CONFIG: {
17158 dtrace_state_t *state;
17159 dtrace_optval_t *opt, rs, c;
17160
17161 /*
17162 * For now, we only allocate a new buffer for anonymous state.
17163 */
17164 if ((state = dtrace_anon.dta_state) == NULL)
17165 break;
17166
17167 if (state->dts_activity != DTRACE_ACTIVITY_ACTIVE)
17168 break;
17169
17170 opt = state->dts_options;
17171 c = opt[DTRACEOPT_CPU];
17172
17173 if (c != DTRACE_CPUALL && c != DTRACEOPT_UNSET && c != cpu)
17174 break;
17175
17176 /*
17177 * Regardless of what the actual policy is, we're going to
17178 * temporarily set our resize policy to be manual. We're
17179 * also going to temporarily set our CPU option to denote
17180 * the newly configured CPU.
17181 */
17182 rs = opt[DTRACEOPT_BUFRESIZE];
17183 opt[DTRACEOPT_BUFRESIZE] = DTRACEOPT_BUFRESIZE_MANUAL;
17184 opt[DTRACEOPT_CPU] = (dtrace_optval_t)cpu;
17185
17186 (void) dtrace_state_buffers(state);
17187
17188 opt[DTRACEOPT_BUFRESIZE] = rs;
17189 opt[DTRACEOPT_CPU] = c;
17190
17191 break;
17192 }
17193
17194 case CPU_UNCONFIG:
17195 /*
17196 * We don't free the buffer in the CPU_UNCONFIG case. (The
17197 * buffer will be freed when the consumer exits.)
17198 */
17199 break;
17200
17201 default:
17202 break;
17203 }
17204
17205 lck_mtx_unlock(&dtrace_lock);
17206 return (0);
17207 }
17208
17209 static void
dtrace_cpu_setup_initial(processorid_t cpu)17210 dtrace_cpu_setup_initial(processorid_t cpu)
17211 {
17212 (void) dtrace_cpu_setup(CPU_CONFIG, cpu);
17213 }
17214
17215 static void
dtrace_toxrange_add(uintptr_t base,uintptr_t limit)17216 dtrace_toxrange_add(uintptr_t base, uintptr_t limit)
17217 {
17218 if (dtrace_toxranges >= dtrace_toxranges_max) {
17219 int osize, nsize;
17220 dtrace_toxrange_t *range;
17221
17222 osize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17223
17224 if (osize == 0) {
17225 ASSERT(dtrace_toxrange == NULL);
17226 ASSERT(dtrace_toxranges_max == 0);
17227 dtrace_toxranges_max = 1;
17228 } else {
17229 dtrace_toxranges_max <<= 1;
17230 }
17231
17232 nsize = dtrace_toxranges_max * sizeof (dtrace_toxrange_t);
17233 range = kmem_zalloc(nsize, KM_SLEEP);
17234
17235 if (dtrace_toxrange != NULL) {
17236 ASSERT(osize != 0);
17237 bcopy(dtrace_toxrange, range, osize);
17238 kmem_free(dtrace_toxrange, osize);
17239 }
17240
17241 dtrace_toxrange = range;
17242 }
17243
17244 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_base == 0);
17245 ASSERT(dtrace_toxrange[dtrace_toxranges].dtt_limit == 0);
17246
17247 dtrace_toxrange[dtrace_toxranges].dtt_base = base;
17248 dtrace_toxrange[dtrace_toxranges].dtt_limit = limit;
17249 dtrace_toxranges++;
17250 }
17251
17252 /*
17253 * DTrace Driver Cookbook Functions
17254 */
17255 /*ARGSUSED*/
17256 static int
dtrace_attach(dev_info_t * devi)17257 dtrace_attach(dev_info_t *devi)
17258 {
17259 dtrace_provider_id_t id;
17260 dtrace_state_t *state = NULL;
17261 dtrace_enabling_t *enab;
17262
17263 lck_mtx_lock(&cpu_lock);
17264 lck_mtx_lock(&dtrace_provider_lock);
17265 lck_mtx_lock(&dtrace_lock);
17266
17267 /* Darwin uses BSD cloning device driver to automagically obtain minor device number. */
17268 dtrace_devi = devi;
17269
17270 dtrace_modload = dtrace_module_loaded;
17271 dtrace_modunload = dtrace_module_unloaded;
17272 dtrace_cpu_init = dtrace_cpu_setup_initial;
17273 dtrace_helpers_cleanup = dtrace_helpers_destroy;
17274 dtrace_helpers_fork = dtrace_helpers_duplicate;
17275 dtrace_cpustart_init = dtrace_suspend;
17276 dtrace_cpustart_fini = dtrace_resume;
17277 dtrace_debugger_init = dtrace_suspend;
17278 dtrace_debugger_fini = dtrace_resume;
17279
17280 register_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
17281
17282 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17283
17284 dtrace_arena = vmem_create("dtrace", (void *)1, INT32_MAX, 1,
17285 NULL, NULL, NULL, 0, VM_SLEEP | VMC_IDENTIFIER);
17286
17287 LCK_MTX_ASSERT(&cpu_lock, LCK_MTX_ASSERT_OWNED);
17288
17289 dtrace_nprobes = dtrace_nprobes_default;
17290 dtrace_probes = kmem_zalloc(sizeof(dtrace_probe_t*) * dtrace_nprobes,
17291 KM_SLEEP);
17292
17293 dtrace_byprov = dtrace_hash_create(dtrace_strkey_probe_provider,
17294 0, /* unused */
17295 offsetof(dtrace_probe_t, dtpr_nextprov),
17296 offsetof(dtrace_probe_t, dtpr_prevprov));
17297
17298 dtrace_bymod = dtrace_hash_create(dtrace_strkey_deref_offset,
17299 offsetof(dtrace_probe_t, dtpr_mod),
17300 offsetof(dtrace_probe_t, dtpr_nextmod),
17301 offsetof(dtrace_probe_t, dtpr_prevmod));
17302
17303 dtrace_byfunc = dtrace_hash_create(dtrace_strkey_deref_offset,
17304 offsetof(dtrace_probe_t, dtpr_func),
17305 offsetof(dtrace_probe_t, dtpr_nextfunc),
17306 offsetof(dtrace_probe_t, dtpr_prevfunc));
17307
17308 dtrace_byname = dtrace_hash_create(dtrace_strkey_deref_offset,
17309 offsetof(dtrace_probe_t, dtpr_name),
17310 offsetof(dtrace_probe_t, dtpr_nextname),
17311 offsetof(dtrace_probe_t, dtpr_prevname));
17312
17313 if (dtrace_retain_max < 1) {
17314 cmn_err(CE_WARN, "illegal value (%lu) for dtrace_retain_max; "
17315 "setting to 1", dtrace_retain_max);
17316 dtrace_retain_max = 1;
17317 }
17318
17319 /*
17320 * Now discover our toxic ranges.
17321 */
17322 dtrace_toxic_ranges(dtrace_toxrange_add);
17323
17324 /*
17325 * Before we register ourselves as a provider to our own framework,
17326 * we would like to assert that dtrace_provider is NULL -- but that's
17327 * not true if we were loaded as a dependency of a DTrace provider.
17328 * Once we've registered, we can assert that dtrace_provider is our
17329 * pseudo provider.
17330 */
17331 (void) dtrace_register("dtrace", &dtrace_provider_attr,
17332 DTRACE_PRIV_NONE, 0, &dtrace_provider_ops, NULL, &id);
17333
17334 ASSERT(dtrace_provider != NULL);
17335 ASSERT((dtrace_provider_id_t)dtrace_provider == id);
17336
17337 #if defined (__x86_64__)
17338 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17339 dtrace_provider, NULL, NULL, "BEGIN", 1, NULL);
17340 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17341 dtrace_provider, NULL, NULL, "END", 0, NULL);
17342 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17343 dtrace_provider, NULL, NULL, "ERROR", 3, NULL);
17344 #elif (defined(__arm__) || defined(__arm64__))
17345 dtrace_probeid_begin = dtrace_probe_create((dtrace_provider_id_t)
17346 dtrace_provider, NULL, NULL, "BEGIN", 2, NULL);
17347 dtrace_probeid_end = dtrace_probe_create((dtrace_provider_id_t)
17348 dtrace_provider, NULL, NULL, "END", 1, NULL);
17349 dtrace_probeid_error = dtrace_probe_create((dtrace_provider_id_t)
17350 dtrace_provider, NULL, NULL, "ERROR", 4, NULL);
17351 #else
17352 #error Unknown Architecture
17353 #endif
17354
17355 dtrace_anon_property();
17356 lck_mtx_unlock(&cpu_lock);
17357
17358 /*
17359 * If DTrace helper tracing is enabled, we need to allocate the
17360 * trace buffer and initialize the values.
17361 */
17362 if (dtrace_helptrace_enabled) {
17363 ASSERT(dtrace_helptrace_buffer == NULL);
17364 dtrace_helptrace_buffer =
17365 kmem_zalloc(dtrace_helptrace_bufsize, KM_SLEEP);
17366 dtrace_helptrace_next = 0;
17367 }
17368
17369 /*
17370 * If there are already providers, we must ask them to provide their
17371 * probes, and then match any anonymous enabling against them. Note
17372 * that there should be no other retained enablings at this time:
17373 * the only retained enablings at this time should be the anonymous
17374 * enabling.
17375 */
17376 if (dtrace_anon.dta_enabling != NULL) {
17377 ASSERT(dtrace_retained == dtrace_anon.dta_enabling);
17378
17379 /*
17380 * APPLE NOTE: if handling anonymous dof, switch symbol modes.
17381 */
17382 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17383 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17384 }
17385
17386 dtrace_enabling_provide(NULL);
17387 state = dtrace_anon.dta_state;
17388
17389 /*
17390 * We couldn't hold cpu_lock across the above call to
17391 * dtrace_enabling_provide(), but we must hold it to actually
17392 * enable the probes. We have to drop all of our locks, pick
17393 * up cpu_lock, and regain our locks before matching the
17394 * retained anonymous enabling.
17395 */
17396 lck_mtx_unlock(&dtrace_lock);
17397 lck_mtx_unlock(&dtrace_provider_lock);
17398
17399 lck_mtx_lock(&cpu_lock);
17400 lck_mtx_lock(&dtrace_provider_lock);
17401 lck_mtx_lock(&dtrace_lock);
17402
17403 if ((enab = dtrace_anon.dta_enabling) != NULL)
17404 (void) dtrace_enabling_match(enab, NULL, NULL);
17405
17406 lck_mtx_unlock(&cpu_lock);
17407 }
17408
17409 lck_mtx_unlock(&dtrace_lock);
17410 lck_mtx_unlock(&dtrace_provider_lock);
17411
17412 if (state != NULL) {
17413 /*
17414 * If we created any anonymous state, set it going now.
17415 */
17416 (void) dtrace_state_go(state, &dtrace_anon.dta_beganon);
17417 }
17418
17419 return (DDI_SUCCESS);
17420 }
17421
17422 /*ARGSUSED*/
17423 static int
dtrace_open(dev_t * devp,int flag,int otyp,cred_t * cred_p)17424 dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
17425 {
17426 #pragma unused(flag, otyp)
17427 dtrace_state_t *state;
17428 uint32_t priv;
17429 uid_t uid;
17430 zoneid_t zoneid;
17431 int rv;
17432
17433 /* APPLE: Darwin puts Helper on its own major device. */
17434
17435 /*
17436 * If no DTRACE_PRIV_* bits are set in the credential, then the
17437 * caller lacks sufficient permission to do anything with DTrace.
17438 */
17439 dtrace_cred2priv(cred_p, &priv, &uid, &zoneid);
17440 if (priv == DTRACE_PRIV_NONE)
17441 return (EACCES);
17442
17443 /*
17444 * APPLE NOTE: We delay the initialization of fasttrap as late as possible.
17445 * It certainly can't be later than now!
17446 */
17447 fasttrap_init();
17448
17449 /*
17450 * Ask all providers to provide all their probes.
17451 */
17452 lck_mtx_lock(&dtrace_provider_lock);
17453 dtrace_probe_provide(NULL, NULL);
17454 lck_mtx_unlock(&dtrace_provider_lock);
17455
17456 lck_mtx_lock(&cpu_lock);
17457 lck_mtx_lock(&dtrace_lock);
17458 dtrace_opens++;
17459 dtrace_membar_producer();
17460
17461 #ifdef illumos
17462 /*
17463 * If the kernel debugger is active (that is, if the kernel debugger
17464 * modified text in some way), we won't allow the open.
17465 */
17466 if (kdi_dtrace_set(KDI_DTSET_DTRACE_ACTIVATE) != 0) {
17467 dtrace_opens--;
17468 lck_mtx_unlock(&dtrace_lock);
17469 lck_mtx_unlock(&cpu_lock);
17470 return (EBUSY);
17471 }
17472 #endif
17473
17474 rv = dtrace_state_create(devp, cred_p, &state);
17475 lck_mtx_unlock(&cpu_lock);
17476
17477 if (rv != 0 || state == NULL) {
17478 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17479 #ifdef illumos
17480 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17481 #endif
17482 }
17483 lck_mtx_unlock(&dtrace_lock);
17484 /* propagate EAGAIN or ERESTART */
17485 return (rv);
17486 }
17487
17488 lck_mtx_unlock(&dtrace_lock);
17489
17490 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17491
17492 /*
17493 * If we are currently lazy, transition states.
17494 *
17495 * Unlike dtrace_close, we do not need to check the
17496 * value of dtrace_opens, as any positive value (and
17497 * we count as 1) means we transition states.
17498 */
17499 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_ON) {
17500 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_OFF;
17501 /*
17502 * We do not need to hold the exclusive lock while processing
17503 * DOF on processes. We do need to make sure the mode does not get
17504 * changed to DTRACE_DOF_MODE_LAZY_ON during that stage though
17505 * (which should not happen anyway since it only happens in
17506 * dtrace_close). There is no way imcomplete USDT probes can be
17507 * activate by any DTrace clients here since they all have to
17508 * call dtrace_open and be blocked on dtrace_dof_mode_lock
17509 */
17510 lck_rw_lock_exclusive_to_shared(&dtrace_dof_mode_lock);
17511 /*
17512 * Iterate all existing processes and load lazy dofs.
17513 */
17514 proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS,
17515 dtrace_lazy_dofs_proc_iterate_doit,
17516 NULL,
17517 dtrace_lazy_dofs_proc_iterate_filter,
17518 NULL);
17519
17520 lck_rw_unlock_shared(&dtrace_dof_mode_lock);
17521 }
17522 else {
17523 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17524 }
17525
17526
17527 /*
17528 * Update kernel symbol state.
17529 *
17530 * We must own the provider and dtrace locks.
17531 *
17532 * NOTE! It may appear there is a race by setting this value so late
17533 * after dtrace_probe_provide. However, any kext loaded after the
17534 * call to probe provide and before we set LAZY_OFF will be marked as
17535 * eligible for symbols from userspace. The same dtrace that is currently
17536 * calling dtrace_open() (this call!) will get a list of kexts needing
17537 * symbols and fill them in, thus closing the race window.
17538 *
17539 * We want to set this value only after it certain it will succeed, as
17540 * this significantly reduces the complexity of error exits.
17541 */
17542 lck_mtx_lock(&dtrace_lock);
17543 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE) {
17544 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_KERNEL;
17545 }
17546 lck_mtx_unlock(&dtrace_lock);
17547
17548 return (0);
17549 }
17550
17551 /*ARGSUSED*/
17552 static int
dtrace_close(dev_t dev,int flag,int otyp,cred_t * cred_p)17553 dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p)
17554 {
17555 #pragma unused(flag, otyp, cred_p) /* __APPLE__ */
17556 minor_t minor = getminor(dev);
17557 dtrace_state_t *state;
17558
17559 /* APPLE NOTE: Darwin puts Helper on its own major device. */
17560 state = dtrace_state_get(minor);
17561
17562 lck_mtx_lock(&cpu_lock);
17563 lck_mtx_lock(&dtrace_lock);
17564
17565 if (state->dts_anon) {
17566 /*
17567 * There is anonymous state. Destroy that first.
17568 */
17569 ASSERT(dtrace_anon.dta_state == NULL);
17570 dtrace_state_destroy(state->dts_anon);
17571 }
17572
17573 dtrace_state_destroy(state);
17574 ASSERT(dtrace_opens > 0);
17575
17576 /*
17577 * Only relinquish control of the kernel debugger interface when there
17578 * are no consumers and no anonymous enablings.
17579 */
17580 if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) {
17581 #ifdef illumos
17582 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
17583 #endif
17584 }
17585
17586 lck_mtx_unlock(&dtrace_lock);
17587 lck_mtx_unlock(&cpu_lock);
17588
17589 /*
17590 * Lock ordering requires the dof mode lock be taken before
17591 * the dtrace_lock.
17592 */
17593 lck_rw_lock_exclusive(&dtrace_dof_mode_lock);
17594 lck_mtx_lock(&dtrace_lock);
17595
17596 if (dtrace_opens == 0) {
17597 /*
17598 * If we are currently lazy-off, and this is the last close, transition to
17599 * lazy state.
17600 */
17601 if (dtrace_dof_mode == DTRACE_DOF_MODE_LAZY_OFF) {
17602 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
17603 }
17604
17605 /*
17606 * If we are the last dtrace client, switch back to lazy (from userspace) symbols
17607 */
17608 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_FROM_KERNEL) {
17609 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
17610 }
17611 }
17612
17613 lck_mtx_unlock(&dtrace_lock);
17614 lck_rw_unlock_exclusive(&dtrace_dof_mode_lock);
17615
17616 /*
17617 * Kext probes may be retained past the end of the kext's lifespan. The
17618 * probes are kept until the last reference to them has been removed.
17619 * Since closing an active dtrace context is likely to drop that last reference,
17620 * lets take a shot at cleaning out the orphaned probes now.
17621 */
17622 dtrace_module_unloaded(NULL);
17623
17624 return (0);
17625 }
17626
17627 /*ARGSUSED*/
17628 static int
dtrace_ioctl_helper(u_long cmd,caddr_t arg,int * rv)17629 dtrace_ioctl_helper(u_long cmd, caddr_t arg, int *rv)
17630 {
17631 #pragma unused(rv)
17632 /*
17633 * Safe to check this outside the dof mode lock
17634 */
17635 if (dtrace_dof_mode == DTRACE_DOF_MODE_NEVER)
17636 return KERN_SUCCESS;
17637
17638 switch (cmd) {
17639 #if defined (__arm64__)
17640 case DTRACEHIOC_ADDDOF_U32:
17641 case DTRACEHIOC_ADDDOF_U64:
17642 #else
17643 case DTRACEHIOC_ADDDOF:
17644 #endif /* __arm64__*/
17645 {
17646 dof_helper_t *dhp = NULL;
17647 size_t dof_ioctl_data_size;
17648 dof_ioctl_data_t* multi_dof;
17649 unsigned int i;
17650 int rval = 0;
17651 user_addr_t user_address = *(user_addr_t*)arg;
17652 uint64_t dof_count;
17653 int multi_dof_claimed = 0;
17654 proc_t* p = current_proc();
17655
17656 /*
17657 * If this is a restricted process and dtrace is restricted,
17658 * do not allow DOFs to be registered
17659 */
17660 if (dtrace_is_restricted() &&
17661 !dtrace_are_restrictions_relaxed() &&
17662 !dtrace_can_attach_to_proc(current_proc())) {
17663 return (EACCES);
17664 }
17665
17666 /*
17667 * Read the number of DOF sections being passed in.
17668 */
17669 if (copyin(user_address + offsetof(dof_ioctl_data_t, dofiod_count),
17670 &dof_count,
17671 sizeof(dof_count))) {
17672 dtrace_dof_error(NULL, "failed to copyin dofiod_count");
17673 return (EFAULT);
17674 }
17675
17676 /*
17677 * Range check the count.
17678 */
17679 if (dof_count == 0 || dof_count > 1024) {
17680 dtrace_dof_error(NULL, "dofiod_count is not valid");
17681 return (EINVAL);
17682 }
17683
17684 /*
17685 * Allocate a correctly sized structure and copyin the data.
17686 */
17687 dof_ioctl_data_size = DOF_IOCTL_DATA_T_SIZE(dof_count);
17688 if ((multi_dof = kmem_alloc(dof_ioctl_data_size, KM_SLEEP)) == NULL)
17689 return (ENOMEM);
17690
17691 /* NOTE! We can no longer exit this method via return */
17692 if (copyin(user_address, multi_dof, dof_ioctl_data_size) != 0) {
17693 dtrace_dof_error(NULL, "failed copyin of dof_ioctl_data_t");
17694 rval = EFAULT;
17695 goto cleanup;
17696 }
17697
17698 /*
17699 * Check that the count didn't change between the first copyin and the second.
17700 */
17701 if (multi_dof->dofiod_count != dof_count) {
17702 rval = EINVAL;
17703 goto cleanup;
17704 }
17705
17706 /*
17707 * Try to process lazily first.
17708 */
17709 rval = dtrace_lazy_dofs_add(p, multi_dof, &multi_dof_claimed);
17710
17711 /*
17712 * If rval is EACCES, we must be non-lazy.
17713 */
17714 if (rval == EACCES) {
17715 rval = 0;
17716 /*
17717 * Process each dof_helper_t
17718 */
17719 i = 0;
17720 do {
17721 dhp = &multi_dof->dofiod_helpers[i];
17722
17723 dof_hdr_t *dof = dtrace_dof_copyin(dhp->dofhp_dof, &rval);
17724
17725 if (dof != NULL) {
17726 lck_mtx_lock(&dtrace_meta_lock);
17727 lck_mtx_lock(&dtrace_lock);
17728
17729 /*
17730 * dtrace_helper_slurp() takes responsibility for the dof --
17731 * it may free it now or it may save it and free it later.
17732 */
17733 if ((dhp->dofhp_dof = (uint64_t)dtrace_helper_slurp(p, dof, dhp)) == -1ULL) {
17734 rval = EINVAL;
17735 }
17736
17737 lck_mtx_unlock(&dtrace_lock);
17738 lck_mtx_unlock(&dtrace_meta_lock);
17739 }
17740 } while (++i < multi_dof->dofiod_count && rval == 0);
17741 }
17742
17743 /*
17744 * We need to copyout the multi_dof struct, because it contains
17745 * the generation (unique id) values needed to call DTRACEHIOC_REMOVE
17746 *
17747 * This could certainly be better optimized.
17748 */
17749 if (copyout(multi_dof, user_address, dof_ioctl_data_size) != 0) {
17750 dtrace_dof_error(NULL, "failed copyout of dof_ioctl_data_t");
17751 /* Don't overwrite pre-existing error code */
17752 if (rval == 0) rval = EFAULT;
17753 }
17754
17755 cleanup:
17756 /*
17757 * If we had to allocate struct memory, free it.
17758 */
17759 if (multi_dof != NULL && !multi_dof_claimed) {
17760 kmem_free(multi_dof, dof_ioctl_data_size);
17761 }
17762
17763 return rval;
17764 }
17765
17766 case DTRACEHIOC_REMOVE: {
17767 int generation = *(int*)arg;
17768 proc_t* p = current_proc();
17769
17770 /*
17771 * Try lazy first.
17772 */
17773 int rval = dtrace_lazy_dofs_remove(p, generation);
17774
17775 /*
17776 * EACCES means non-lazy
17777 */
17778 if (rval == EACCES) {
17779 lck_mtx_lock(&dtrace_meta_lock);
17780 lck_mtx_lock(&dtrace_lock);
17781 rval = dtrace_helper_destroygen(p, generation);
17782 lck_mtx_unlock(&dtrace_lock);
17783 lck_mtx_unlock(&dtrace_meta_lock);
17784 }
17785
17786 return (rval);
17787 }
17788
17789 default:
17790 break;
17791 }
17792
17793 return ENOTTY;
17794 }
17795
17796 /*ARGSUSED*/
17797 static int
dtrace_ioctl(dev_t dev,u_long cmd,user_addr_t arg,int md,cred_t * cr,int * rv)17798 dtrace_ioctl(dev_t dev, u_long cmd, user_addr_t arg, int md, cred_t *cr, int *rv)
17799 {
17800 #pragma unused(md)
17801 minor_t minor = getminor(dev);
17802 dtrace_state_t *state;
17803 int rval;
17804
17805 /* Darwin puts Helper on its own major device. */
17806
17807 state = dtrace_state_get(minor);
17808
17809 if (state->dts_anon) {
17810 ASSERT(dtrace_anon.dta_state == NULL);
17811 state = state->dts_anon;
17812 }
17813
17814 switch (cmd) {
17815 case DTRACEIOC_PROVIDER: {
17816 dtrace_providerdesc_t pvd;
17817 dtrace_provider_t *pvp;
17818
17819 if (copyin(arg, &pvd, sizeof (pvd)) != 0)
17820 return (EFAULT);
17821
17822 pvd.dtvd_name[DTRACE_PROVNAMELEN - 1] = '\0';
17823 lck_mtx_lock(&dtrace_provider_lock);
17824
17825 for (pvp = dtrace_provider; pvp != NULL; pvp = pvp->dtpv_next) {
17826 if (strncmp(pvp->dtpv_name, pvd.dtvd_name, DTRACE_PROVNAMELEN) == 0)
17827 break;
17828 }
17829
17830 lck_mtx_unlock(&dtrace_provider_lock);
17831
17832 if (pvp == NULL)
17833 return (ESRCH);
17834
17835 bcopy(&pvp->dtpv_priv, &pvd.dtvd_priv, sizeof (dtrace_ppriv_t));
17836 bcopy(&pvp->dtpv_attr, &pvd.dtvd_attr, sizeof (dtrace_pattr_t));
17837 if (copyout(&pvd, arg, sizeof (pvd)) != 0)
17838 return (EFAULT);
17839
17840 return (0);
17841 }
17842
17843 case DTRACEIOC_EPROBE: {
17844 dtrace_eprobedesc_t epdesc;
17845 dtrace_ecb_t *ecb;
17846 dtrace_action_t *act;
17847 void *buf;
17848 size_t size;
17849 uintptr_t dest;
17850 int nrecs;
17851
17852 if (copyin(arg, &epdesc, sizeof (epdesc)) != 0)
17853 return (EFAULT);
17854
17855 lck_mtx_lock(&dtrace_lock);
17856
17857 if ((ecb = dtrace_epid2ecb(state, epdesc.dtepd_epid)) == NULL) {
17858 lck_mtx_unlock(&dtrace_lock);
17859 return (EINVAL);
17860 }
17861
17862 if (ecb->dte_probe == NULL) {
17863 lck_mtx_unlock(&dtrace_lock);
17864 return (EINVAL);
17865 }
17866
17867 epdesc.dtepd_probeid = ecb->dte_probe->dtpr_id;
17868 epdesc.dtepd_uarg = ecb->dte_uarg;
17869 epdesc.dtepd_size = ecb->dte_size;
17870
17871 nrecs = epdesc.dtepd_nrecs;
17872 epdesc.dtepd_nrecs = 0;
17873 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17874 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17875 continue;
17876
17877 epdesc.dtepd_nrecs++;
17878 }
17879
17880 /*
17881 * Now that we have the size, we need to allocate a temporary
17882 * buffer in which to store the complete description. We need
17883 * the temporary buffer to be able to drop dtrace_lock()
17884 * across the copyout(), below.
17885 */
17886 size = sizeof (dtrace_eprobedesc_t) +
17887 (epdesc.dtepd_nrecs * sizeof (dtrace_recdesc_t));
17888
17889 buf = kmem_alloc(size, KM_SLEEP);
17890 dest = (uintptr_t)buf;
17891
17892 bcopy(&epdesc, (void *)dest, sizeof (epdesc));
17893 dest += offsetof(dtrace_eprobedesc_t, dtepd_rec[0]);
17894
17895 for (act = ecb->dte_action; act != NULL; act = act->dta_next) {
17896 if (DTRACEACT_ISAGG(act->dta_kind) || act->dta_intuple)
17897 continue;
17898
17899 if (nrecs-- == 0)
17900 break;
17901
17902 bcopy(&act->dta_rec, (void *)dest,
17903 sizeof (dtrace_recdesc_t));
17904 dest += sizeof (dtrace_recdesc_t);
17905 }
17906
17907 lck_mtx_unlock(&dtrace_lock);
17908
17909 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
17910 kmem_free(buf, size);
17911 return (EFAULT);
17912 }
17913
17914 kmem_free(buf, size);
17915 return (0);
17916 }
17917
17918 case DTRACEIOC_AGGDESC: {
17919 dtrace_aggdesc_t aggdesc;
17920 dtrace_action_t *act;
17921 dtrace_aggregation_t *agg;
17922 int nrecs;
17923 uint32_t offs;
17924 dtrace_recdesc_t *lrec;
17925 void *buf;
17926 size_t size;
17927 uintptr_t dest;
17928
17929 if (copyin(arg, &aggdesc, sizeof (aggdesc)) != 0)
17930 return (EFAULT);
17931
17932 lck_mtx_lock(&dtrace_lock);
17933
17934 if ((agg = dtrace_aggid2agg(state, aggdesc.dtagd_id)) == NULL) {
17935 lck_mtx_unlock(&dtrace_lock);
17936 return (EINVAL);
17937 }
17938
17939 aggdesc.dtagd_epid = agg->dtag_ecb->dte_epid;
17940
17941 nrecs = aggdesc.dtagd_nrecs;
17942 aggdesc.dtagd_nrecs = 0;
17943
17944 offs = agg->dtag_base;
17945 lrec = &agg->dtag_action.dta_rec;
17946 aggdesc.dtagd_size = lrec->dtrd_offset + lrec->dtrd_size - offs;
17947
17948 for (act = agg->dtag_first; ; act = act->dta_next) {
17949 ASSERT(act->dta_intuple ||
17950 DTRACEACT_ISAGG(act->dta_kind));
17951
17952 /*
17953 * If this action has a record size of zero, it
17954 * denotes an argument to the aggregating action.
17955 * Because the presence of this record doesn't (or
17956 * shouldn't) affect the way the data is interpreted,
17957 * we don't copy it out to save user-level the
17958 * confusion of dealing with a zero-length record.
17959 */
17960 if (act->dta_rec.dtrd_size == 0) {
17961 ASSERT(agg->dtag_hasarg);
17962 continue;
17963 }
17964
17965 aggdesc.dtagd_nrecs++;
17966
17967 if (act == &agg->dtag_action)
17968 break;
17969 }
17970
17971 /*
17972 * Now that we have the size, we need to allocate a temporary
17973 * buffer in which to store the complete description. We need
17974 * the temporary buffer to be able to drop dtrace_lock()
17975 * across the copyout(), below.
17976 */
17977 size = sizeof (dtrace_aggdesc_t) +
17978 (aggdesc.dtagd_nrecs * sizeof (dtrace_recdesc_t));
17979
17980 buf = kmem_alloc(size, KM_SLEEP);
17981 dest = (uintptr_t)buf;
17982
17983 bcopy(&aggdesc, (void *)dest, sizeof (aggdesc));
17984 dest += offsetof(dtrace_aggdesc_t, dtagd_rec[0]);
17985
17986 for (act = agg->dtag_first; ; act = act->dta_next) {
17987 dtrace_recdesc_t rec = act->dta_rec;
17988
17989 /*
17990 * See the comment in the above loop for why we pass
17991 * over zero-length records.
17992 */
17993 if (rec.dtrd_size == 0) {
17994 ASSERT(agg->dtag_hasarg);
17995 continue;
17996 }
17997
17998 if (nrecs-- == 0)
17999 break;
18000
18001 rec.dtrd_offset -= offs;
18002 bcopy(&rec, (void *)dest, sizeof (rec));
18003 dest += sizeof (dtrace_recdesc_t);
18004
18005 if (act == &agg->dtag_action)
18006 break;
18007 }
18008
18009 lck_mtx_unlock(&dtrace_lock);
18010
18011 if (copyout(buf, arg, dest - (uintptr_t)buf) != 0) {
18012 kmem_free(buf, size);
18013 return (EFAULT);
18014 }
18015
18016 kmem_free(buf, size);
18017 return (0);
18018 }
18019
18020 case DTRACEIOC_ENABLE: {
18021 dof_hdr_t *dof;
18022 dtrace_enabling_t *enab = NULL;
18023 dtrace_vstate_t *vstate;
18024 int err = 0;
18025
18026 *rv = 0;
18027
18028 /*
18029 * If a NULL argument has been passed, we take this as our
18030 * cue to reevaluate our enablings.
18031 */
18032 if (arg == 0) {
18033 dtrace_enabling_matchall();
18034
18035 return (0);
18036 }
18037
18038 if ((dof = dtrace_dof_copyin(arg, &rval)) == NULL)
18039 return (rval);
18040
18041 lck_mtx_lock(&cpu_lock);
18042 lck_mtx_lock(&dtrace_lock);
18043 vstate = &state->dts_vstate;
18044
18045 if (state->dts_activity != DTRACE_ACTIVITY_INACTIVE) {
18046 lck_mtx_unlock(&dtrace_lock);
18047 lck_mtx_unlock(&cpu_lock);
18048 dtrace_dof_destroy(dof);
18049 return (EBUSY);
18050 }
18051
18052 if (dtrace_dof_slurp(dof, vstate, cr, &enab, 0, B_TRUE) != 0) {
18053 lck_mtx_unlock(&dtrace_lock);
18054 lck_mtx_unlock(&cpu_lock);
18055 dtrace_dof_destroy(dof);
18056 return (EINVAL);
18057 }
18058
18059 if ((rval = dtrace_dof_options(dof, state)) != 0) {
18060 dtrace_enabling_destroy(enab);
18061 lck_mtx_unlock(&dtrace_lock);
18062 lck_mtx_unlock(&cpu_lock);
18063 dtrace_dof_destroy(dof);
18064 return (rval);
18065 }
18066
18067 if ((err = dtrace_enabling_match(enab, rv, NULL)) == 0) {
18068 err = dtrace_enabling_retain(enab);
18069 } else {
18070 dtrace_enabling_destroy(enab);
18071 }
18072
18073 lck_mtx_unlock(&dtrace_lock);
18074 lck_mtx_unlock(&cpu_lock);
18075 dtrace_dof_destroy(dof);
18076
18077 return (err);
18078 }
18079
18080 case DTRACEIOC_REPLICATE: {
18081 dtrace_repldesc_t desc;
18082 dtrace_probedesc_t *match = &desc.dtrpd_match;
18083 dtrace_probedesc_t *create = &desc.dtrpd_create;
18084 int err;
18085
18086 if (copyin(arg, &desc, sizeof (desc)) != 0)
18087 return (EFAULT);
18088
18089 match->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18090 match->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18091 match->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18092 match->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18093
18094 create->dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18095 create->dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18096 create->dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18097 create->dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18098
18099 lck_mtx_lock(&dtrace_lock);
18100 err = dtrace_enabling_replicate(state, match, create);
18101 lck_mtx_unlock(&dtrace_lock);
18102
18103 return (err);
18104 }
18105
18106 case DTRACEIOC_PROBEMATCH:
18107 case DTRACEIOC_PROBES: {
18108 dtrace_probe_t *probe = NULL;
18109 dtrace_probedesc_t desc;
18110 dtrace_probekey_t pkey;
18111 dtrace_id_t i;
18112 int m = 0;
18113 uint32_t priv;
18114 uid_t uid;
18115 zoneid_t zoneid;
18116
18117 if (copyin(arg, &desc, sizeof (desc)) != 0)
18118 return (EFAULT);
18119
18120 desc.dtpd_provider[DTRACE_PROVNAMELEN - 1] = '\0';
18121 desc.dtpd_mod[DTRACE_MODNAMELEN - 1] = '\0';
18122 desc.dtpd_func[DTRACE_FUNCNAMELEN - 1] = '\0';
18123 desc.dtpd_name[DTRACE_NAMELEN - 1] = '\0';
18124
18125 /*
18126 * Before we attempt to match this probe, we want to give
18127 * all providers the opportunity to provide it.
18128 */
18129 if (desc.dtpd_id == DTRACE_IDNONE) {
18130 lck_mtx_lock(&dtrace_provider_lock);
18131 dtrace_probe_provide(&desc, NULL);
18132 lck_mtx_unlock(&dtrace_provider_lock);
18133 desc.dtpd_id++;
18134 }
18135
18136 dtrace_cred2priv(cr, &priv, &uid, &zoneid);
18137
18138 lck_mtx_lock(&dtrace_lock);
18139
18140 if (cmd == DTRACEIOC_PROBEMATCH) {
18141 dtrace_probekey(&desc, &pkey);
18142 pkey.dtpk_id = DTRACE_IDNONE;
18143
18144 /* Quiet compiler warning */
18145 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18146 if ((probe = dtrace_probes[i - 1]) != NULL &&
18147 (m = dtrace_match_probe(probe, &pkey,
18148 priv, uid, zoneid)) != 0)
18149 break;
18150 }
18151
18152 if (m < 0) {
18153 lck_mtx_unlock(&dtrace_lock);
18154 return (EINVAL);
18155 }
18156 dtrace_probekey_release(&pkey);
18157
18158 } else {
18159 /* Quiet compiler warning */
18160 for (i = desc.dtpd_id; i <= (dtrace_id_t)dtrace_nprobes; i++) {
18161 if ((probe = dtrace_probes[i - 1]) != NULL &&
18162 dtrace_match_priv(probe, priv, uid, zoneid))
18163 break;
18164 }
18165 }
18166
18167 if (probe == NULL) {
18168 lck_mtx_unlock(&dtrace_lock);
18169 return (ESRCH);
18170 }
18171
18172 dtrace_probe_description(probe, &desc);
18173 lck_mtx_unlock(&dtrace_lock);
18174
18175 if (copyout(&desc, arg, sizeof (desc)) != 0)
18176 return (EFAULT);
18177
18178 return (0);
18179 }
18180
18181 case DTRACEIOC_PROBEARG: {
18182 dtrace_argdesc_t desc;
18183 dtrace_probe_t *probe;
18184 dtrace_provider_t *prov;
18185
18186 if (copyin(arg, &desc, sizeof (desc)) != 0)
18187 return (EFAULT);
18188
18189 if (desc.dtargd_id == DTRACE_IDNONE)
18190 return (EINVAL);
18191
18192 if (desc.dtargd_ndx == DTRACE_ARGNONE)
18193 return (EINVAL);
18194
18195 lck_mtx_lock(&dtrace_provider_lock);
18196 lck_mtx_lock(&mod_lock);
18197 lck_mtx_lock(&dtrace_lock);
18198
18199 /* Quiet compiler warning */
18200 if (desc.dtargd_id > (dtrace_id_t)dtrace_nprobes) {
18201 lck_mtx_unlock(&dtrace_lock);
18202 lck_mtx_unlock(&mod_lock);
18203 lck_mtx_unlock(&dtrace_provider_lock);
18204 return (EINVAL);
18205 }
18206
18207 if ((probe = dtrace_probes[desc.dtargd_id - 1]) == NULL) {
18208 lck_mtx_unlock(&dtrace_lock);
18209 lck_mtx_unlock(&mod_lock);
18210 lck_mtx_unlock(&dtrace_provider_lock);
18211 return (EINVAL);
18212 }
18213
18214 lck_mtx_unlock(&dtrace_lock);
18215
18216 prov = probe->dtpr_provider;
18217
18218 if (prov->dtpv_pops.dtps_getargdesc == NULL) {
18219 /*
18220 * There isn't any typed information for this probe.
18221 * Set the argument number to DTRACE_ARGNONE.
18222 */
18223 desc.dtargd_ndx = DTRACE_ARGNONE;
18224 } else {
18225 desc.dtargd_native[0] = '\0';
18226 desc.dtargd_xlate[0] = '\0';
18227 desc.dtargd_mapping = desc.dtargd_ndx;
18228
18229 prov->dtpv_pops.dtps_getargdesc(prov->dtpv_arg,
18230 probe->dtpr_id, probe->dtpr_arg, &desc);
18231 }
18232
18233 lck_mtx_unlock(&mod_lock);
18234 lck_mtx_unlock(&dtrace_provider_lock);
18235
18236 if (copyout(&desc, arg, sizeof (desc)) != 0)
18237 return (EFAULT);
18238
18239 return (0);
18240 }
18241
18242 case DTRACEIOC_GO: {
18243 processorid_t cpuid;
18244 rval = dtrace_state_go(state, &cpuid);
18245
18246 if (rval != 0)
18247 return (rval);
18248
18249 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18250 return (EFAULT);
18251
18252 return (0);
18253 }
18254
18255 case DTRACEIOC_STOP: {
18256 processorid_t cpuid;
18257
18258 lck_mtx_lock(&dtrace_lock);
18259 rval = dtrace_state_stop(state, &cpuid);
18260 lck_mtx_unlock(&dtrace_lock);
18261
18262 if (rval != 0)
18263 return (rval);
18264
18265 if (copyout(&cpuid, arg, sizeof (cpuid)) != 0)
18266 return (EFAULT);
18267
18268 return (0);
18269 }
18270
18271 case DTRACEIOC_DOFGET: {
18272 dof_hdr_t hdr, *dof;
18273 uint64_t len;
18274
18275 if (copyin(arg, &hdr, sizeof (hdr)) != 0)
18276 return (EFAULT);
18277
18278 lck_mtx_lock(&dtrace_lock);
18279 dof = dtrace_dof_create(state);
18280 lck_mtx_unlock(&dtrace_lock);
18281
18282 len = MIN(hdr.dofh_loadsz, dof->dofh_loadsz);
18283 rval = copyout(dof, arg, len);
18284 dtrace_dof_destroy(dof);
18285
18286 return (rval == 0 ? 0 : EFAULT);
18287 }
18288
18289 case DTRACEIOC_SLEEP: {
18290 int64_t time;
18291 uint64_t abstime;
18292 uint64_t rvalue = DTRACE_WAKE_TIMEOUT;
18293
18294 if (copyin(arg, &time, sizeof(time)) != 0)
18295 return (EFAULT);
18296
18297 nanoseconds_to_absolutetime((uint64_t)time, &abstime);
18298 clock_absolutetime_interval_to_deadline(abstime, &abstime);
18299
18300 if (assert_wait_deadline(state, THREAD_ABORTSAFE, abstime) == THREAD_WAITING) {
18301 if (state->dts_buf_over_limit > 0) {
18302 clear_wait(current_thread(), THREAD_INTERRUPTED);
18303 rvalue = DTRACE_WAKE_BUF_LIMIT;
18304 } else {
18305 thread_block(THREAD_CONTINUE_NULL);
18306 if (state->dts_buf_over_limit > 0) {
18307 rvalue = DTRACE_WAKE_BUF_LIMIT;
18308 }
18309 }
18310 }
18311
18312 if (copyout(&rvalue, arg, sizeof(rvalue)) != 0)
18313 return (EFAULT);
18314
18315 return (0);
18316 }
18317
18318 case DTRACEIOC_SIGNAL: {
18319 wakeup(state);
18320 return (0);
18321 }
18322
18323 case DTRACEIOC_AGGSNAP:
18324 case DTRACEIOC_BUFSNAP: {
18325 dtrace_bufdesc_t desc;
18326 caddr_t cached;
18327 boolean_t over_limit;
18328 dtrace_buffer_t *buf;
18329
18330 if (copyin(arg, &desc, sizeof (desc)) != 0)
18331 return (EFAULT);
18332
18333 if ((int)desc.dtbd_cpu < 0 || desc.dtbd_cpu >= NCPU)
18334 return (EINVAL);
18335
18336 lck_mtx_lock(&dtrace_lock);
18337
18338 if (cmd == DTRACEIOC_BUFSNAP) {
18339 buf = &state->dts_buffer[desc.dtbd_cpu];
18340 } else {
18341 buf = &state->dts_aggbuffer[desc.dtbd_cpu];
18342 }
18343
18344 if (buf->dtb_flags & (DTRACEBUF_RING | DTRACEBUF_FILL)) {
18345 size_t sz = buf->dtb_offset;
18346
18347 if (state->dts_activity != DTRACE_ACTIVITY_STOPPED) {
18348 lck_mtx_unlock(&dtrace_lock);
18349 return (EBUSY);
18350 }
18351
18352 /*
18353 * If this buffer has already been consumed, we're
18354 * going to indicate that there's nothing left here
18355 * to consume.
18356 */
18357 if (buf->dtb_flags & DTRACEBUF_CONSUMED) {
18358 lck_mtx_unlock(&dtrace_lock);
18359
18360 desc.dtbd_size = 0;
18361 desc.dtbd_drops = 0;
18362 desc.dtbd_errors = 0;
18363 desc.dtbd_oldest = 0;
18364 sz = sizeof (desc);
18365
18366 if (copyout(&desc, arg, sz) != 0)
18367 return (EFAULT);
18368
18369 return (0);
18370 }
18371
18372 /*
18373 * If this is a ring buffer that has wrapped, we want
18374 * to copy the whole thing out.
18375 */
18376 if (buf->dtb_flags & DTRACEBUF_WRAPPED) {
18377 dtrace_buffer_polish(buf);
18378 sz = buf->dtb_size;
18379 }
18380
18381 if (copyout(buf->dtb_tomax, (user_addr_t)desc.dtbd_data, sz) != 0) {
18382 lck_mtx_unlock(&dtrace_lock);
18383 return (EFAULT);
18384 }
18385
18386 desc.dtbd_size = sz;
18387 desc.dtbd_drops = buf->dtb_drops;
18388 desc.dtbd_errors = buf->dtb_errors;
18389 desc.dtbd_oldest = buf->dtb_xamot_offset;
18390 desc.dtbd_timestamp = dtrace_gethrtime();
18391
18392 lck_mtx_unlock(&dtrace_lock);
18393
18394 if (copyout(&desc, arg, sizeof (desc)) != 0)
18395 return (EFAULT);
18396
18397 buf->dtb_flags |= DTRACEBUF_CONSUMED;
18398
18399 return (0);
18400 }
18401
18402 if (buf->dtb_tomax == NULL) {
18403 ASSERT(buf->dtb_xamot == NULL);
18404 lck_mtx_unlock(&dtrace_lock);
18405 return (ENOENT);
18406 }
18407
18408 cached = buf->dtb_tomax;
18409 over_limit = buf->dtb_cur_limit == buf->dtb_size;
18410
18411 ASSERT(!(buf->dtb_flags & DTRACEBUF_NOSWITCH));
18412
18413 dtrace_xcall(desc.dtbd_cpu,
18414 (dtrace_xcall_t)dtrace_buffer_switch, buf);
18415
18416 state->dts_errors += buf->dtb_xamot_errors;
18417
18418 /*
18419 * If the buffers did not actually switch, then the cross call
18420 * did not take place -- presumably because the given CPU is
18421 * not in the ready set. If this is the case, we'll return
18422 * ENOENT.
18423 */
18424 if (buf->dtb_tomax == cached) {
18425 ASSERT(buf->dtb_xamot != cached);
18426 lck_mtx_unlock(&dtrace_lock);
18427 return (ENOENT);
18428 }
18429
18430 ASSERT(cached == buf->dtb_xamot);
/*
 * At this point we know the buffers have switched, so we
 * can decrement the over limit count if the buffer was over
 * its limit. The new buffer might already be over its limit,
 * but we don't care since we're guaranteed not to be
 * checking the buffer over limit count at this point.
 */
18438 if (over_limit) {
18439 uint32_t old = os_atomic_dec_orig(&state->dts_buf_over_limit, relaxed);
18440 #pragma unused(old)
18441
18442 /*
18443 * Verify that we didn't underflow the value
18444 */
18445 ASSERT(old != 0);
18446 }
18447
18448 /*
18449 * We have our snapshot; now copy it out.
18450 */
18451 if (dtrace_buffer_copyout(buf->dtb_xamot,
18452 (user_addr_t)desc.dtbd_data,
18453 buf->dtb_xamot_offset) != 0) {
18454 lck_mtx_unlock(&dtrace_lock);
18455 return (EFAULT);
18456 }
18457
18458 desc.dtbd_size = buf->dtb_xamot_offset;
18459 desc.dtbd_drops = buf->dtb_xamot_drops;
18460 desc.dtbd_errors = buf->dtb_xamot_errors;
18461 desc.dtbd_oldest = 0;
18462 desc.dtbd_timestamp = buf->dtb_switched;
18463
18464 lck_mtx_unlock(&dtrace_lock);
18465
18466 /*
18467 * Finally, copy out the buffer description.
18468 */
18469 if (copyout(&desc, arg, sizeof (desc)) != 0)
18470 return (EFAULT);
18471
18472 return (0);
18473 }
18474
18475 case DTRACEIOC_CONF: {
18476 dtrace_conf_t conf;
18477
18478 bzero(&conf, sizeof (conf));
18479 conf.dtc_difversion = DIF_VERSION;
18480 conf.dtc_difintregs = DIF_DIR_NREGS;
18481 conf.dtc_diftupregs = DIF_DTR_NREGS;
18482 conf.dtc_ctfmodel = CTF_MODEL_NATIVE;
18483
18484 if (copyout(&conf, arg, sizeof (conf)) != 0)
18485 return (EFAULT);
18486
18487 return (0);
18488 }
18489
18490 case DTRACEIOC_STATUS: {
18491 dtrace_status_t stat;
18492 dtrace_dstate_t *dstate;
18493 int j;
18494 uint64_t nerrs;
18495
18496 /*
18497 * See the comment in dtrace_state_deadman() for the reason
18498 * for setting dts_laststatus to INT64_MAX before setting
18499 * it to the correct value.
18500 */
18501 state->dts_laststatus = INT64_MAX;
18502 dtrace_membar_producer();
18503 state->dts_laststatus = dtrace_gethrtime();
18504
18505 bzero(&stat, sizeof (stat));
18506
18507 lck_mtx_lock(&dtrace_lock);
18508
18509 if (state->dts_activity == DTRACE_ACTIVITY_INACTIVE) {
18510 lck_mtx_unlock(&dtrace_lock);
18511 return (ENOENT);
18512 }
18513
18514 if (state->dts_activity == DTRACE_ACTIVITY_DRAINING)
18515 stat.dtst_exiting = 1;
18516
18517 nerrs = state->dts_errors;
18518 dstate = &state->dts_vstate.dtvs_dynvars;
18519
18520 zpercpu_foreach_cpu(i) {
18521 dtrace_dstate_percpu_t *dcpu = zpercpu_get_cpu(dstate->dtds_percpu, i);
18522
18523 stat.dtst_dyndrops += dcpu->dtdsc_drops;
18524 stat.dtst_dyndrops_dirty += dcpu->dtdsc_dirty_drops;
18525 stat.dtst_dyndrops_rinsing += dcpu->dtdsc_rinsing_drops;
18526
18527 if (state->dts_buffer[i].dtb_flags & DTRACEBUF_FULL)
18528 stat.dtst_filled++;
18529
18530 nerrs += state->dts_buffer[i].dtb_errors;
18531
18532 for (j = 0; j < state->dts_nspeculations; j++) {
18533 dtrace_speculation_t *spec;
18534 dtrace_buffer_t *buf;
18535
18536 spec = &state->dts_speculations[j];
18537 buf = &spec->dtsp_buffer[i];
18538 stat.dtst_specdrops += buf->dtb_xamot_drops;
18539 }
18540 }
18541
18542 stat.dtst_specdrops_busy = state->dts_speculations_busy;
18543 stat.dtst_specdrops_unavail = state->dts_speculations_unavail;
18544 stat.dtst_stkstroverflows = state->dts_stkstroverflows;
18545 stat.dtst_dblerrors = state->dts_dblerrors;
18546 stat.dtst_killed =
18547 (state->dts_activity == DTRACE_ACTIVITY_KILLED);
18548 stat.dtst_errors = nerrs;
18549
18550 lck_mtx_unlock(&dtrace_lock);
18551
18552 if (copyout(&stat, arg, sizeof (stat)) != 0)
18553 return (EFAULT);
18554
18555 return (0);
18556 }
18557
18558 case DTRACEIOC_FORMAT: {
18559 dtrace_fmtdesc_t fmt;
18560 char *str;
18561 int len;
18562
18563 if (copyin(arg, &fmt, sizeof (fmt)) != 0)
18564 return (EFAULT);
18565
18566 lck_mtx_lock(&dtrace_lock);
18567
18568 if (fmt.dtfd_format == 0 ||
18569 fmt.dtfd_format > state->dts_nformats) {
18570 lck_mtx_unlock(&dtrace_lock);
18571 return (EINVAL);
18572 }
18573
18574 /*
18575 * Format strings are allocated contiguously and they are
18576 * never freed; if a format index is less than the number
18577 * of formats, we can assert that the format map is non-NULL
18578 * and that the format for the specified index is non-NULL.
18579 */
18580 ASSERT(state->dts_formats != NULL);
18581 str = state->dts_formats[fmt.dtfd_format - 1]->dtf_str;
18582 ASSERT(str != NULL);
18583
18584 len = strlen(str) + 1;
18585
18586 if (len > fmt.dtfd_length) {
18587 fmt.dtfd_length = len;
18588
18589 if (copyout(&fmt, arg, sizeof (fmt)) != 0) {
18590 lck_mtx_unlock(&dtrace_lock);
18591 return (EINVAL);
18592 }
18593 } else {
18594 if (copyout(str, (user_addr_t)fmt.dtfd_string, len) != 0) {
18595 lck_mtx_unlock(&dtrace_lock);
18596 return (EINVAL);
18597 }
18598 }
18599
18600 lck_mtx_unlock(&dtrace_lock);
18601 return (0);
18602 }
18603
18604 case DTRACEIOC_MODUUIDSLIST: {
18605 size_t module_uuids_list_size;
18606 dtrace_module_uuids_list_t* uuids_list;
18607 uint64_t dtmul_count;
18608
18609 /*
18610 * Security restrictions make this operation illegal, if this is enabled DTrace
18611 * must refuse to provide any fbt probes.
18612 */
18613 if (dtrace_fbt_probes_restricted()) {
18614 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18615 return (EPERM);
18616 }
18617
18618 /*
18619 * Fail if the kernel symbol mode makes this operation illegal.
18620 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18621 * for them without holding the dtrace_lock.
18622 */
18623 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18624 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18625 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_MODUUIDSLIST", dtrace_kernel_symbol_mode);
18626 return (EPERM);
18627 }
18628
18629 /*
18630 * Read the number of symbolsdesc structs being passed in.
18631 */
18632 if (copyin(arg + offsetof(dtrace_module_uuids_list_t, dtmul_count),
18633 &dtmul_count,
18634 sizeof(dtmul_count))) {
18635 cmn_err(CE_WARN, "failed to copyin dtmul_count");
18636 return (EFAULT);
18637 }
18638
18639 /*
18640 * Range check the count. More than 2k kexts is probably an error.
18641 */
18642 if (dtmul_count > 2048) {
18643 cmn_err(CE_WARN, "dtmul_count is not valid");
18644 return (EINVAL);
18645 }
18646
18647 /*
18648 * For all queries, we return EINVAL when the user specified
18649 * count does not match the actual number of modules we find
18650 * available.
18651 *
18652 * If the user specified count is zero, then this serves as a
18653 * simple query to count the available modules in need of symbols.
18654 */
18655
18656 rval = 0;
18657
18658 if (dtmul_count == 0)
18659 {
18660 lck_mtx_lock(&mod_lock);
18661 struct modctl* ctl = dtrace_modctl_list;
18662 while (ctl) {
18663 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18664 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18665 dtmul_count++;
18666 rval = EINVAL;
18667 }
18668 ctl = ctl->mod_next;
18669 }
18670 lck_mtx_unlock(&mod_lock);
18671
18672 if (copyout(&dtmul_count, arg, sizeof (dtmul_count)) != 0)
18673 return (EFAULT);
18674 else
18675 return (rval);
18676 }
18677
18678 /*
18679 * If we reach this point, then we have a request for full list data.
18680 * Allocate a correctly sized structure and copyin the data.
18681 */
18682 module_uuids_list_size = DTRACE_MODULE_UUIDS_LIST_SIZE(dtmul_count);
18683 if ((uuids_list = kmem_alloc(module_uuids_list_size, KM_SLEEP)) == NULL)
18684 return (ENOMEM);
18685
18686 /* NOTE! We can no longer exit this method via return */
18687 if (copyin(arg, uuids_list, module_uuids_list_size) != 0) {
18688 cmn_err(CE_WARN, "failed copyin of dtrace_module_uuids_list_t");
18689 rval = EFAULT;
18690 goto moduuidslist_cleanup;
18691 }
18692
18693 /*
18694 * Check that the count didn't change between the first copyin and the second.
18695 */
18696 if (uuids_list->dtmul_count != dtmul_count) {
18697 rval = EINVAL;
18698 goto moduuidslist_cleanup;
18699 }
18700
18701 /*
18702 * Build the list of UUID's that need symbols
18703 */
18704 lck_mtx_lock(&mod_lock);
18705
18706 dtmul_count = 0;
18707
18708 struct modctl* ctl = dtrace_modctl_list;
18709 while (ctl) {
18710 /*
18711 * We assume that userspace symbols will be "better" than kernel level symbols,
18712 * as userspace can search for dSYM(s) and symbol'd binaries. Even if kernel syms
18713 * are available, add user syms if the module might use them.
18714 */
18715 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18716 if (!MOD_SYMBOLS_DONE(ctl) && !MOD_IS_STATIC_KEXT(ctl)) {
18717 UUID* uuid = &uuids_list->dtmul_uuid[dtmul_count];
18718 if (dtmul_count++ < uuids_list->dtmul_count) {
18719 memcpy(uuid, ctl->mod_uuid, sizeof(UUID));
18720 }
18721 }
18722 ctl = ctl->mod_next;
18723 }
18724
18725 lck_mtx_unlock(&mod_lock);
18726
18727 if (uuids_list->dtmul_count < dtmul_count)
18728 rval = EINVAL;
18729
18730 uuids_list->dtmul_count = dtmul_count;
18731
18732 /*
18733 * Copyout the symbols list (or at least the count!)
18734 */
18735 if (copyout(uuids_list, arg, module_uuids_list_size) != 0) {
18736 cmn_err(CE_WARN, "failed copyout of dtrace_symbolsdesc_list_t");
18737 rval = EFAULT;
18738 }
18739
18740 moduuidslist_cleanup:
18741 /*
18742 * If we had to allocate struct memory, free it.
18743 */
18744 if (uuids_list != NULL) {
18745 kmem_free(uuids_list, module_uuids_list_size);
18746 }
18747
18748 return rval;
18749 }
18750
18751 case DTRACEIOC_PROVMODSYMS: {
18752 size_t module_symbols_size;
18753 dtrace_module_symbols_t* module_symbols;
18754 uint64_t dtmodsyms_count;
18755
18756 /*
18757 * Security restrictions make this operation illegal, if this is enabled DTrace
18758 * must refuse to provide any fbt probes.
18759 */
18760 if (dtrace_fbt_probes_restricted()) {
18761 cmn_err(CE_WARN, "security restrictions disallow DTRACEIOC_MODUUIDSLIST");
18762 return (EPERM);
18763 }
18764
18765 /*
18766 * Fail if the kernel symbol mode makes this operation illegal.
18767 * Both NEVER & ALWAYS_FROM_KERNEL are permanent states, it is legal to check
18768 * for them without holding the dtrace_lock.
18769 */
18770 if (dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_NEVER ||
18771 dtrace_kernel_symbol_mode == DTRACE_KERNEL_SYMBOLS_ALWAYS_FROM_KERNEL) {
18772 cmn_err(CE_WARN, "dtrace_kernel_symbol_mode of %u disallows DTRACEIOC_PROVMODSYMS", dtrace_kernel_symbol_mode);
18773 return (EPERM);
18774 }
18775
18776 /*
18777 * Read the number of module symbols structs being passed in.
18778 */
18779 if (copyin(arg + offsetof(dtrace_module_symbols_t, dtmodsyms_count),
18780 &dtmodsyms_count,
18781 sizeof(dtmodsyms_count))) {
18782 cmn_err(CE_WARN, "failed to copyin dtmodsyms_count");
18783 return (EFAULT);
18784 }
18785
18786 /* Ensure that we have at least one symbol. */
18787 if (dtmodsyms_count == 0) {
18788 cmn_err(CE_WARN, "Invalid dtmodsyms_count value");
18789 return (EINVAL);
18790 }
18791
18792 /* Safely calculate size we need for copyin buffer. */
18793 module_symbols_size = DTRACE_MODULE_SYMBOLS_SIZE(dtmodsyms_count);
18794 if (module_symbols_size == 0 || module_symbols_size > (size_t)dtrace_copy_maxsize()) {
18795 cmn_err(CE_WARN, "Invalid module_symbols_size %ld", module_symbols_size);
18796 return (EINVAL);
18797 }
18798
18799 if ((module_symbols = kmem_alloc(module_symbols_size, KM_SLEEP)) == NULL)
18800 return (ENOMEM);
18801
18802 rval = 0;
18803
18804 /* NOTE! We can no longer exit this method via return */
18805 if (copyin(arg, module_symbols, module_symbols_size) != 0) {
18806 cmn_err(CE_WARN, "failed copyin of dtrace_module_symbols_t");
18807 rval = EFAULT;
18808 goto module_symbols_cleanup;
18809 }
18810
18811 /*
18812 * Check that the count didn't change between the first copyin and the second.
18813 */
18814 if (module_symbols->dtmodsyms_count != dtmodsyms_count) {
18815 rval = EINVAL;
18816 goto module_symbols_cleanup;
18817 }
18818
18819 /*
18820 * Find the modctl to add symbols to.
18821 */
18822 lck_mtx_lock(&dtrace_provider_lock);
18823 lck_mtx_lock(&mod_lock);
18824
18825 struct modctl* ctl = dtrace_modctl_list;
18826 while (ctl) {
18827 ASSERT(!MOD_HAS_USERSPACE_SYMBOLS(ctl));
18828 if (MOD_HAS_UUID(ctl) && !MOD_SYMBOLS_DONE(ctl) && memcmp(module_symbols->dtmodsyms_uuid, ctl->mod_uuid, sizeof(UUID)) == 0) {
18829 dtrace_provider_t *prv;
18830 ctl->mod_user_symbols = module_symbols;
18831
18832 /*
18833 * We're going to call each providers per-module provide operation
18834 * specifying only this module.
18835 */
18836 for (prv = dtrace_provider; prv != NULL; prv = prv->dtpv_next)
18837 prv->dtpv_pops.dtps_provide_module(prv->dtpv_arg, ctl);
18838 /*
18839 * We gave every provider a chance to provide with the user syms, go ahead and clear them
18840 */
18841 ctl->mod_user_symbols = NULL; /* MUST reset this to clear HAS_USERSPACE_SYMBOLS */
18842 }
18843 ctl = ctl->mod_next;
18844 }
18845
18846 lck_mtx_unlock(&mod_lock);
18847 lck_mtx_unlock(&dtrace_provider_lock);
18848
18849 module_symbols_cleanup:
18850 /*
18851 * If we had to allocate struct memory, free it.
18852 */
18853 if (module_symbols != NULL) {
18854 kmem_free(module_symbols, module_symbols_size);
18855 }
18856
18857 return rval;
18858 }
18859
18860 case DTRACEIOC_PROCWAITFOR: {
18861 dtrace_procdesc_t pdesc = {
18862 .p_name = {0},
18863 .p_pid = -1
18864 };
18865
18866 if ((rval = copyin(arg, &pdesc, sizeof(pdesc))) != 0)
18867 goto proc_waitfor_error;
18868
18869 if ((rval = dtrace_proc_waitfor(&pdesc)) != 0)
18870 goto proc_waitfor_error;
18871
18872 if ((rval = copyout(&pdesc, arg, sizeof(pdesc))) != 0)
18873 goto proc_waitfor_error;
18874
18875 return 0;
18876
18877 proc_waitfor_error:
18878 /* The process was suspended, revert this since the client will not do it. */
18879 if (pdesc.p_pid != -1) {
18880 proc_t *proc = proc_find(pdesc.p_pid);
18881 if (proc != PROC_NULL) {
18882 task_pidresume(proc->task);
18883 proc_rele(proc);
18884 }
18885 }
18886
18887 return rval;
18888 }
18889
18890 default:
18891 break;
18892 }
18893
18894 return (ENOTTY);
18895 }
18896
18897 /*
18898 * APPLE NOTE: dtrace_detach not implemented
18899 */
18900 #if !defined(__APPLE__)
18901 /*ARGSUSED*/
18902 static int
dtrace_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)18903 dtrace_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
18904 {
18905 dtrace_state_t *state;
18906
18907 switch (cmd) {
18908 case DDI_DETACH:
18909 break;
18910
18911 case DDI_SUSPEND:
18912 return (DDI_SUCCESS);
18913
18914 default:
18915 return (DDI_FAILURE);
18916 }
18917
18918 lck_mtx_lock(&cpu_lock);
18919 lck_mtx_lock(&dtrace_provider_lock);
18920 lck_mtx_lock(&dtrace_lock);
18921
18922 ASSERT(dtrace_opens == 0);
18923
18924 if (dtrace_helpers > 0) {
18925 lck_mtx_unlock(&dtrace_lock);
18926 lck_mtx_unlock(&dtrace_provider_lock);
18927 lck_mtx_unlock(&cpu_lock);
18928 return (DDI_FAILURE);
18929 }
18930
18931 if (dtrace_unregister((dtrace_provider_id_t)dtrace_provider) != 0) {
18932 lck_mtx_unlock(&dtrace_lock);
18933 lck_mtx_unlock(&dtrace_provider_lock);
18934 lck_mtx_unlock(&cpu_lock);
18935 return (DDI_FAILURE);
18936 }
18937
18938 dtrace_provider = NULL;
18939
18940 if ((state = dtrace_anon_grab()) != NULL) {
18941 /*
18942 * If there were ECBs on this state, the provider should
18943 * have not been allowed to detach; assert that there is
18944 * none.
18945 */
18946 ASSERT(state->dts_necbs == 0);
18947 dtrace_state_destroy(state);
18948
18949 /*
18950 * If we're being detached with anonymous state, we need to
18951 * indicate to the kernel debugger that DTrace is now inactive.
18952 */
18953 (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE);
18954 }
18955
18956 bzero(&dtrace_anon, sizeof (dtrace_anon_t));
18957 unregister_cpu_setup_func((cpu_setup_func_t *)dtrace_cpu_setup, NULL);
18958 dtrace_cpu_init = NULL;
18959 dtrace_helpers_cleanup = NULL;
18960 dtrace_helpers_fork = NULL;
18961 dtrace_cpustart_init = NULL;
18962 dtrace_cpustart_fini = NULL;
18963 dtrace_debugger_init = NULL;
18964 dtrace_debugger_fini = NULL;
18965 dtrace_kreloc_init = NULL;
18966 dtrace_kreloc_fini = NULL;
18967 dtrace_modload = NULL;
18968 dtrace_modunload = NULL;
18969
18970 lck_mtx_unlock(&cpu_lock);
18971
18972 if (dtrace_helptrace_enabled) {
18973 kmem_free(dtrace_helptrace_buffer, dtrace_helptrace_bufsize);
18974 dtrace_helptrace_buffer = NULL;
18975 }
18976
18977 kmem_free(dtrace_probes, dtrace_nprobes * sizeof (dtrace_probe_t *));
18978 dtrace_probes = NULL;
18979 dtrace_nprobes = 0;
18980
18981 dtrace_hash_destroy(dtrace_strings);
18982 dtrace_hash_destroy(dtrace_byprov);
18983 dtrace_hash_destroy(dtrace_bymod);
18984 dtrace_hash_destroy(dtrace_byfunc);
18985 dtrace_hash_destroy(dtrace_byname);
18986 dtrace_strings = NULL;
18987 dtrace_byprov = NULL;
18988 dtrace_bymod = NULL;
18989 dtrace_byfunc = NULL;
18990 dtrace_byname = NULL;
18991
18992 kmem_cache_destroy(dtrace_state_cache);
18993 vmem_destroy(dtrace_arena);
18994
18995 if (dtrace_toxrange != NULL) {
18996 kmem_free(dtrace_toxrange,
18997 dtrace_toxranges_max * sizeof (dtrace_toxrange_t));
18998 dtrace_toxrange = NULL;
18999 dtrace_toxranges = 0;
19000 dtrace_toxranges_max = 0;
19001 }
19002
19003 ddi_remove_minor_node(dtrace_devi, NULL);
19004 dtrace_devi = NULL;
19005
19006 ddi_soft_state_fini(&dtrace_softstate);
19007
19008 ASSERT(dtrace_vtime_references == 0);
19009 ASSERT(dtrace_opens == 0);
19010 ASSERT(dtrace_retained == NULL);
19011
19012 lck_mtx_unlock(&dtrace_lock);
19013 lck_mtx_unlock(&dtrace_provider_lock);
19014
19015 #ifdef illumos
19016 /*
19017 * We don't destroy the task queue until after we have dropped our
19018 * locks (taskq_destroy() may block on running tasks). To prevent
19019 * attempting to do work after we have effectively detached but before
19020 * the task queue has been destroyed, all tasks dispatched via the
19021 * task queue must check that DTrace is still attached before
19022 * performing any operation.
19023 */
19024 taskq_destroy(dtrace_taskq);
19025 dtrace_taskq = NULL;
19026 #endif
19027
19028 return (DDI_SUCCESS);
19029 }
19030 #endif /* __APPLE__ */
19031
19032 d_open_t _dtrace_open, helper_open;
19033 d_close_t _dtrace_close, helper_close;
19034 d_ioctl_t _dtrace_ioctl, helper_ioctl;
19035
19036 int
_dtrace_open(dev_t dev,int flags,int devtype,struct proc * p)19037 _dtrace_open(dev_t dev, int flags, int devtype, struct proc *p)
19038 {
19039 #pragma unused(p)
19040 dev_t locdev = dev;
19041
19042 return dtrace_open( &locdev, flags, devtype, CRED());
19043 }
19044
/*
 * open(2) entry point for the helper device.
 *
 * Nothing needs to be set up on open; every argument is ignored and the
 * call always succeeds (returns 0).
 */
int
helper_open(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return 0;
}
19051
19052 int
_dtrace_close(dev_t dev,int flags,int devtype,struct proc * p)19053 _dtrace_close(dev_t dev, int flags, int devtype, struct proc *p)
19054 {
19055 #pragma unused(p)
19056 return dtrace_close( dev, flags, devtype, CRED());
19057 }
19058
/*
 * close(2) entry point for the helper device.
 *
 * There is no per-open state to tear down; every argument is ignored and
 * the call always succeeds (returns 0).
 */
int
helper_close(dev_t dev, int flags, int devtype, struct proc *p)
{
	(void)dev;
	(void)flags;
	(void)devtype;
	(void)p;

	return 0;
}
19065
19066 int
_dtrace_ioctl(dev_t dev,u_long cmd,caddr_t data,int fflag,struct proc * p)19067 _dtrace_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19068 {
19069 #pragma unused(p)
19070 int err, rv = 0;
19071 user_addr_t uaddrp;
19072
19073 if (proc_is64bit(p))
19074 uaddrp = *(user_addr_t *)data;
19075 else
19076 uaddrp = (user_addr_t) *(uint32_t *)data;
19077
19078 err = dtrace_ioctl(dev, cmd, uaddrp, fflag, CRED(), &rv);
19079
19080 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19081 if (err != 0) {
19082 ASSERT( (err & 0xfffff000) == 0 );
19083 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19084 } else if (rv != 0) {
19085 ASSERT( (rv & 0xfff00000) == 0 );
19086 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19087 } else
19088 return 0;
19089 }
19090
19091 int
helper_ioctl(dev_t dev,u_long cmd,caddr_t data,int fflag,struct proc * p)19092 helper_ioctl(dev_t dev, u_long cmd, caddr_t data, int fflag, struct proc *p)
19093 {
19094 #pragma unused(dev,fflag,p)
19095 int err, rv = 0;
19096
19097 err = dtrace_ioctl_helper(cmd, data, &rv);
19098 /* Darwin's BSD ioctls only return -1 or zero. Overload errno to mimic Solaris. 20 bits suffice. */
19099 if (err != 0) {
19100 ASSERT( (err & 0xfffff000) == 0 );
19101 return (err & 0xfff); /* ioctl will return -1 and will set errno to an error code < 4096 */
19102 } else if (rv != 0) {
19103 ASSERT( (rv & 0xfff00000) == 0 );
19104 return (((rv & 0xfffff) << 12)); /* ioctl will return -1 and will set errno to a value >= 4096 */
19105 } else
19106 return 0;
19107 }
19108
19109 #define HELPER_MAJOR -24 /* let the kernel pick the device number */
19110
19111 const static struct cdevsw helper_cdevsw =
19112 {
19113 .d_open = helper_open,
19114 .d_close = helper_close,
19115 .d_read = eno_rdwrt,
19116 .d_write = eno_rdwrt,
19117 .d_ioctl = helper_ioctl,
19118 .d_stop = (stop_fcn_t *)nulldev,
19119 .d_reset = (reset_fcn_t *)nulldev,
19120 .d_select = eno_select,
19121 .d_mmap = eno_mmap,
19122 .d_strategy = eno_strat,
19123 .d_reserved_1 = eno_getc,
19124 .d_reserved_2 = eno_putc,
19125 };
19126
/*
 * Major number of the helper device.  Zero until helper_init() stores the
 * result of cdevsw_add() here.
 */
static int helper_majdevno;

/*
 * Zero until dtrace_init() has completed successfully, then 1.
 * helper_init() panics if it runs while this is still zero.
 */
static int gDTraceInited;
19130
19131 void
helper_init(void)19132 helper_init( void )
19133 {
19134 /*
19135 * Once the "helper" is initialized, it can take ioctl calls that use locks
19136 * and zones initialized in dtrace_init. Make certain dtrace_init was called
19137 * before us.
19138 */
19139
19140 if (!gDTraceInited) {
19141 panic("helper_init before dtrace_init");
19142 }
19143
19144 if (0 >= helper_majdevno)
19145 {
19146 helper_majdevno = cdevsw_add(HELPER_MAJOR, &helper_cdevsw);
19147
19148 if (helper_majdevno < 0) {
19149 printf("helper_init: failed to allocate a major number!\n");
19150 return;
19151 }
19152
19153 if (NULL == devfs_make_node( makedev(helper_majdevno, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19154 DTRACEMNR_HELPER )) {
19155 printf("dtrace_init: failed to devfs_make_node for helper!\n");
19156 return;
19157 }
19158 } else
19159 panic("helper_init: called twice!");
19160 }
19161
19162 #undef HELPER_MAJOR
19163
19164 static int
dtrace_clone_func(dev_t dev,int action)19165 dtrace_clone_func(dev_t dev, int action)
19166 {
19167 #pragma unused(dev)
19168
19169 if (action == DEVFS_CLONE_ALLOC) {
19170 return dtrace_state_reserve();
19171 }
19172 else if (action == DEVFS_CLONE_FREE) {
19173 return 0;
19174 }
19175 else return -1;
19176 }
19177
19178 void dtrace_ast(void);
19179
19180 void
dtrace_ast(void)19181 dtrace_ast(void)
19182 {
19183 int i;
19184 uint32_t clients = os_atomic_xchg(&dtrace_wake_clients, 0, relaxed);
19185 if (clients == 0)
19186 return;
19187 /**
19188 * We disable preemption here to be sure that we won't get
19189 * interrupted by a wakeup to a thread that is higher
19190 * priority than us, so that we do issue all wakeups
19191 */
19192 disable_preemption();
19193 for (i = 0; i < DTRACE_NCLIENTS; i++) {
19194 if (clients & (1 << i)) {
19195 dtrace_state_t *state = dtrace_state_get(i);
19196 if (state) {
19197 wakeup(state);
19198 }
19199
19200 }
19201 }
19202 enable_preemption();
19203 }
19204
19205
19206 #define DTRACE_MAJOR -24 /* let the kernel pick the device number */
19207
19208 static const struct cdevsw dtrace_cdevsw =
19209 {
19210 .d_open = _dtrace_open,
19211 .d_close = _dtrace_close,
19212 .d_read = eno_rdwrt,
19213 .d_write = eno_rdwrt,
19214 .d_ioctl = _dtrace_ioctl,
19215 .d_stop = (stop_fcn_t *)nulldev,
19216 .d_reset = (reset_fcn_t *)nulldev,
19217 .d_select = eno_select,
19218 .d_mmap = eno_mmap,
19219 .d_strategy = eno_strat,
19220 .d_reserved_1 = eno_getc,
19221 .d_reserved_2 = eno_putc,
19222 };
19223
19224 LCK_ATTR_DECLARE(dtrace_lck_attr, 0, 0);
19225 LCK_GRP_DECLARE(dtrace_lck_grp, "dtrace");
19226
19227 static int gMajDevNo;
19228
dtrace_early_init(void)19229 void dtrace_early_init (void)
19230 {
19231 dtrace_restriction_policy_load();
19232
19233 /*
19234 * See dtrace_impl.h for a description of kernel symbol modes.
19235 * The default is to wait for symbols from userspace (lazy symbols).
19236 */
19237 if (!PE_parse_boot_argn("dtrace_kernel_symbol_mode", &dtrace_kernel_symbol_mode, sizeof (dtrace_kernel_symbol_mode))) {
19238 dtrace_kernel_symbol_mode = DTRACE_KERNEL_SYMBOLS_FROM_USERSPACE;
19239 }
19240 }
19241
19242 void
dtrace_init(void)19243 dtrace_init( void )
19244 {
19245 if (0 == gDTraceInited) {
19246 unsigned int i, ncpu;
19247 size_t size = sizeof(dtrace_buffer_memory_maxsize);
19248
19249 /*
19250 * Disable destructive actions when dtrace is running
19251 * in a restricted environment
19252 */
19253 dtrace_destructive_disallow = dtrace_is_restricted() &&
19254 !dtrace_are_restrictions_relaxed();
19255
19256 /*
19257 * DTrace allocates buffers based on the maximum number
19258 * of enabled cpus. This call avoids any race when finding
19259 * that count.
19260 */
19261 ASSERT(dtrace_max_cpus == 0);
19262 ncpu = dtrace_max_cpus = ml_wait_max_cpus();
19263
19264 /*
19265 * Retrieve the size of the physical memory in order to define
19266 * the state buffer memory maximal size. If we cannot retrieve
19267 * this value, we'll consider that we have 1Gb of memory per CPU, that's
19268 * still better than raising a kernel panic.
19269 */
19270 if (0 != kernel_sysctlbyname("hw.memsize", &dtrace_buffer_memory_maxsize,
19271 &size, NULL, 0))
19272 {
19273 dtrace_buffer_memory_maxsize = ncpu * 1024 * 1024 * 1024;
19274 printf("dtrace_init: failed to retrieve the hw.memsize, defaulted to %lld bytes\n",
19275 dtrace_buffer_memory_maxsize);
19276 }
19277
19278 /*
19279 * Finally, divide by three to prevent DTrace from eating too
19280 * much memory.
19281 */
19282 dtrace_buffer_memory_maxsize /= 3;
19283 ASSERT(dtrace_buffer_memory_maxsize > 0);
19284
19285 gMajDevNo = cdevsw_add(DTRACE_MAJOR, &dtrace_cdevsw);
19286
19287 if (gMajDevNo < 0) {
19288 printf("dtrace_init: failed to allocate a major number!\n");
19289 gDTraceInited = 0;
19290 return;
19291 }
19292
19293 if (NULL == devfs_make_node_clone( makedev(gMajDevNo, 0), DEVFS_CHAR, UID_ROOT, GID_WHEEL, 0666,
19294 dtrace_clone_func, DTRACEMNR_DTRACE )) {
19295 printf("dtrace_init: failed to devfs_make_node_clone for dtrace!\n");
19296 gDTraceInited = 0;
19297 return;
19298 }
19299
19300 /*
19301 * The cpu_core structure consists of per-CPU state available in any context.
19302 * On some architectures, this may mean that the page(s) containing the
19303 * NCPU-sized array of cpu_core structures must be locked in the TLB -- it
19304 * is up to the platform to assure that this is performed properly. Note that
19305 * the structure is sized to avoid false sharing.
19306 */
19307
19308 /*
19309 * Initialize the CPU offline/online hooks.
19310 */
19311 dtrace_install_cpu_hooks();
19312
19313 dtrace_modctl_list = NULL;
19314
19315 cpu_core = (cpu_core_t *)kmem_zalloc( ncpu * sizeof(cpu_core_t), KM_SLEEP );
19316 for (i = 0; i < ncpu; ++i) {
19317 lck_mtx_init(&cpu_core[i].cpuc_pid_lock, &dtrace_lck_grp, &dtrace_lck_attr);
19318 }
19319
19320 cpu_list = (dtrace_cpu_t *)kmem_zalloc( ncpu * sizeof(dtrace_cpu_t), KM_SLEEP );
19321 for (i = 0; i < ncpu; ++i) {
19322 cpu_list[i].cpu_id = (processorid_t)i;
19323 cpu_list[i].cpu_next = &(cpu_list[(i+1) % ncpu]);
19324 LIST_INIT(&cpu_list[i].cpu_cyc_list);
19325 lck_rw_init(&cpu_list[i].cpu_ft_lock, &dtrace_lck_grp, &dtrace_lck_attr);
19326 }
19327
19328 lck_mtx_lock(&cpu_lock);
19329 for (i = 0; i < ncpu; ++i)
19330 /* FIXME: track CPU configuration */
19331 dtrace_cpu_setup_initial( (processorid_t)i ); /* In lieu of register_cpu_setup_func() callback */
19332 lck_mtx_unlock(&cpu_lock);
19333
19334 (void)dtrace_abs_to_nano(0LL); /* Force once only call to clock_timebase_info (which can take a lock) */
19335
19336 dtrace_strings = dtrace_hash_create(dtrace_strkey_offset,
19337 offsetof(dtrace_string_t, dtst_str),
19338 offsetof(dtrace_string_t, dtst_next),
19339 offsetof(dtrace_string_t, dtst_prev));
19340
19341 /*
19342 * See dtrace_impl.h for a description of dof modes.
19343 * The default is lazy dof.
19344 *
19345 * FIXME: Warn if state is LAZY_OFF? It won't break anything, but
19346 * makes no sense...
19347 */
19348 if (!PE_parse_boot_argn("dtrace_dof_mode", &dtrace_dof_mode, sizeof (dtrace_dof_mode))) {
19349 #if defined(XNU_TARGET_OS_OSX)
19350 dtrace_dof_mode = DTRACE_DOF_MODE_LAZY_ON;
19351 #else
19352 dtrace_dof_mode = DTRACE_DOF_MODE_NEVER;
19353 #endif
19354 }
19355
19356 /*
19357 * Sanity check of dof mode value.
19358 */
19359 switch (dtrace_dof_mode) {
19360 case DTRACE_DOF_MODE_NEVER:
19361 case DTRACE_DOF_MODE_LAZY_ON:
19362 /* valid modes, but nothing else we need to do */
19363 break;
19364
19365 case DTRACE_DOF_MODE_LAZY_OFF:
19366 case DTRACE_DOF_MODE_NON_LAZY:
19367 /* Cannot wait for a dtrace_open to init fasttrap */
19368 fasttrap_init();
19369 break;
19370
19371 default:
19372 /* Invalid, clamp to non lazy */
19373 dtrace_dof_mode = DTRACE_DOF_MODE_NON_LAZY;
19374 fasttrap_init();
19375 break;
19376 }
19377
19378 #if CONFIG_DTRACE
19379 if (dtrace_dof_mode != DTRACE_DOF_MODE_NEVER)
19380 commpage_update_dof(true);
19381 #endif
19382
19383 gDTraceInited = 1;
19384
19385 } else
19386 panic("dtrace_init: called twice!");
19387 }
19388
19389 void
dtrace_postinit(void)19390 dtrace_postinit(void)
19391 {
19392 /*
19393 * Called from bsd_init after all provider's *_init() routines have been
19394 * run. That way, anonymous DOF enabled under dtrace_attach() is safe
19395 * to go.
19396 */
19397 dtrace_attach( (dev_info_t *)(uintptr_t)makedev(gMajDevNo, 0)); /* Punning a dev_t to a dev_info_t* */
19398
19399 /*
19400 * Add the mach_kernel to the module list for lazy processing
19401 */
19402 struct kmod_info fake_kernel_kmod;
19403 memset(&fake_kernel_kmod, 0, sizeof(fake_kernel_kmod));
19404
19405 strlcpy(fake_kernel_kmod.name, "mach_kernel", sizeof(fake_kernel_kmod.name));
19406 fake_kernel_kmod.id = 1;
19407 fake_kernel_kmod.address = g_kernel_kmod_info.address;
19408 fake_kernel_kmod.size = g_kernel_kmod_info.size;
19409
19410 /* Ensure we don't try to touch symbols if they are gone. */
19411 boolean_t keepsyms = false;
19412 PE_parse_boot_argn("keepsyms", &keepsyms, sizeof(keepsyms));
19413
19414 if (dtrace_module_loaded(&fake_kernel_kmod, (keepsyms) ? 0 : KMOD_DTRACE_NO_KERNEL_SYMS) != 0) {
19415 printf("dtrace_postinit: Could not register mach_kernel modctl\n");
19416 }
19417
19418 (void)OSKextRegisterKextsWithDTrace();
19419 }
19420 #undef DTRACE_MAJOR
19421
19422 /*
19423 * Routines used to register interest in cpu's being added to or removed
19424 * from the system.
19425 */
19426 void
register_cpu_setup_func(cpu_setup_func_t * ignore1,void * ignore2)19427 register_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19428 {
19429 #pragma unused(ignore1,ignore2)
19430 }
19431
19432 void
unregister_cpu_setup_func(cpu_setup_func_t * ignore1,void * ignore2)19433 unregister_cpu_setup_func(cpu_setup_func_t *ignore1, void *ignore2)
19434 {
19435 #pragma unused(ignore1,ignore2)
19436 }
19437