xref: /xnu-12377.81.4/bsd/kern/kdebug.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
1 /*
2  * Copyright (c) 2000-2021 Apple Inc. All rights reserved.
3  *
4  * @Apple_LICENSE_HEADER_START@
5  *
6  * The contents of this file constitute Original Code as defined in and
7  * are subject to the Apple Public Source License Version 1.1 (the
8  * "License").  You may not use this file except in compliance with the
9  * License.  Please obtain a copy of the License at
10  * http://www.apple.com/publicsource and read it before using this file.
11  *
12  * This Original Code and all software distributed under the License are
13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
17  * License for the specific language governing rights and limitations
18  * under the License.
19  *
20  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
21  */
22 
23 #include <sys/errno.h>
24 #include <sys/kdebug_private.h>
25 #include <sys/proc_internal.h>
26 #include <sys/vm.h>
27 #include <sys/sysctl.h>
28 #include <sys/kdebug_common.h>
29 #include <sys/kdebug.h>
30 #include <sys/kdebug_triage.h>
31 #include <sys/kauth.h>
32 #include <sys/ktrace.h>
33 #include <sys/sysproto.h>
34 #include <sys/bsdtask_info.h>
35 #include <sys/random.h>
36 
37 #include <mach/mach_vm.h>
38 #include <machine/atomic.h>
39 
40 #include <mach/machine.h>
41 #include <mach/vm_map.h>
42 #include <kern/clock.h>
43 
44 #include <kern/task.h>
45 #include <kern/debug.h>
46 #include <kern/kalloc.h>
47 #include <kern/telemetry.h>
48 #include <kern/sched_prim.h>
49 #include <sys/lock.h>
50 #include <pexpert/device_tree.h>
51 #include <os/atomic.h>
52 
53 #include <sys/malloc.h>
54 
55 #include <sys/vnode.h>
56 #include <sys/vnode_internal.h>
57 #include <sys/fcntl.h>
58 #include <sys/file_internal.h>
59 #include <sys/ubc.h>
60 #include <sys/param.h>                  /* for isset() */
61 
62 #include <vm/vm_kern_xnu.h>
63 #include <vm/vm_map_xnu.h>
64 
65 #include <libkern/OSAtomic.h>
66 
67 #include <machine/pal_routines.h>
68 #include <machine/atomic.h>
69 
70 
71 extern unsigned int wake_nkdbufs;
72 extern unsigned int trace_wrap;
73 
74 // Coprocessors (or "IOP"s)
75 //
76 // Coprocessors are auxiliary cores that want to participate in kdebug event
77 // logging.  They are registered dynamically, as devices match hardware, and are
78 // each assigned an ID at registration.
79 //
80 // Once registered, a coprocessor is permanent; it cannot be unregistered.
81 // The current implementation depends on this for thread safety.
82 //
83 // The `kd_coprocs` list may be safely walked at any time, without holding
84 // locks.
85 //
86 // When starting a trace session, the current `kd_coprocs` head is captured. Any
87 // operations that depend on the buffer state (such as flushing IOP traces on
88 // reads, etc.) should use the captured list head. This will allow registrations
89 // to take place while trace is in use, though their events will be rejected
90 // until the next time a trace session is started.
91 
// A registered coprocessor ("IOP").  Entries are allocated permanently and
// prepended to a singly-linked list, so the list is ordered newest-first.
struct kd_coproc {
	char                  full_name[32];   // registration name, NUL-terminated (truncated by strlcpy)
	kdebug_coproc_flags_t flags;           // e.g. KDCP_CONTINUOUS_TIME
	kd_callback_t         callback;        // notified of trace state changes, with its context
	uint32_t              cpu_id;          // pseudo-CPU ID assigned at registration
	struct kd_coproc     *next;            // next (earlier-registered) coprocessor
	struct mpsc_queue_chain chain;         // linkage for _coproc_notify_queue
};
100 
101 static struct kd_coproc *kd_coprocs = NULL;
102 
103 // Use an MPSC queue to notify coprocessors of the current trace state during
104 // registration, if space is available for them in the current trace session.
105 static struct mpsc_daemon_queue _coproc_notify_queue;
106 
107 // Typefilter(s)
108 //
109 // A typefilter is a 8KB bitmap that is used to selectively filter events
110 // being recorded. It is able to individually address every class & subclass.
111 //
112 // There is a shared typefilter in the kernel which is lazily allocated. Once
113 // allocated, the shared typefilter is never deallocated. The shared typefilter
114 // is also mapped on demand into userspace processes that invoke kdebug_trace
115 // API from Libsyscall. When mapped into a userspace process, the memory is
116 // read only, and does not have a fixed address.
117 //
118 // It is a requirement that the kernel's shared typefilter always pass DBG_TRACE
119 // events. This is enforced automatically, by having the needed bits set any
120 // time the shared typefilter is mutated.
121 
122 typedef uint8_t *typefilter_t;
123 
124 static typefilter_t kdbg_typefilter;
125 static mach_port_t kdbg_typefilter_memory_entry;
126 
127 /*
128  * There are 3 combinations of page sizes:
129  *
130  *  4KB /  4KB
131  *  4KB / 16KB
132  * 16KB / 16KB
133  *
134  * The typefilter is exactly 8KB. In the first two scenarios, we would like
135  * to use 2 pages exactly; in the third scenario we must make certain that
136  * a full page is allocated so we do not inadvertently share 8KB of random
137  * data to userspace. The round_page_32 macro rounds to kernel page size.
138  */
139 #define TYPEFILTER_ALLOC_SIZE MAX(round_page_32(KDBG_TYPEFILTER_BITMAP_SIZE), KDBG_TYPEFILTER_BITMAP_SIZE)
140 
141 static typefilter_t
typefilter_create(void)142 typefilter_create(void)
143 {
144 	typefilter_t tf;
145 	if (KERN_SUCCESS == kmem_alloc(kernel_map, (vm_offset_t*)&tf,
146 	    TYPEFILTER_ALLOC_SIZE, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG)) {
147 		return tf;
148 	}
149 	return NULL;
150 }
151 
// Free a scratch typefilter.  The shared kdbg_typefilter is never freed once
// published, hence the assertion.
static void
typefilter_deallocate(typefilter_t tf)
{
	assert(tf != NULL);
	assert(tf != kdbg_typefilter);
	kmem_free(kernel_map, (vm_offset_t)tf, TYPEFILTER_ALLOC_SIZE);
}
159 
// Copy the bitmap payload (KDBG_TYPEFILTER_BITMAP_SIZE bytes) from one
// typefilter to another; any allocation padding beyond the bitmap is not
// copied.
static void
typefilter_copy(typefilter_t dst, typefilter_t src)
{
	assert(src != NULL);
	assert(dst != NULL);
	memcpy(dst, src, KDBG_TYPEFILTER_BITMAP_SIZE);
}
167 
// Clear every bit in the typefilter, rejecting all events.
static void
typefilter_reject_all(typefilter_t tf)
{
	assert(tf != NULL);
	memset(tf, 0, KDBG_TYPEFILTER_BITMAP_SIZE);
}
174 
// Set every bit in the typefilter, allowing all events.
static void
typefilter_allow_all(typefilter_t tf)
{
	assert(tf != NULL);
	memset(tf, ~0, KDBG_TYPEFILTER_BITMAP_SIZE);
}
181 
182 static void
typefilter_allow_class(typefilter_t tf,uint8_t class)183 typefilter_allow_class(typefilter_t tf, uint8_t class)
184 {
185 	assert(tf != NULL);
186 	const uint32_t BYTES_PER_CLASS = 256 / 8; // 256 subclasses, 1 bit each
187 	memset(&tf[class * BYTES_PER_CLASS], 0xFF, BYTES_PER_CLASS);
188 }
189 
// Allow a single class/subclass pair by setting its bit in the bitmap.
static void
typefilter_allow_csc(typefilter_t tf, uint16_t csc)
{
	assert(tf != NULL);
	setbit(tf, csc);
}
196 
// Check whether the debug ID's class/subclass bit is set in the typefilter.
static bool
typefilter_is_debugid_allowed(typefilter_t tf, uint32_t id)
{
	assert(tf != NULL);
	return isset(tf, KDBG_EXTRACT_CSC(id));
}
203 
// Create a read-only named memory entry covering the typefilter allocation so
// it can be mapped into user processes.  Returns MACH_PORT_NULL on failure.
static mach_port_t
typefilter_create_memory_entry(typefilter_t tf)
{
	assert(tf != NULL);

	mach_port_t memory_entry = MACH_PORT_NULL;
	memory_object_size_t size = TYPEFILTER_ALLOC_SIZE;

	kern_return_t kr = mach_make_memory_entry_64(kernel_map,
	    &size,
	    (memory_object_offset_t)tf,
	    VM_PROT_READ,       // user mappings of the filter are read-only
	    &memory_entry,
	    MACH_PORT_NULL);
	if (kr != KERN_SUCCESS) {
		return MACH_PORT_NULL;
	}

	return memory_entry;
}
224 
225 static int  kdbg_copyin_typefilter(user_addr_t addr, size_t size);
226 static void kdbg_enable_typefilter(void);
227 static void kdbg_disable_typefilter(void);
228 
229 // External prototypes
230 
231 void commpage_update_kdebug_state(void);
232 
233 static int kdbg_readcurthrmap(user_addr_t, size_t *);
234 static int kdbg_setpidex(kd_regtype *);
235 static int kdbg_setpid(kd_regtype *);
236 static int kdbg_reinit(unsigned int extra_cpus);
237 #if DEVELOPMENT || DEBUG
238 static int kdbg_test(size_t flavor);
239 #endif /* DEVELOPMENT || DEBUG */
240 
241 static int kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size);
242 static void _clear_thread_map(void);
243 
244 static bool kdbg_wait(uint64_t timeout_ms);
245 
246 static void _try_wakeup_waiter(void);
247 static void _wakeup_waiter(void);
248 
249 static int _copy_cpu_map(int version, void **dst, size_t *size);
250 
251 static kd_threadmap *_thread_map_create_live(size_t max_count,
252     vm_size_t *map_size, vm_size_t *map_count);
253 
254 static bool kdebug_current_proc_enabled(uint32_t debugid);
255 static errno_t kdebug_check_trace_string(uint32_t debugid, uint64_t str_id);
256 
257 #define RAW_FLUSH_SIZE (2 * 1024 * 1024)
258 
// Kinds of destination that trace data can be sent to.
__enum_closed_decl(kd_dest_kind_t, uint32_t, {
	KD_DEST_COPYOUT = 0x1,  // copy out to a user-space buffer
	KD_DEST_VFS = 0x2,      // write to a file through the VFS
});

// A destination for trace data; `kdd_kind` selects which union arm is valid.
struct kd_dest {
	kd_dest_kind_t kdd_kind;
	bool kdd_chunk_format;  // presumably selects the chunked output format -- TODO confirm with callers
	off_t kdd_cur_offset;   // current offset into the destination
	union {
		// KD_DEST_COPYOUT
		struct {
			user_addr_t kdd_user_buffer;
			size_t kdd_user_size;
		};
		// KD_DEST_VFS
		struct {
			struct vfs_context kdd_vfs_ctx;
			vnode_t kdd_vnode;
			off_t kdd_file_written_since_flush;
		};
	};
};
280 
281 static inline struct kd_dest
kd_dest_copyout(user_addr_t buf,size_t size)282 kd_dest_copyout(user_addr_t buf, size_t size)
283 {
284 	return (struct kd_dest){
285 		       .kdd_kind = KD_DEST_COPYOUT,
286 		       .kdd_user_buffer = buf,
287 		       .kdd_user_size = size,
288 	};
289 }
290 
// Prepare `dest` to write trace data to the vnode backing file descriptor
// `fd`.  On success returns 0 with a held fileproc in `*fp_out` and an
// iocount on the vnode; the caller must balance with kd_dest_finish_write().
// Returns EBADF if `fd` does not refer to a vnode, or the error from
// vnode_getwithref().
static inline int
kd_dest_init_write(struct kd_dest *dest, int fd, struct fileproc **fp_out)
{
	dest->kdd_kind = KD_DEST_VFS;
	proc_t p = current_proc();
	struct fileproc *fp = NULL;
	if (fp_get_ftype(p, fd, DTYPE_VNODE, EBADF, &fp)) {
		return EBADF;
	}

	dest->kdd_vnode = fp_get_data(fp);
	int error = vnode_getwithref(dest->kdd_vnode);
	if (error != 0) {
		// Drop the fileproc reference taken by fp_get_ftype().
		fp_drop(p, fd, fp, 0);
		return error;
	}
	// Capture the caller's credentials and current file offset; writes
	// advance kdd_cur_offset rather than the file glob directly.
	dest->kdd_vfs_ctx.vc_thread = current_thread();
	dest->kdd_vfs_ctx.vc_ucred = fp->fp_glob->fg_cred;
	dest->kdd_cur_offset = fp->fp_glob->fg_offset;
	*fp_out = fp;
	return 0;
}
313 
// Complete a VFS write session started by kd_dest_init_write(): propagate the
// final offset back to the file, release the vnode iocount, and drop the
// fileproc reference.
static inline void
kd_dest_finish_write(struct kd_dest *dest, struct fileproc *fp, int fd)
{
	fp->fp_glob->fg_offset = dest->kdd_cur_offset;
	vnode_put(dest->kdd_vnode);
	fp_drop(current_proc(), fd, fp, 0);
}
321 
322 static int _send_events(struct kd_dest *dest, const void *src,
323     size_t event_count);
324 static int kdbg_write_thread_map(struct kd_dest *dest);
325 static int _write_legacy_header(bool write_thread_map, struct kd_dest *dest);
326 
327 extern void IOSleep(int);
328 
329 unsigned int kdebug_enable = 0;
330 
331 // A static buffer to record events prior to the start of regular logging.
332 
333 #define KD_EARLY_BUFFER_SIZE (16 * 1024)
334 #define KD_EARLY_EVENT_COUNT (KD_EARLY_BUFFER_SIZE / sizeof(kd_buf))
335 #if defined(__x86_64__)
336 __attribute__((aligned(KD_EARLY_BUFFER_SIZE)))
337 static kd_buf kd_early_buffer[KD_EARLY_EVENT_COUNT];
338 #else /* defined(__x86_64__) */
339 // On ARM, the space for this is carved out by osfmk/arm/data.s -- clang
340 // has problems aligning to greater than 4K.
341 extern kd_buf kd_early_buffer[KD_EARLY_EVENT_COUNT];
342 #endif /* !defined(__x86_64__) */
343 
344 static __security_const_late unsigned int kd_early_index = 0;
345 static __security_const_late bool kd_early_overflow = false;
346 static __security_const_late bool kd_early_done = false;
347 
348 static bool kd_waiter = false;
349 static LCK_SPIN_DECLARE(kd_wait_lock, &kdebug_lck_grp);
350 // Synchronize access to coprocessor list for kdebug trace.
351 static LCK_SPIN_DECLARE(kd_coproc_spinlock, &kdebug_lck_grp);
352 
353 #define TRACE_KDCOPYBUF_COUNT 8192
354 #define TRACE_KDCOPYBUF_SIZE  (TRACE_KDCOPYBUF_COUNT * sizeof(kd_buf))
355 
356 struct kd_control kd_control_trace = {
357 	.kds_free_list = {.raw = KDS_PTR_NULL},
358 	.enabled = 0,
359 	.mode = KDEBUG_MODE_TRACE,
360 	.kdebug_events_per_storage_unit = TRACE_EVENTS_PER_STORAGE_UNIT,
361 	.kdebug_min_storage_units_per_cpu = TRACE_MIN_STORAGE_UNITS_PER_CPU,
362 	.kdebug_kdcopybuf_count = TRACE_KDCOPYBUF_COUNT,
363 	.kdebug_kdcopybuf_size = TRACE_KDCOPYBUF_SIZE,
364 	.kdc_flags = 0,
365 	.kdc_emit = KDEMIT_DISABLE,
366 	.kdc_oldest_time = 0
367 };
368 
369 struct kd_buffer kd_buffer_trace = {
370 	.kdb_event_count = 0,
371 	.kdb_storage_count = 0,
372 	.kdb_storage_threshold = 0,
373 	.kdb_region_count = 0,
374 	.kdb_info = NULL,
375 	.kd_bufs = NULL,
376 	.kdcopybuf = NULL
377 };
378 
379 unsigned int kdlog_beg = 0;
380 unsigned int kdlog_end = 0;
381 unsigned int kdlog_value1 = 0;
382 unsigned int kdlog_value2 = 0;
383 unsigned int kdlog_value3 = 0;
384 unsigned int kdlog_value4 = 0;
385 
386 kd_threadmap *kd_mapptr = 0;
387 vm_size_t kd_mapsize = 0;
388 vm_size_t kd_mapcount = 0;
389 
390 /*
391  * A globally increasing counter for identifying strings in trace.  Starts at
392  * 1 because 0 is a reserved return value.
393  */
394 __attribute__((aligned(MAX_CPU_CACHE_LINE_SIZE)))
395 static uint64_t g_curr_str_id = 1;
396 
397 #define STR_ID_SIG_OFFSET (48)
398 #define STR_ID_MASK       ((1ULL << STR_ID_SIG_OFFSET) - 1)
399 #define STR_ID_SIG_MASK   (~STR_ID_MASK)
400 
401 /*
402  * A bit pattern for identifying string IDs generated by
403  * kdebug_trace_string(2).
404  */
405 static uint64_t g_str_id_signature = (0x70acULL << STR_ID_SIG_OFFSET);
406 
407 #define RAW_VERSION3    0x00001000
408 
409 #define V3_RAW_EVENTS   0x00001e00
410 
// Take the spinlock protecting the coprocessor registration list.
static void
_coproc_lock(void)
{
	lck_spin_lock_grp(&kd_coproc_spinlock, &kdebug_lck_grp);
}
416 
// Release the spinlock protecting the coprocessor registration list.
static void
_coproc_unlock(void)
{
	lck_spin_unlock(&kd_coproc_spinlock);
}
422 
// Debug-only validation of the captured coprocessor list: CPU IDs must be
// strictly descending (the list is prepend-only), the oldest entry's ID must
// equal kdbg_cpu_count(), and every entry needs a callback function and a
// NUL-terminated name.  Compiles to nothing unless MACH_ASSERT is set.
static void
_coproc_list_check(void)
{
#if MACH_ASSERT
	_coproc_lock();
	struct kd_coproc *coproc = kd_control_trace.kdc_coprocs;
	if (coproc) {
		/* Is list sorted by cpu_id? */
		struct kd_coproc* temp = coproc;
		do {
			assert(!temp->next || temp->next->cpu_id == temp->cpu_id - 1);
			assert(temp->next || (temp->cpu_id == kdbg_cpu_count()));
		} while ((temp = temp->next));

		/* Does each entry have a function and a name? */
		temp = coproc;
		do {
			assert(temp->callback.func);
			assert(strlen(temp->callback.iop_name) < sizeof(temp->callback.iop_name));
		} while ((temp = temp->next));
	}
	_coproc_unlock();
#endif // MACH_ASSERT
}
447 
// Invoke every registered coprocessor's callback with `type` and `arg`.
// Only the list head is captured under the lock; entries are permanent and
// the list is prepend-only, so the walk itself is safe without the lock.
static void
_coproc_list_callback(kd_callback_type type, void *arg)
{
	if (kd_control_trace.kdc_flags & KDBG_DISABLE_COPROCS) {
		return;
	}

	_coproc_lock();
	// Coprocessor list is only ever prepended to.
	struct kd_coproc *head = kd_control_trace.kdc_coprocs;
	_coproc_unlock();
	while (head) {
		head->callback.func(head->callback.context, type, arg);
		head = head->next;
	}
}
464 
465 // Leave some extra space for coprocessors to register while tracing is active.
466 #define EXTRA_COPROC_COUNT      (16)
467 // There are more coprocessors registering during boot tracing.
468 #define EXTRA_COPROC_COUNT_BOOT (32)
469 
470 static kdebug_emit_filter_t
_trace_emit_filter(void)471 _trace_emit_filter(void)
472 {
473 	if (!kdebug_enable) {
474 		return KDEMIT_DISABLE;
475 	} else if (kd_control_trace.kdc_flags & KDBG_TYPEFILTER_CHECK) {
476 		return KDEMIT_TYPEFILTER;
477 	} else if (kd_control_trace.kdc_flags & KDBG_RANGECHECK) {
478 		return KDEMIT_RANGE;
479 	} else if (kd_control_trace.kdc_flags & KDBG_VALCHECK) {
480 		return KDEMIT_EXACT;
481 	} else {
482 		return KDEMIT_ALL;
483 	}
484 }
485 
// Enable or disable event recording.  `trace_type` is OR-ed into
// kdebug_enable when enabling.  Coprocessor callbacks are invoked outside
// the storage lock, before disabling and after enabling, so they can log
// while the system is still (or newly) able to record.
static void
kdbg_set_tracing_enabled(bool enabled, uint32_t trace_type)
{
	// Drain any events from coprocessors before making the state change.  On
	// enabling, this removes any stale events from before tracing.  On
	// disabling, this saves any events up to the point tracing is disabled.
	_coproc_list_callback(KD_CALLBACK_SYNC_FLUSH, NULL);

	if (!enabled) {
		// Give coprocessors a chance to log any events before tracing is
		// disabled, outside the lock.
		_coproc_list_callback(KD_CALLBACK_KDEBUG_DISABLED, NULL);
	}

	int intrs_en = kdebug_storage_lock(&kd_control_trace);
	if (enabled) {
		// Latch the status of the user-controlled flags for wrapping.
		kd_control_trace.kdc_live_flags = kd_control_trace.kdc_flags & KDBG_NOWRAP;
		// The oldest valid time is now; reject past events from coprocessors.
		kd_control_trace.kdc_oldest_time = kdebug_timestamp();
		kdebug_enable |= trace_type;
		kd_control_trace.kdc_emit = _trace_emit_filter();
		kd_control_trace.enabled = 1;
		commpage_update_kdebug_state();
	} else {
		kdebug_enable = 0;
		kd_control_trace.kdc_emit = KDEMIT_DISABLE;
		kd_control_trace.enabled = 0;
		commpage_update_kdebug_state();
	}
	kdebug_storage_unlock(&kd_control_trace, intrs_en);

	if (enabled) {
		// Notify coprocessors only after the enabled state is visible.
		_coproc_list_callback(KD_CALLBACK_KDEBUG_ENABLED, NULL);
	}
}
522 
// Size and allocate the storage for a trace session, capturing the current
// coprocessor list.  `extra_cpus` reserves buffer slots for coprocessors that
// register while tracing is active.  Returns 0 on success or an errno from
// create_buffers().
static int
create_buffers_trace(unsigned int extra_cpus)
{
	int events_per_storage_unit = kd_control_trace.kdebug_events_per_storage_unit;
	int min_storage_units_per_cpu = kd_control_trace.kdebug_min_storage_units_per_cpu;

	// For the duration of this allocation, trace code will only reference
	// kdc_coprocs.
	kd_control_trace.kdc_coprocs = kd_coprocs;
	_coproc_list_check();

	// If the list is valid, it is sorted from newest to oldest.  Each entry is
	// prepended, so the CPU IDs are sorted in descending order.
	kd_control_trace.kdebug_cpus = kd_control_trace.kdc_coprocs ?
	    kd_control_trace.kdc_coprocs->cpu_id + 1 : kdbg_cpu_count();
	kd_control_trace.alloc_cpus = kd_control_trace.kdebug_cpus + extra_cpus;

	// Enforce a minimum number of storage units per allocated CPU.
	size_t min_event_count = kd_control_trace.alloc_cpus *
	    events_per_storage_unit * min_storage_units_per_cpu;
	if (kd_buffer_trace.kdb_event_count < min_event_count) {
		kd_buffer_trace.kdb_storage_count = kd_control_trace.alloc_cpus * min_storage_units_per_cpu;
	} else {
		kd_buffer_trace.kdb_storage_count = kd_buffer_trace.kdb_event_count / events_per_storage_unit;
	}

	// Round the event count to a whole number of storage units.
	kd_buffer_trace.kdb_event_count = kd_buffer_trace.kdb_storage_count * events_per_storage_unit;

	kd_buffer_trace.kd_bufs = NULL;

	int error = create_buffers(&kd_control_trace, &kd_buffer_trace,
	    VM_KERN_MEMORY_DIAG);
	if (!error) {
		// Record which coprocessors report continuous timestamps so their
		// events can be converted to the session timebase when recorded.
		struct kd_bufinfo *info = kd_buffer_trace.kdb_info;
		struct kd_coproc *cur_iop = kd_control_trace.kdc_coprocs;
		while (cur_iop != NULL) {
			info[cur_iop->cpu_id].continuous_timestamps = ISSET(cur_iop->flags,
			    KDCP_CONTINUOUS_TIME);
			cur_iop = cur_iop->next;
		}
		// Readers waiting on events are woken once half the storage is used.
		kd_buffer_trace.kdb_storage_threshold = kd_buffer_trace.kdb_storage_count / 2;
	}

	return error;
}
567 
// Tear down the trace session's buffers allocated by create_buffers_trace().
static void
delete_buffers_trace(void)
{
	delete_buffers(&kd_control_trace, &kd_buffer_trace);
}
573 
// Register a coprocessor and assign it the next pseudo-CPU ID.  Entries are
// permanently allocated and prepended to `kd_coprocs`, keeping the list
// sorted by descending cpu_id.  Returns the assigned ID.
static int
_register_coproc_internal(const char *name, kdebug_coproc_flags_t flags,
    kd_callback_fn callback, void *context)
{
	struct kd_coproc *coproc = NULL;

	// Permanent allocation: coprocessors can never unregister.
	coproc = zalloc_permanent_type(struct kd_coproc);
	coproc->callback.func = callback;
	coproc->callback.context = context;
	coproc->flags = flags;
	strlcpy(coproc->full_name, name, sizeof(coproc->full_name));

	_coproc_lock();
	coproc->next = kd_coprocs;
	coproc->cpu_id = kd_coprocs == NULL ? kdbg_cpu_count() : kd_coprocs->cpu_id + 1;
	kd_coprocs = coproc;
	// If the current session reserved space for this coprocessor, publish it
	// to the live list and, if tracing is enabled, notify it asynchronously
	// of the current trace state via the MPSC queue.
	if (coproc->cpu_id < kd_control_trace.alloc_cpus) {
		kd_control_trace.kdc_coprocs = kd_coprocs;
		kd_control_trace.kdebug_cpus += 1;
		if (kdebug_enable) {
			mpsc_daemon_enqueue(&_coproc_notify_queue, &coproc->chain,
			    MPSC_QUEUE_NONE);
		}
	}
	_coproc_unlock();

	return coproc->cpu_id;
}
602 
603 int
kernel_debug_register_callback(kd_callback_t callback)604 kernel_debug_register_callback(kd_callback_t callback)
605 {
606 	// Be paranoid about using the provided name, but it's too late to reject
607 	// it.
608 	bool is_valid_name = false;
609 	for (uint32_t length = 0; length < sizeof(callback.iop_name); ++length) {
610 		if (callback.iop_name[length] > 0x20 && callback.iop_name[length] < 0x7F) {
611 			continue;
612 		}
613 		if (callback.iop_name[length] == 0) {
614 			if (length) {
615 				is_valid_name = true;
616 			}
617 			break;
618 		}
619 	}
620 	kd_callback_t sane_cb = callback;
621 	if (!is_valid_name) {
622 		strlcpy(sane_cb.iop_name, "IOP-???", sizeof(sane_cb.iop_name));
623 	}
624 
625 	return _register_coproc_internal(sane_cb.iop_name, 0, sane_cb.func,
626 	           sane_cb.context);
627 }
628 
629 int
kdebug_register_coproc(const char * name,kdebug_coproc_flags_t flags,kd_callback_fn callback,void * context)630 kdebug_register_coproc(const char *name, kdebug_coproc_flags_t flags,
631     kd_callback_fn callback, void *context)
632 {
633 	size_t name_len = strlen(name);
634 	if (!name || name_len == 0) {
635 		panic("kdebug: invalid name for coprocessor: %p", name);
636 	}
637 	for (size_t i = 0; i < name_len; i++) {
638 		if (name[i] <= 0x20 || name[i] >= 0x7F) {
639 			panic("kdebug: invalid name for coprocessor: %s", name);
640 		}
641 	}
642 	if (!callback) {
643 		panic("kdebug: no callback for coprocessor `%s'", name);
644 	}
645 	return _register_coproc_internal(name, flags, callback, context);
646 }
647 
// Decide whether `debugid` passes the given emission filter.  There is
// deliberately no default case: kdebug_emit_filter_t is a closed enum, so the
// compiler verifies the switch is exhaustive.
static inline bool
_should_emit_debugid(kdebug_emit_filter_t emit, uint32_t debugid)
{
	switch (emit) {
	case KDEMIT_DISABLE:
		return false;
	case KDEMIT_TYPEFILTER:
		return typefilter_is_debugid_allowed(kdbg_typefilter, debugid);
	case KDEMIT_RANGE:
		// Inclusive range check against the user-configured bounds.
		return debugid >= kdlog_beg && debugid <= kdlog_end;
	case KDEMIT_EXACT:;
		// Compare event IDs (function qualifier stripped) to the four
		// configured values.
		uint32_t eventid = debugid & KDBG_EVENTID_MASK;
		return eventid == kdlog_value1 || eventid == kdlog_value2 ||
		       eventid == kdlog_value3 || eventid == kdlog_value4;
	case KDEMIT_ALL:
		return true;
	}
}
666 
// Wake a reader blocked in kdbg_wait() once the used storage crosses the
// threshold, but only for event types known to be logged from contexts where
// issuing a wakeup is safe.
static void
_try_wakeup_above_threshold(uint32_t debugid)
{
	bool over_threshold = kd_control_trace.kdc_storage_used >=
	    kd_buffer_trace.kdb_storage_threshold;
	if (kd_waiter && over_threshold) {
		// Wakeup any waiters if called from a safe context.

		const uint32_t INTERRUPT_EVENT = 0x01050000;
		const uint32_t VMFAULT_EVENT = 0x01300008;
		const uint32_t BSD_SYSCALL_CSC = 0x040c0000;
		const uint32_t MACH_SYSCALL_CSC = 0x010c0000;

		uint32_t eventid = debugid & KDBG_EVENTID_MASK;
		uint32_t csc = debugid & KDBG_CSC_MASK;

		if (eventid == INTERRUPT_EVENT || eventid == VMFAULT_EVENT ||
		    csc == BSD_SYSCALL_CSC || csc == MACH_SYSCALL_CSC) {
			_try_wakeup_waiter();
		}
	}
}
689 
// Return the storage unit that `cpu` should append its next event to,
// allocating a fresh unit when the current tail is absent or full.  Returns
// NULL only when allocation fails (e.g. wrapping is disabled and the buffers
// are exhausted).
__attribute__((always_inline))
static struct kd_storage *
_next_storage_unit(struct kd_bufinfo *info, unsigned int cpu)
{
	struct kd_storage *store = NULL;
	do {
		bool needs_new_store = true;
		union kds_ptr kds_raw = info->kd_list_tail;
		if (kds_raw.raw != KDS_PTR_NULL) {
			store = POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, kds_raw);
			if (store->kds_bufindx < kd_control_trace.kdebug_events_per_storage_unit) {
				// The tail unit still has room.
				needs_new_store = false;
			}
		}

		if (!needs_new_store) {
			return store;
		}
		// Tail missing or full: try to allocate a new unit, then re-read the
		// tail on the next iteration.
		bool allocated = kdebug_storage_alloc(&kd_control_trace, &kd_buffer_trace, cpu);
		if (!allocated) {
			// Failed to allocate while wrapping is disabled.
			return NULL;
		}
	} while (true);
}
715 
// Reserve a record slot for a coprocessor event, converting its timestamp to
// the session's timebase (continuous vs. absolute).  Returns NULL if the
// event predates the session or no storage is available; the returned record
// has its timestamp filled in.
__attribute__((always_inline))
static kd_buf *
_next_timestamped_coproc_record(unsigned int cpu, uint64_t timestamp)
{
	struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu];
	bool timestamp_is_continuous = info->continuous_timestamps;

	// Convert the coprocessor's timebase to match the session's.
	if (kdebug_using_continuous_time()) {
		if (!timestamp_is_continuous) {
			timestamp = absolutetime_to_continuoustime(timestamp);
		}
	} else {
		if (timestamp_is_continuous) {
			timestamp = continuoustime_to_absolutetime(timestamp);
		}
	}
	// Reject events from before the session began, but remember the latest
	// such timestamp for diagnostics.
	if (timestamp < kd_control_trace.kdc_oldest_time) {
		if (info->latest_past_event_timestamp < timestamp) {
			info->latest_past_event_timestamp = timestamp;
		}
		return NULL;
	}

	struct kd_storage *store = NULL;
	uint32_t store_index = 0;

	do {
		store = _next_storage_unit(info, cpu);
		if (!store) {
			return NULL;
		}
		store_index = store->kds_bufindx;
		// Prevent an interrupt from stealing this slot in the storage unit,
		// retrying if necessary.  No barriers are needed because this only
		// concerns visibility on this same CPU.
		if (os_atomic_cmpxchg(&store->kds_bufindx, store_index, store_index + 1, relaxed)) {
			break;
		}
	} while (true);

	// Make sure kds_timestamp is less than any event in this buffer.  This can
	// only happen for coprocessors because this field is initialized to the
	// current time when a storage unit is allocated by a CPU.
	if (timestamp < store->kds_timestamp) {
		store->kds_timestamp = timestamp;
	}
	os_atomic_inc(&store->kds_bufcnt, relaxed);
	kd_buf *kd = &store->kds_records[store_index];
	kd->timestamp = timestamp;
	return kd;
}
767 
// Record a coprocessor event into the trace buffers.  Must run with
// preemption disabled (the caller, kernel_debug_enter, wraps this in
// disable_preemption/enable_preemption).  Drops the event silently if tracing
// is off or no slot can be reserved.
__attribute__((always_inline))
static void
_write_trace_record_coproc_nopreempt(
	uint64_t timestamp,
	uint32_t debugid,
	uintptr_t arg1,
	uintptr_t arg2,
	uintptr_t arg3,
	uintptr_t arg4,
	uintptr_t arg5,
	unsigned int cpu)
{
	if (kd_control_trace.enabled == 0) {
		return;
	}
	kd_buf *kd = _next_timestamped_coproc_record(cpu, timestamp);
	if (kd) {
		kd->debugid = debugid;
		kd->arg1 = arg1;
		kd->arg2 = arg2;
		kd->arg3 = arg3;
		kd->arg4 = arg4;
		kd->arg5 = arg5;
		kd->cpuid = cpu;
	}
}
794 
// Reserve a record slot for an event on the current CPU, stamping it with a
// timestamp taken inside the reservation loop so times are monotonically
// increasing within each storage unit.  Returns NULL when no storage is
// available.
__attribute__((always_inline))
static kd_buf *
_next_timestamped_record(unsigned int cpu)
{
	struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu];
	struct kd_storage *store = NULL;
	uint64_t now = 0;
	uint32_t store_index = 0;

	do {
		store = _next_storage_unit(info, cpu);
		if (!store) {
			return NULL;
		}
		store_index = store->kds_bufindx;

		// Re-capture the timestamp to ensure time is monotonically-increasing
		// within storage units.
		now = kdebug_timestamp();
		// Claim the slot; an interrupt on this CPU may race, so retry on
		// failure.
		if (os_atomic_cmpxchg(&store->kds_bufindx, store_index, store_index + 1, relaxed)) {
			break;
		}
	} while (true);

	os_atomic_inc(&store->kds_bufcnt, relaxed);
	kd_buf *kd = &store->kds_records[store_index];
	kd->timestamp = now;
	return kd;
}
824 
825 static bool kdebug_debugid_procfilt_allowed(uint32_t debugid);
826 
// Emit a single event into the trace buffers, honoring the active emission
// filter, the per-process filter, and the caller-supplied emit flags.
static void
_write_trace_record(
	uint32_t debugid,
	uintptr_t arg1,
	uintptr_t arg2,
	uintptr_t arg3,
	uintptr_t arg4,
	uintptr_t arg5,
	kdebug_emit_flags_t flags)
{
	kdebug_emit_filter_t emit = kd_control_trace.kdc_emit;
	if (!emit || !kdebug_enable) {
		return;
	}
	// KDBG_FILTER_ONLY events are suppressed when no specific filter is set.
	bool only_filter = flags & KDBG_FILTER_ONLY;
	// KDBG_NON_PROCESS events bypass the per-process filter.
	bool observe_procfilt = !(flags & KDBG_NON_PROCESS);

	if (!_should_emit_debugid(emit, debugid)) {
		return;
	}
	if (emit == KDEMIT_ALL && only_filter) {
		return;
	}
	// Skip the process filter at interrupt context, where the interrupted
	// process is arbitrary.
	if (!ml_at_interrupt_context() && observe_procfilt &&
	    !kdebug_debugid_procfilt_allowed(debugid)) {
		return;
	}

	// Stay on this CPU while reserving a slot, and re-check enablement in
	// case tracing was torn down concurrently.
	disable_preemption();
	if (kd_control_trace.enabled == 0) {
		enable_preemption();
		return;
	}
	unsigned int cpu = cpu_number();
	kd_buf *kd = _next_timestamped_record(cpu);
	if (kd) {
		kd->debugid = debugid;
		kd->arg1 = arg1;
		kd->arg2 = arg2;
		kd->arg3 = arg3;
		kd->arg4 = arg4;
		kd->arg5 = arg5;
		kd->cpuid = cpu;
	}
	enable_preemption();

#if KPERF
	kperf_kdebug_callback(debugid, __builtin_frame_address(0));
#endif // KPERF
}
877 
// Common entry point for the kernel_debug* wrappers: record the event, then
// consider waking a blocked reader.
static void
kernel_debug_internal(
	uint32_t debugid,
	uintptr_t arg1,
	uintptr_t arg2,
	uintptr_t arg3,
	uintptr_t arg4,
	uintptr_t arg5,
	kdebug_emit_flags_t flags)
{
	_write_trace_record(debugid, arg1, arg2, arg3, arg4, arg5, flags);
	_try_wakeup_above_threshold(debugid);
}
891 
892 __attribute__((noinline))
893 void
kernel_debug(uint32_t debugid,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3,uintptr_t arg4,__unused uintptr_t arg5)894 kernel_debug(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
895     uintptr_t arg4, __unused uintptr_t arg5)
896 {
897 	uintptr_t tid = (uintptr_t)thread_tid(current_thread());
898 	kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, tid, 0);
899 }
900 
// Record an event; unlike kernel_debug(), arg5 is passed through verbatim
// rather than being replaced with the current thread ID.
__attribute__((noinline))
void
kernel_debug1(uint32_t debugid, uintptr_t arg1, uintptr_t arg2, uintptr_t arg3,
    uintptr_t arg4, uintptr_t arg5)
{
	kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, arg5, 0);
}
908 
909 __attribute__((noinline))
910 void
kernel_debug_flags(uint32_t debugid,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3,uintptr_t arg4,kdebug_emit_flags_t flags)911 kernel_debug_flags(
912 	uint32_t debugid,
913 	uintptr_t arg1,
914 	uintptr_t arg2,
915 	uintptr_t arg3,
916 	uintptr_t arg4,
917 	kdebug_emit_flags_t flags)
918 {
919 	uintptr_t tid = (uintptr_t)thread_tid(current_thread());
920 	kernel_debug_internal(debugid, arg1, arg2, arg3, arg4, tid, flags);
921 }
922 
923 __attribute__((noinline))
924 void
kernel_debug_filtered(uint32_t debugid,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3,uintptr_t arg4)925 kernel_debug_filtered(
926 	uint32_t debugid,
927 	uintptr_t arg1,
928 	uintptr_t arg2,
929 	uintptr_t arg3,
930 	uintptr_t arg4)
931 {
932 	kernel_debug_flags(debugid, arg1, arg2, arg3, arg4, KDBG_FILTER_ONLY);
933 }
934 
// Record a short string as an early-boot trace event.  strncpy is intentional
// here: the destination is a fixed-size block of four argument words (zeroed
// beforehand), not a C string, so NUL termination of a truncated message is
// not required.
void
kernel_debug_string_early(const char *message)
{
	uintptr_t a[4] = { 0 };
	strncpy((char *)a, message, sizeof(a));
	KERNEL_DEBUG_EARLY(TRACE_INFO_STRING, a[0], a[1], a[2], a[3]);
}
942 
943 // Emit events from coprocessors.
944 void
kernel_debug_enter(uint32_t coreid,uint32_t debugid,uint64_t timestamp,uintptr_t arg1,uintptr_t arg2,uintptr_t arg3,uintptr_t arg4,uintptr_t threadid)945 kernel_debug_enter(
946 	uint32_t  coreid,
947 	uint32_t  debugid,
948 	uint64_t  timestamp,
949 	uintptr_t arg1,
950 	uintptr_t arg2,
951 	uintptr_t arg3,
952 	uintptr_t arg4,
953 	uintptr_t threadid
954 	)
955 {
956 	if (kd_control_trace.kdc_flags & KDBG_DISABLE_COPROCS) {
957 		return;
958 	}
959 	if (coreid >= kd_control_trace.kdebug_cpus) {
960 		return;
961 	}
962 	kdebug_emit_filter_t emit = kd_control_trace.kdc_emit;
963 	if (!emit || !kdebug_enable) {
964 		return;
965 	}
966 	if (!_should_emit_debugid(emit, debugid)) {
967 		return;
968 	}
969 
970 	disable_preemption();
971 	_write_trace_record_coproc_nopreempt(timestamp, debugid, arg1, arg2, arg3, arg4, threadid, coreid);
972 	enable_preemption();
973 }
974 
// Return the current thread's process without validation.  Only suitable for
// advisory uses like the process filter; the result must not be dereferenced
// in contexts where the thread/proc binding could be stale.
__pure2
static inline proc_t
kdebug_current_proc_unsafe(void)
{
	return get_thread_ro_unchecked(current_thread())->tro_proc;
}
981 
982 // Return true iff the debug ID should be traced by the current process.
983 __attribute__((always_inline))
984 static bool
kdebug_debugid_procfilt_allowed(uint32_t debugid)985 kdebug_debugid_procfilt_allowed(uint32_t debugid)
986 {
987 	uint32_t procfilt_flags = kd_control_trace.kdc_flags &
988 	    (KDBG_PIDCHECK | KDBG_PIDEXCLUDE);
989 	if (!procfilt_flags) {
990 		return true;
991 	}
992 
993 	// DBG_TRACE and MACH_SCHED tracepoints ignore the process filter.
994 	if ((debugid & KDBG_CSC_MASK) == MACHDBG_CODE(DBG_MACH_SCHED, 0) ||
995 	    (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE)) {
996 		return true;
997 	}
998 
999 	struct proc *curproc = kdebug_current_proc_unsafe();
1000 	// If the process is missing (early in boot), allow it.
1001 	if (!curproc) {
1002 		return true;
1003 	}
1004 
1005 	switch (procfilt_flags) {
1006 	case KDBG_PIDCHECK:
1007 		return curproc->p_kdebug;
1008 	case KDBG_PIDEXCLUDE:
1009 		return !curproc->p_kdebug;
1010 	default:
1011 		panic("kdebug: invalid procfilt flags %x", kd_control_trace.kdc_flags);
1012 	}
1013 }
1014 
1015 #define SIMPLE_STR_LEN (64)
1016 static_assert(SIMPLE_STR_LEN % sizeof(uintptr_t) == 0);
1017 
// Emit a short (up to SIMPLE_STR_LEN bytes) string to the trace stream as a
// chain of events: the first carries DBG_FUNC_START, the last DBG_FUNC_END,
// with the string's bytes packed four words per event.
void
kernel_debug_string_simple(uint32_t eventid, const char *str)
{
	if (!kdebug_enable) {
		return;
	}

	/* array of uintptr_ts simplifies emitting the string as arguments */
	uintptr_t str_buf[(SIMPLE_STR_LEN / sizeof(uintptr_t)) + 1] = { 0 };
	size_t len = strlcpy((char *)str_buf, str, SIMPLE_STR_LEN + 1);
	// strlcpy returns the source length, which may exceed the buffer;
	// clamp so the chunking loop below stays within str_buf.
	len = MIN(len, SIMPLE_STR_LEN);

	uintptr_t thread_id = (uintptr_t)thread_tid(current_thread());
	uint32_t debugid = eventid | DBG_FUNC_START;

	/* string can fit in a single tracepoint */
	if (len <= (4 * sizeof(uintptr_t))) {
		debugid |= DBG_FUNC_END;
	}

	kernel_debug_internal(debugid, str_buf[0], str_buf[1], str_buf[2],
	    str_buf[3], thread_id, 0);

	// Continuation events carry neither START nor END unless they are
	// the final chunk.
	debugid &= KDBG_EVENTID_MASK;
	int i = 4;
	size_t written = 4 * sizeof(uintptr_t);

	for (; written < len; i += 4, written += 4 * sizeof(uintptr_t)) {
		/* if this is the last tracepoint to be emitted */
		if ((written + (4 * sizeof(uintptr_t))) >= len) {
			debugid |= DBG_FUNC_END;
		}
		kernel_debug_internal(debugid, str_buf[i], str_buf[i + 1],
		    str_buf[i + 2], str_buf[i + 3], thread_id, 0);
	}
}
1054 
1055 extern int      master_cpu;             /* MACH_KERNEL_PRIVATE */
1056 /*
1057  * Used prior to start_kern_tracing() being called.
1058  * Log temporarily into a static buffer.
1059  */
void
kernel_debug_early(
	uint32_t        debugid,
	uintptr_t       arg1,
	uintptr_t       arg2,
	uintptr_t       arg3,
	uintptr_t       arg4)
{
#if defined(__x86_64__)
	extern int early_boot;
	/*
	 * Note that "early" isn't early enough in some cases where
	 * we're invoked before gsbase is set on x86, hence the
	 * check of "early_boot".
	 */
	if (early_boot) {
		return;
	}
#endif

	/* If early tracing is over, use the normal path. */
	if (kd_early_done) {
		KDBG_RELEASE(debugid, arg1, arg2, arg3, arg4);
		return;
	}

	/* Do nothing if the buffer is full or we're not on the boot cpu. */
	// Restricting writes to the boot CPU means no locking is needed for
	// kd_early_index; the overflow flag is reported later as a
	// TRACE_LOST_EVENTS event by kernel_debug_early_end().
	kd_early_overflow = kd_early_index >= KD_EARLY_EVENT_COUNT;
	if (kd_early_overflow || cpu_number() != master_cpu) {
		return;
	}

	// arg5 (the thread-ID slot) is zeroed: threads don't meaningfully
	// exist this early in boot.
	kd_early_buffer[kd_early_index].debugid = debugid;
	kd_early_buffer[kd_early_index].timestamp = mach_absolute_time();
	kd_early_buffer[kd_early_index].arg1 = arg1;
	kd_early_buffer[kd_early_index].arg2 = arg2;
	kd_early_buffer[kd_early_index].arg3 = arg3;
	kd_early_buffer[kd_early_index].arg4 = arg4;
	kd_early_buffer[kd_early_index].arg5 = 0;
	kd_early_index++;
}
1101 
1102 /*
1103  * Transfer the contents of the temporary buffer into the trace buffers.
1104  * Precede that by logging the rebase time (offset) - the TSC-based time (in ns)
1105  * when mach_absolute_time is set to 0.
1106  */
1107 static void
kernel_debug_early_end(void)1108 kernel_debug_early_end(void)
1109 {
1110 	if (cpu_number() != master_cpu) {
1111 		panic("kernel_debug_early_end() not call on boot processor");
1112 	}
1113 
1114 	/* reset the current oldest time to allow early events */
1115 	kd_control_trace.kdc_oldest_time = 0;
1116 
1117 #if defined(__x86_64__)
1118 	/* Fake sentinel marking the start of kernel time relative to TSC */
1119 	kernel_debug_enter(0, TRACE_TIMESTAMPS, 0,
1120 	    (uint32_t)(tsc_rebase_abs_time >> 32), (uint32_t)tsc_rebase_abs_time,
1121 	    tsc_at_boot, 0, 0);
1122 #endif /* defined(__x86_64__) */
1123 	for (unsigned int i = 0; i < kd_early_index; i++) {
1124 		kernel_debug_enter(0,
1125 		    kd_early_buffer[i].debugid,
1126 		    kd_early_buffer[i].timestamp,
1127 		    kd_early_buffer[i].arg1,
1128 		    kd_early_buffer[i].arg2,
1129 		    kd_early_buffer[i].arg3,
1130 		    kd_early_buffer[i].arg4,
1131 		    0);
1132 	}
1133 
1134 	/* Cut events-lost event on overflow */
1135 	if (kd_early_overflow) {
1136 		KDBG_RELEASE(TRACE_LOST_EVENTS, 1);
1137 	}
1138 
1139 	kd_early_done = true;
1140 
1141 	/* This trace marks the start of kernel tracing */
1142 	kernel_debug_string_early("early trace done");
1143 }
1144 
// Stop event emission and wake any thread blocked waiting for trace data, so
// it can observe the disabled state instead of sleeping indefinitely.
void
kernel_debug_disable(void)
{
	kdbg_set_tracing_enabled(false, 0);
	_wakeup_waiter();
}
1151 
1152 // Returns true if debugid should only be traced from the kernel.
static int
_kernel_only_event(uint32_t debugid)
{
	// The DBG_TRACE class is reserved for the trace system itself;
	// userspace attempts to emit it are rejected with EPERM by callers.
	return KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE;
}
1158 
1159 /*
1160  * Support syscall SYS_kdebug_typefilter.
1161  */
1162 int
kdebug_typefilter(__unused struct proc * p,struct kdebug_typefilter_args * uap,__unused int * retval)1163 kdebug_typefilter(__unused struct proc* p, struct kdebug_typefilter_args* uap,
1164     __unused int *retval)
1165 {
1166 	if (uap->addr == USER_ADDR_NULL || uap->size == USER_ADDR_NULL) {
1167 		return EINVAL;
1168 	}
1169 
1170 	mach_vm_offset_t user_addr = 0;
1171 	vm_map_t user_map = current_map();
1172 	const bool copy = false;
1173 	kern_return_t kr = mach_vm_map_kernel(user_map, &user_addr,
1174 	    TYPEFILTER_ALLOC_SIZE, 0, VM_MAP_KERNEL_FLAGS_ANYWHERE(),
1175 	    kdbg_typefilter_memory_entry, 0, copy,
1176 	    VM_PROT_READ, VM_PROT_READ, VM_INHERIT_SHARE);
1177 	if (kr != KERN_SUCCESS) {
1178 		return mach_to_bsd_errno(kr);
1179 	}
1180 
1181 	vm_size_t user_ptr_size = vm_map_is_64bit(user_map) ? 8 : 4;
1182 	int error = copyout((void *)&user_addr, uap->addr, user_ptr_size);
1183 	if (error != 0) {
1184 		mach_vm_deallocate(user_map, user_addr, TYPEFILTER_ALLOC_SIZE);
1185 	}
1186 	return error;
1187 }
1188 
1189 // Support SYS_kdebug_trace.
1190 int
kdebug_trace(struct proc * p,struct kdebug_trace_args * uap,int32_t * retval)1191 kdebug_trace(struct proc *p, struct kdebug_trace_args *uap, int32_t *retval)
1192 {
1193 	struct kdebug_trace64_args uap64 = {
1194 		.code = uap->code,
1195 		.arg1 = uap->arg1,
1196 		.arg2 = uap->arg2,
1197 		.arg3 = uap->arg3,
1198 		.arg4 = uap->arg4,
1199 	};
1200 	return kdebug_trace64(p, &uap64, retval);
1201 }
1202 
1203 // Support kdebug_trace(2).  64-bit arguments on K32 will get truncated
1204 // to fit in the 32-bit record format.
1205 //
1206 // It is intentional that error conditions are not checked until kdebug is
1207 // enabled. This is to match the userspace wrapper behavior, which is optimizing
1208 // for non-error case performance.
1209 int
kdebug_trace64(__unused struct proc * p,struct kdebug_trace64_args * uap,__unused int32_t * retval)1210 kdebug_trace64(__unused struct proc *p, struct kdebug_trace64_args *uap,
1211     __unused int32_t *retval)
1212 {
1213 	if (__probable(kdebug_enable == 0)) {
1214 		return 0;
1215 	}
1216 	if (_kernel_only_event(uap->code)) {
1217 		return EPERM;
1218 	}
1219 	kernel_debug_internal(uap->code, (uintptr_t)uap->arg1, (uintptr_t)uap->arg2,
1220 	    (uintptr_t)uap->arg3, (uintptr_t)uap->arg4,
1221 	    (uintptr_t)thread_tid(current_thread()), 0);
1222 	return 0;
1223 }
1224 
1225 /*
1226  * Adding enough padding to contain a full tracepoint for the last
1227  * portion of the string greatly simplifies the logic of splitting the
1228  * string between tracepoints.  Full tracepoints can be generated using
1229  * the buffer itself, without having to manually add zeros to pad the
1230  * arguments.
1231  */
1232 
1233 /* 2 string args in first tracepoint and 9 string data tracepoints */
1234 #define STR_BUF_ARGS (2 + (32 * 4))
1235 /* times the size of each arg on K64 */
1236 #define MAX_STR_LEN  (STR_BUF_ARGS * sizeof(uint64_t))
1237 /* on K32, ending straddles a tracepoint, so reserve blanks */
1238 #define STR_BUF_SIZE (MAX_STR_LEN + (2 * sizeof(uint32_t)))
1239 
1240 /*
1241  * This function does no error checking and assumes that it is called with
1242  * the correct arguments, including that the buffer pointed to by str is at
1243  * least STR_BUF_SIZE bytes.  However, str must be aligned to word-size and
1244  * be NUL-terminated.  In cases where a string can fit evenly into a final
1245  * tracepoint without its NUL-terminator, this function will not end those
1246  * strings with a NUL in trace.  It's up to clients to look at the function
1247  * qualifier for DBG_FUNC_END in this case, to end the string.
1248  */
static uint64_t
kernel_debug_string_internal(uint32_t debugid, uint64_t str_id, void *vstr,
    size_t str_len)
{
	/* str must be word-aligned */
	uintptr_t *str = vstr;
	size_t written = 0;
	uintptr_t thread_id;
	int i;
	uint32_t trace_debugid = TRACEDBG_CODE(DBG_TRACE_STRING,
	    TRACE_STRING_GLOBAL);

	thread_id = (uintptr_t)thread_tid(current_thread());

	/* if the ID is being invalidated, just emit that */
	// NB: callers must not pass str_id == 0 with str_len == 0 and a NULL
	// vstr — the code below would dereference str.
	if (str_id != 0 && str_len == 0) {
		kernel_debug_internal(trace_debugid | DBG_FUNC_START | DBG_FUNC_END,
		    (uintptr_t)debugid, (uintptr_t)str_id, 0, 0, thread_id, 0);
		return str_id;
	}

	/* generate an ID, if necessary */
	// IDs are a global counter masked into STR_ID_MASK and stamped with
	// a signature so userspace-supplied IDs can be validated.
	if (str_id == 0) {
		str_id = OSIncrementAtomic64((SInt64 *)&g_curr_str_id);
		str_id = (str_id & STR_ID_MASK) | g_str_id_signature;
	}

	trace_debugid |= DBG_FUNC_START;
	/* string can fit in a single tracepoint */
	if (str_len <= (2 * sizeof(uintptr_t))) {
		trace_debugid |= DBG_FUNC_END;
	}

	// First event: two metadata args (debugid, str_id) plus the first two
	// words of the string.  The caller guarantees the buffer is padded to
	// STR_BUF_SIZE, so reading str[0]/str[1] is always in bounds.
	kernel_debug_internal(trace_debugid, (uintptr_t)debugid, (uintptr_t)str_id,
	    str[0], str[1], thread_id, 0);

	trace_debugid &= KDBG_EVENTID_MASK;
	i = 2;
	written += 2 * sizeof(uintptr_t);

	// Continuation events carry four string words each; the final one is
	// tagged DBG_FUNC_END so consumers know where the string stops.
	for (; written < str_len; i += 4, written += 4 * sizeof(uintptr_t)) {
		if ((written + (4 * sizeof(uintptr_t))) >= str_len) {
			trace_debugid |= DBG_FUNC_END;
		}
		kernel_debug_internal(trace_debugid, str[i], str[i + 1], str[i + 2],
		    str[i + 3], thread_id, 0);
	}

	return str_id;
}
1299 
1300 /*
1301  * Returns true if the current process can emit events, and false otherwise.
1302  * Trace system and scheduling events circumvent this check, as do events
1303  * emitted in interrupt context.
1304  */
1305 static bool
kdebug_current_proc_enabled(uint32_t debugid)1306 kdebug_current_proc_enabled(uint32_t debugid)
1307 {
1308 	/* can't determine current process in interrupt context */
1309 	if (ml_at_interrupt_context()) {
1310 		return true;
1311 	}
1312 
1313 	/* always emit trace system and scheduling events */
1314 	if ((KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE ||
1315 	    (debugid & KDBG_CSC_MASK) == MACHDBG_CODE(DBG_MACH_SCHED, 0))) {
1316 		return true;
1317 	}
1318 
1319 	if (kd_control_trace.kdc_flags & KDBG_PIDCHECK) {
1320 		proc_t cur_proc = kdebug_current_proc_unsafe();
1321 
1322 		/* only the process with the kdebug bit set is allowed */
1323 		if (cur_proc && !(cur_proc->p_kdebug)) {
1324 			return false;
1325 		}
1326 	} else if (kd_control_trace.kdc_flags & KDBG_PIDEXCLUDE) {
1327 		proc_t cur_proc = kdebug_current_proc_unsafe();
1328 
1329 		/* every process except the one with the kdebug bit set is allowed */
1330 		if (cur_proc && cur_proc->p_kdebug) {
1331 			return false;
1332 		}
1333 	}
1334 
1335 	return true;
1336 }
1337 
// Report whether an event with this debug ID would pass the currently
// configured emit filter.
bool
kdebug_debugid_enabled(uint32_t debugid)
{
	return _should_emit_debugid(kd_control_trace.kdc_emit, debugid);
}
1343 
1344 bool
kdebug_debugid_explicitly_enabled(uint32_t debugid)1345 kdebug_debugid_explicitly_enabled(uint32_t debugid)
1346 {
1347 	if (kd_control_trace.kdc_flags & KDBG_TYPEFILTER_CHECK) {
1348 		return typefilter_is_debugid_allowed(kdbg_typefilter, debugid);
1349 	} else if (KDBG_EXTRACT_CLASS(debugid) == DBG_TRACE) {
1350 		return true;
1351 	} else if (kd_control_trace.kdc_flags & KDBG_RANGECHECK) {
1352 		if (debugid < kdlog_beg || debugid > kdlog_end) {
1353 			return false;
1354 		}
1355 	} else if (kd_control_trace.kdc_flags & KDBG_VALCHECK) {
1356 		if ((debugid & KDBG_EVENTID_MASK) != kdlog_value1 &&
1357 		    (debugid & KDBG_EVENTID_MASK) != kdlog_value2 &&
1358 		    (debugid & KDBG_EVENTID_MASK) != kdlog_value3 &&
1359 		    (debugid & KDBG_EVENTID_MASK) != kdlog_value4) {
1360 			return false;
1361 		}
1362 	}
1363 
1364 	return true;
1365 }
1366 
1367 /*
1368  * Returns 0 if a string can be traced with these arguments.  Returns errno
1369  * value if error occurred.
1370  */
1371 static errno_t
kdebug_check_trace_string(uint32_t debugid,uint64_t str_id)1372 kdebug_check_trace_string(uint32_t debugid, uint64_t str_id)
1373 {
1374 	if (debugid & (DBG_FUNC_START | DBG_FUNC_END)) {
1375 		return EINVAL;
1376 	}
1377 	if (_kernel_only_event(debugid)) {
1378 		return EPERM;
1379 	}
1380 	if (str_id != 0 && (str_id & STR_ID_SIG_MASK) != g_str_id_signature) {
1381 		return EINVAL;
1382 	}
1383 	return 0;
1384 }
1385 
1386 /*
1387  * Implementation of KPI kernel_debug_string.
1388  */
1389 int
kernel_debug_string(uint32_t debugid,uint64_t * str_id,const char * str)1390 kernel_debug_string(uint32_t debugid, uint64_t *str_id, const char *str)
1391 {
1392 	/* arguments to tracepoints must be word-aligned */
1393 	__attribute__((aligned(sizeof(uintptr_t)))) char str_buf[STR_BUF_SIZE];
1394 	static_assert(sizeof(str_buf) > MAX_STR_LEN);
1395 	vm_size_t len_copied;
1396 	int err;
1397 
1398 	assert(str_id);
1399 
1400 	if (__probable(kdebug_enable == 0)) {
1401 		return 0;
1402 	}
1403 
1404 	if (!kdebug_current_proc_enabled(debugid)) {
1405 		return 0;
1406 	}
1407 
1408 	if (!kdebug_debugid_enabled(debugid)) {
1409 		return 0;
1410 	}
1411 
1412 	if ((err = kdebug_check_trace_string(debugid, *str_id)) != 0) {
1413 		return err;
1414 	}
1415 
1416 	if (str == NULL) {
1417 		if (str_id == 0) {
1418 			return EINVAL;
1419 		}
1420 
1421 		*str_id = kernel_debug_string_internal(debugid, *str_id, NULL, 0);
1422 		return 0;
1423 	}
1424 
1425 	memset(str_buf, 0, sizeof(str_buf));
1426 	len_copied = strlcpy(str_buf, str, MAX_STR_LEN + 1);
1427 	*str_id = kernel_debug_string_internal(debugid, *str_id, str_buf,
1428 	    len_copied);
1429 	return 0;
1430 }
1431 
1432 // Support kdebug_trace_string(2).
int
kdebug_trace_string(__unused struct proc *p,
    struct kdebug_trace_string_args *uap,
    uint64_t *retval)
{
	// Tracepoint arguments are read word-at-a-time; the buffer is padded
	// past MAX_STR_LEN so the final chunk never reads out of bounds.
	__attribute__((aligned(sizeof(uintptr_t)))) char str_buf[STR_BUF_SIZE];
	static_assert(sizeof(str_buf) > MAX_STR_LEN);
	size_t len_copied;
	int err;

	// Error checking is deliberately deferred until tracing is enabled,
	// to keep the disabled path as cheap as the userspace wrapper's.
	if (__probable(kdebug_enable == 0)) {
		return 0;
	}

	if (!kdebug_current_proc_enabled(uap->debugid)) {
		return 0;
	}

	if (!kdebug_debugid_enabled(uap->debugid)) {
		return 0;
	}

	if ((err = kdebug_check_trace_string(uap->debugid, uap->str_id)) != 0) {
		return err;
	}

	// A NULL string means "invalidate this ID"; that requires an ID.
	if (uap->str == USER_ADDR_NULL) {
		if (uap->str_id == 0) {
			return EINVAL;
		}

		*retval = kernel_debug_string_internal(uap->debugid, uap->str_id,
		    NULL, 0);
		return 0;
	}

	memset(str_buf, 0, sizeof(str_buf));
	err = copyinstr(uap->str, str_buf, MAX_STR_LEN + 1, &len_copied);

	/* it's alright to truncate the string, so allow ENAMETOOLONG */
	if (err == ENAMETOOLONG) {
		str_buf[MAX_STR_LEN] = '\0';
	} else if (err) {
		return err;
	}

	// copyinstr's count includes the NUL; a count of 1 is an empty string.
	if (len_copied <= 1) {
		return EINVAL;
	}

	/* convert back to a length */
	len_copied--;

	*retval = kernel_debug_string_internal(uap->debugid, uap->str_id, str_buf,
	    len_copied);
	return 0;
}
1490 
// Tear down and recreate the trace buffers (e.g. after a coprocessor
// registers), clearing the thread map and wrap state along the way.
// Returns 0 on success or an errno from buffer creation.
int
kdbg_reinit(unsigned int extra_cpus)
{
	kernel_debug_disable();
	// Wait for any event writers to see the disable status.
	IOSleep(100);
	delete_buffers_trace();

	_clear_thread_map();
	kd_control_trace.kdc_live_flags &= ~KDBG_WRAPPED;
	return create_buffers_trace(extra_cpus);
}
1503 
1504 void
kdbg_trace_data(struct proc * proc,long * arg_pid,long * arg_uniqueid)1505 kdbg_trace_data(struct proc *proc, long *arg_pid, long *arg_uniqueid)
1506 {
1507 	if (proc) {
1508 		*arg_pid = proc_getpid(proc);
1509 		*arg_uniqueid = (long)proc_uniqueid(proc);
1510 		if ((uint64_t)*arg_uniqueid != proc_uniqueid(proc)) {
1511 			*arg_uniqueid = 0;
1512 		}
1513 	} else {
1514 		*arg_pid = 0;
1515 		*arg_uniqueid = 0;
1516 	}
1517 }
1518 
void kdebug_proc_name_args(struct proc *proc, long args[static 4]);
// Pack the process's best-known name into four long-sized trace arguments;
// args are left untouched when proc is NULL.
void
kdebug_proc_name_args(struct proc *proc, long args[static 4])
{
	if (proc) {
		// strncpy zero-pads unused bytes; a name that exactly fills
		// the four words is intentionally not NUL-terminated.
		strncpy((char *)args, proc_best_name(proc), 4 * sizeof(args[0]));
	}
}
1527 
1528 static void
_copy_ap_name(unsigned int cpuid,void * dst,size_t size)1529 _copy_ap_name(unsigned int cpuid, void *dst, size_t size)
1530 {
1531 	const char *name = "AP";
1532 #if defined(__arm64__)
1533 	const ml_topology_info_t *topology = ml_get_topology_info();
1534 	switch (topology->cpus[cpuid].cluster_type) {
1535 	case CLUSTER_TYPE_E:
1536 		name = "AP-E";
1537 		break;
1538 	case CLUSTER_TYPE_P:
1539 		name = "AP-P";
1540 		break;
1541 	default:
1542 		break;
1543 	}
1544 #else /* defined(__arm64__) */
1545 #pragma unused(cpuid)
1546 #endif /* !defined(__arm64__) */
1547 	strlcpy(dst, name, size);
1548 }
1549 
1550 // Write the specified `map_version` of CPU map to the `dst` buffer, using at
1551 // most `size` bytes.  Returns 0 on success and sets `size` to the number of
1552 // bytes written, and either ENOMEM or EINVAL on failure.
1553 //
1554 // If the value pointed to by `dst` is NULL, memory is allocated, and `size` is
1555 // adjusted to the allocated buffer's size.
1556 //
1557 // NB: `coprocs` is used to determine whether the stashed CPU map captured at
1558 // the start of tracing should be used.
static errno_t
_copy_cpu_map(int map_version, void **dst, size_t *size)
{
	// Snapshot the coprocessor list head and CPU count under the lock.
	// NOTE(review): the list itself is walked below after dropping the
	// lock — presumably coprocessor entries are never removed once
	// registered; confirm before changing list management.
	_coproc_lock();
	struct kd_coproc *coprocs = kd_control_trace.kdc_coprocs;
	unsigned int cpu_count = kd_control_trace.kdebug_cpus;
	_coproc_unlock();

	assert(cpu_count != 0);
	assert(coprocs == NULL || coprocs[0].cpu_id + 1 == cpu_count);

	// RAW_VERSION1 uses the compact kd_cpumap layout; everything else
	// gets the extended entries with longer names.
	bool ext = map_version != RAW_VERSION1;
	size_t stride = ext ? sizeof(kd_cpumap_ext) : sizeof(kd_cpumap);

	size_t size_needed = sizeof(kd_cpumap_header) + cpu_count * stride;
	size_t size_avail = *size;
	// Always report the needed size, even on EINVAL, so callers can retry.
	*size = size_needed;

	if (*dst == NULL) {
		kern_return_t alloc_ret = kmem_alloc(kernel_map, (vm_offset_t *)dst,
		    (vm_size_t)size_needed, KMA_DATA_SHARED | KMA_ZERO, VM_KERN_MEMORY_DIAG);
		if (alloc_ret != KERN_SUCCESS) {
			return ENOMEM;
		}
	} else if (size_avail < size_needed) {
		return EINVAL;
	}

	kd_cpumap_header *header = *dst;
	header->version_no = map_version;
	header->cpu_count = cpu_count;

	void *cpus = &header[1];
	size_t name_size = ext ? sizeof(((kd_cpumap_ext *)NULL)->name) :
	    sizeof(((kd_cpumap *)NULL)->name);

	// Coprocessors occupy the highest CPU IDs; fill entries from the end
	// of the array backwards, then fill the APs in the remaining slots.
	// The kd_cpumap_ext pointer is safe for both layouts because writes
	// are bounded by stride/name_size, and the cpu_id/flags fields lead
	// both structs.
	int i = cpu_count - 1;
	for (struct kd_coproc *cur_coproc = coprocs; cur_coproc != NULL;
	    cur_coproc = cur_coproc->next, i--) {
		kd_cpumap_ext *cpu = (kd_cpumap_ext *)((uintptr_t)cpus + stride * i);
		cpu->cpu_id = cur_coproc->cpu_id;
		cpu->flags = KDBG_CPUMAP_IS_IOP;
		strlcpy((void *)&cpu->name, cur_coproc->full_name, name_size);
	}
	for (; i >= 0; i--) {
		kd_cpumap *cpu = (kd_cpumap *)((uintptr_t)cpus + stride * i);
		cpu->cpu_id = i;
		cpu->flags = 0;
		_copy_ap_name(i, &cpu->name, name_size);
	}

	return 0;
}
1612 
1613 static void
_threadmap_init(void)1614 _threadmap_init(void)
1615 {
1616 	ktrace_assert_lock_held();
1617 
1618 	if (kd_control_trace.kdc_flags & KDBG_MAPINIT) {
1619 		return;
1620 	}
1621 
1622 	kd_mapptr = _thread_map_create_live(0, &kd_mapsize, &kd_mapcount);
1623 
1624 	if (kd_mapptr) {
1625 		kd_control_trace.kdc_flags |= KDBG_MAPINIT;
1626 	}
1627 }
1628 
// Accumulator state shared by the thread-map resolvers below.
struct kd_resolver {
	kd_threadmap *krs_map;    // destination array being filled
	vm_size_t krs_count;      // entries written so far
	vm_size_t krs_maxcount;   // capacity of krs_map
};
1634 
1635 static int
_resolve_iterator(proc_t proc,void * opaque)1636 _resolve_iterator(proc_t proc, void *opaque)
1637 {
1638 	if (proc == kernproc) {
1639 		/* Handled specially as it lacks uthreads. */
1640 		return PROC_RETURNED;
1641 	}
1642 	struct kd_resolver *resolver = opaque;
1643 	struct uthread *uth = NULL;
1644 	const char *proc_name = proc_best_name(proc);
1645 	pid_t pid = proc_getpid(proc);
1646 
1647 	proc_lock(proc);
1648 	TAILQ_FOREACH(uth, &proc->p_uthlist, uu_list) {
1649 		if (resolver->krs_count >= resolver->krs_maxcount) {
1650 			break;
1651 		}
1652 		kd_threadmap *map = &resolver->krs_map[resolver->krs_count];
1653 		map->thread = (uintptr_t)uthread_tid(uth);
1654 		(void)strlcpy(map->command, proc_name, sizeof(map->command));
1655 		map->valid = pid;
1656 		resolver->krs_count++;
1657 	}
1658 	proc_unlock(proc);
1659 
1660 	bool done = resolver->krs_count >= resolver->krs_maxcount;
1661 	return done ? PROC_RETURNED_DONE : PROC_RETURNED;
1662 }
1663 
1664 static void
_resolve_kernel_task(thread_t thread,void * opaque)1665 _resolve_kernel_task(thread_t thread, void *opaque)
1666 {
1667 	struct kd_resolver *resolver = opaque;
1668 	if (resolver->krs_count >= resolver->krs_maxcount) {
1669 		return;
1670 	}
1671 	kd_threadmap *map = &resolver->krs_map[resolver->krs_count];
1672 	map->thread = (uintptr_t)thread_tid(thread);
1673 	(void)strlcpy(map->command, "kernel_task", sizeof(map->command));
1674 	map->valid = 1;
1675 	resolver->krs_count++;
1676 }
1677 
1678 static vm_size_t
_resolve_threads(kd_threadmap * map,vm_size_t nthreads)1679 _resolve_threads(kd_threadmap *map, vm_size_t nthreads)
1680 {
1681 	struct kd_resolver resolver = {
1682 		.krs_map = map, .krs_count = 0, .krs_maxcount = nthreads,
1683 	};
1684 
1685 	// Handle kernel_task specially, as it lacks uthreads.
1686 	extern void task_act_iterate_wth_args(task_t, void (*)(thread_t, void *),
1687 	    void *);
1688 	task_act_iterate_wth_args(kernel_task, _resolve_kernel_task, &resolver);
1689 	proc_iterate(PROC_ALLPROCLIST | PROC_NOWAITTRANS, _resolve_iterator,
1690 	    &resolver, NULL, NULL);
1691 	return resolver.krs_count;
1692 }
1693 
1694 static kd_threadmap *
_thread_map_create_live(size_t maxthreads,vm_size_t * mapsize,vm_size_t * mapcount)1695 _thread_map_create_live(size_t maxthreads, vm_size_t *mapsize,
1696     vm_size_t *mapcount)
1697 {
1698 	kd_threadmap *thread_map = NULL;
1699 
1700 	assert(mapsize != NULL);
1701 	assert(mapcount != NULL);
1702 
1703 	extern int threads_count;
1704 	vm_size_t nthreads = threads_count;
1705 
1706 	// Allow 25% more threads to be started while iterating processes.
1707 	if (os_add_overflow(nthreads, nthreads / 4, &nthreads)) {
1708 		return NULL;
1709 	}
1710 
1711 	*mapcount = nthreads;
1712 	if (os_mul_overflow(nthreads, sizeof(kd_threadmap), mapsize)) {
1713 		return NULL;
1714 	}
1715 
1716 	// Wait until the out-parameters have been filled with the needed size to
1717 	// do the bounds checking on the provided maximum.
1718 	if (maxthreads != 0 && maxthreads < nthreads) {
1719 		return NULL;
1720 	}
1721 
1722 	// This allocation can be too large for `Z_NOFAIL`.
1723 	thread_map = kalloc_data_tag(*mapsize, Z_WAITOK | Z_ZERO,
1724 	    VM_KERN_MEMORY_DIAG);
1725 	if (thread_map != NULL) {
1726 		*mapcount = _resolve_threads(thread_map, nthreads);
1727 	}
1728 	return thread_map;
1729 }
1730 
// Fully tear down the tracing session: stop writers, drop filters, clear all
// configuration flags, free the trace buffers, and discard the thread map.
static void
kdbg_clear(void)
{
	kernel_debug_disable();
	kdbg_disable_typefilter();

	// Wait for any event writers to see the disable status.
	IOSleep(100);

	// Reset kdebug status for each process.
	if (kd_control_trace.kdc_flags & (KDBG_PIDCHECK | KDBG_PIDEXCLUDE)) {
		proc_list_lock();
		proc_t p;
		ALLPROC_FOREACH(p) {
			p->p_kdebug = 0;
		}
		proc_list_unlock();
	}

	// Clear every per-session configuration flag.
	kd_control_trace.kdc_flags &= (unsigned int)~KDBG_CKTYPES;
	kd_control_trace.kdc_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK);
	kd_control_trace.kdc_flags &= ~(KDBG_PIDCHECK | KDBG_PIDEXCLUDE);
	kd_control_trace.kdc_flags &= ~KDBG_CONTINUOUS_TIME;
	kd_control_trace.kdc_flags &= ~KDBG_DISABLE_COPROCS;
	kd_control_trace.kdc_flags &= ~KDBG_MATCH_DISABLE;
	kd_control_trace.kdc_flags &= ~(KDBG_NOWRAP | KDBG_WRAPPED);
	kd_control_trace.kdc_live_flags &= ~(KDBG_NOWRAP | KDBG_WRAPPED);

	kd_control_trace.kdc_oldest_time = 0;

	delete_buffers_trace();
	kd_buffer_trace.kdb_event_count = 0;

	_clear_thread_map();
}
1766 
// Reset tracing to its pristine state: clear the session and restore the
// typefilter to its default of allowing only DBG_TRACE.
void
kdebug_reset(void)
{
	ktrace_assert_lock_held();

	kdbg_clear();
	typefilter_reject_all(kdbg_typefilter);
	typefilter_allow_class(kdbg_typefilter, DBG_TRACE);
}
1776 
// Return the static early-boot event buffer to the VM system once it is no
// longer needed.
void
kdebug_free_early_buf(void)
{
#if defined(__x86_64__)
	ml_static_mfree((vm_offset_t)&kd_early_buffer, sizeof(kd_early_buffer));
#endif /* defined(__x86_64__) */
	// ARM handles this as part of the BOOTDATA segment.
}
1785 
1786 int
kdbg_setpid(kd_regtype * kdr)1787 kdbg_setpid(kd_regtype *kdr)
1788 {
1789 	pid_t pid;
1790 	int flag, ret = 0;
1791 	struct proc *p;
1792 
1793 	pid = (pid_t)kdr->value1;
1794 	flag = (int)kdr->value2;
1795 
1796 	if (pid >= 0) {
1797 		if ((p = proc_find(pid)) == NULL) {
1798 			ret = ESRCH;
1799 		} else {
1800 			if (flag == 1) {
1801 				/*
1802 				 * turn on pid check for this and all pids
1803 				 */
1804 				kd_control_trace.kdc_flags |= KDBG_PIDCHECK;
1805 				kd_control_trace.kdc_flags &= ~KDBG_PIDEXCLUDE;
1806 
1807 				p->p_kdebug = 1;
1808 			} else {
1809 				/*
1810 				 * turn off pid check for this pid value
1811 				 * Don't turn off all pid checking though
1812 				 *
1813 				 * kd_control_trace.kdc_flags &= ~KDBG_PIDCHECK;
1814 				 */
1815 				p->p_kdebug = 0;
1816 			}
1817 			proc_rele(p);
1818 		}
1819 	} else {
1820 		ret = EINVAL;
1821 	}
1822 
1823 	return ret;
1824 }
1825 
1826 /* This is for pid exclusion in the trace buffer */
1827 int
kdbg_setpidex(kd_regtype * kdr)1828 kdbg_setpidex(kd_regtype *kdr)
1829 {
1830 	pid_t pid;
1831 	int flag, ret = 0;
1832 	struct proc *p;
1833 
1834 	pid = (pid_t)kdr->value1;
1835 	flag = (int)kdr->value2;
1836 
1837 	if (pid >= 0) {
1838 		if ((p = proc_find(pid)) == NULL) {
1839 			ret = ESRCH;
1840 		} else {
1841 			if (flag == 1) {
1842 				/*
1843 				 * turn on pid exclusion
1844 				 */
1845 				kd_control_trace.kdc_flags |= KDBG_PIDEXCLUDE;
1846 				kd_control_trace.kdc_flags &= ~KDBG_PIDCHECK;
1847 
1848 				p->p_kdebug = 1;
1849 			} else {
1850 				/*
1851 				 * turn off pid exclusion for this pid value
1852 				 * Don't turn off all pid exclusion though
1853 				 *
1854 				 * kd_control_trace.kdc_flags &= ~KDBG_PIDEXCLUDE;
1855 				 */
1856 				p->p_kdebug = 0;
1857 			}
1858 			proc_rele(p);
1859 		}
1860 	} else {
1861 		ret = EINVAL;
1862 	}
1863 
1864 	return ret;
1865 }
1866 
1867 /*
1868  * The following functions all operate on the typefilter singleton.
1869  */
1870 
1871 static int
kdbg_copyin_typefilter(user_addr_t addr,size_t size)1872 kdbg_copyin_typefilter(user_addr_t addr, size_t size)
1873 {
1874 	int ret = ENOMEM;
1875 	typefilter_t tf;
1876 
1877 	ktrace_assert_lock_held();
1878 
1879 	if (size != KDBG_TYPEFILTER_BITMAP_SIZE) {
1880 		return EINVAL;
1881 	}
1882 
1883 	if ((tf = typefilter_create())) {
1884 		if ((ret = copyin(addr, tf, KDBG_TYPEFILTER_BITMAP_SIZE)) == 0) {
1885 			/* The kernel typefilter must always allow DBG_TRACE */
1886 			typefilter_allow_class(tf, DBG_TRACE);
1887 
1888 			typefilter_copy(kdbg_typefilter, tf);
1889 
1890 			kdbg_enable_typefilter();
1891 			_coproc_list_callback(KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter);
1892 		}
1893 
1894 		if (tf) {
1895 			typefilter_deallocate(tf);
1896 		}
1897 	}
1898 
1899 	return ret;
1900 }
1901 
1902 /*
1903  * Enable the flags in the control page for the typefilter.  Assumes that
1904  * kdbg_typefilter has already been allocated, so events being written
1905  * don't see a bad typefilter.
1906  */
static void
kdbg_enable_typefilter(void)
{
	// Typefilter checking supersedes range and value filtering.
	kd_control_trace.kdc_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK);
	kd_control_trace.kdc_flags |= KDBG_TYPEFILTER_CHECK;
	// Refresh the emit filter so active writers pick up the new mode.
	if (kdebug_enable) {
		kd_control_trace.kdc_emit = _trace_emit_filter();
	}
	// Publish the state change to userspace via the commpage.
	commpage_update_kdebug_state();
}
1917 
// Disable the flags in the control page for the typefilter.  The typefilter
// may be safely deallocated shortly after this function returns.
static void
kdbg_disable_typefilter(void)
{
	// Remember whether a typefilter was active so coprocessors are only
	// notified when the state actually changes.
	bool notify_coprocs = kd_control_trace.kdc_flags & KDBG_TYPEFILTER_CHECK;
	kd_control_trace.kdc_flags &= ~KDBG_TYPEFILTER_CHECK;

	// Publish the new filtering state to user space before telling
	// coprocessors.
	commpage_update_kdebug_state();

	if (notify_coprocs) {
		// Notify coprocessors that the typefilter will now allow everything.
		// Otherwise, they won't know a typefilter is no longer in effect.
		typefilter_allow_all(kdbg_typefilter);
		_coproc_list_callback(KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter);
	}
}
1935 
1936 uint32_t
kdebug_commpage_state(void)1937 kdebug_commpage_state(void)
1938 {
1939 	uint32_t state = 0;
1940 	if (kdebug_enable) {
1941 		state |= KDEBUG_COMMPAGE_ENABLE_TRACE;
1942 		if (kd_control_trace.kdc_flags & KDBG_TYPEFILTER_CHECK) {
1943 			state |= KDEBUG_COMMPAGE_ENABLE_TYPEFILTER;
1944 		}
1945 		if (kd_control_trace.kdc_flags & KDBG_CONTINUOUS_TIME) {
1946 			state |= KDEBUG_COMMPAGE_CONTINUOUS;
1947 		}
1948 	}
1949 	return state;
1950 }
1951 
/*
 * Configure the legacy range- or value-based event filters from a
 * kd_regtype request.  Exactly one mode is active at a time: setting a
 * range mode clears value checking and vice versa.  The endpoints/values
 * are stored in the kdlog_* globals; the actual filtering against them
 * happens on the emit path (not visible here).
 *
 * Returns EINVAL for an unrecognized request type, 0 otherwise.
 */
static int
kdbg_setreg(kd_regtype * kdr)
{
	switch (kdr->type) {
	case KDBG_CLASSTYPE:
		/* Filter by a range of event classes (low byte of each value). */
		kdlog_beg = KDBG_EVENTID(kdr->value1 & 0xff, 0, 0);
		kdlog_end = KDBG_EVENTID(kdr->value2 & 0xff, 0, 0);
		kd_control_trace.kdc_flags &= ~KDBG_VALCHECK;
		kd_control_trace.kdc_flags |= KDBG_RANGECHECK;
		break;
	case KDBG_SUBCLSTYPE:;
		unsigned int cls = kdr->value1 & 0xff;
		unsigned int subcls = kdr->value2 & 0xff;
		/* Range spans exactly one subclass: [subcls, subcls + 1). */
		unsigned int subcls_end = subcls + 1;
		kdlog_beg = KDBG_EVENTID(cls, subcls, 0);
		kdlog_end = KDBG_EVENTID(cls, subcls_end, 0);
		kd_control_trace.kdc_flags &= ~KDBG_VALCHECK;
		kd_control_trace.kdc_flags |= KDBG_RANGECHECK;
		break;
	case KDBG_RANGETYPE:
		/* Raw debugid range supplied directly by the caller. */
		kdlog_beg = kdr->value1;
		kdlog_end = kdr->value2;
		kd_control_trace.kdc_flags &= ~KDBG_VALCHECK;
		kd_control_trace.kdc_flags |= KDBG_RANGECHECK;
		break;
	case KDBG_VALCHECK:
		/* Up to four exact debugid values to match. */
		kdlog_value1 = kdr->value1;
		kdlog_value2 = kdr->value2;
		kdlog_value3 = kdr->value3;
		kdlog_value4 = kdr->value4;
		kd_control_trace.kdc_flags &= ~KDBG_RANGECHECK;
		kd_control_trace.kdc_flags |= KDBG_VALCHECK;
		break;
	case KDBG_TYPENONE:
		/* Clear all range/value filtering. */
		kd_control_trace.kdc_flags &= ~(KDBG_RANGECHECK | KDBG_VALCHECK);
		kdlog_beg = 0;
		kdlog_end = 0;
		break;
	default:
		return EINVAL;
	}
	/* Recompute the emit filter so the new mode takes effect immediately. */
	if (kdebug_enable) {
		kd_control_trace.kdc_emit = _trace_emit_filter();
	}
	return 0;
}
1998 
1999 static int
_copyin_event_disable_mask(user_addr_t uaddr,size_t usize)2000 _copyin_event_disable_mask(user_addr_t uaddr, size_t usize)
2001 {
2002 	if (usize < 2 * sizeof(kd_event_matcher)) {
2003 		return ERANGE;
2004 	}
2005 	int ret = copyin(uaddr, &kd_control_trace.disable_event_match,
2006 	    sizeof(kd_event_matcher));
2007 	if (ret != 0) {
2008 		return ret;
2009 	}
2010 	ret = copyin(uaddr + sizeof(kd_event_matcher),
2011 	    &kd_control_trace.disable_event_mask, sizeof(kd_event_matcher));
2012 	if (ret != 0) {
2013 		memset(&kd_control_trace.disable_event_match, 0,
2014 		    sizeof(kd_event_matcher));
2015 		return ret;
2016 	}
2017 	return 0;
2018 }
2019 
2020 static int
_copyout_event_disable_mask(user_addr_t uaddr,size_t usize)2021 _copyout_event_disable_mask(user_addr_t uaddr, size_t usize)
2022 {
2023 	if (usize < 2 * sizeof(kd_event_matcher)) {
2024 		return ERANGE;
2025 	}
2026 	int ret = copyout(&kd_control_trace.disable_event_match, uaddr,
2027 	    sizeof(kd_event_matcher));
2028 	if (ret != 0) {
2029 		return ret;
2030 	}
2031 	ret = copyout(&kd_control_trace.disable_event_mask,
2032 	    uaddr + sizeof(kd_event_matcher), sizeof(kd_event_matcher));
2033 	if (ret != 0) {
2034 		return ret;
2035 	}
2036 	return 0;
2037 }
2038 
/*
 * Copy the CPU map out to user space, or report its required size.
 *
 * If `udst` is NULL, only the size is reported back through `usize`.
 * Requires the trace buffers to be initialized, since the CPU map is
 * built from them.  Returns EINVAL if they are not, or any error from
 * _copy_cpu_map/copyout.
 */
static errno_t
_copyout_cpu_map(int map_version, user_addr_t udst, size_t *usize)
{
	if ((kd_control_trace.kdc_flags & KDBG_BUFINIT) == 0) {
		return EINVAL;
	}

	void *cpu_map = NULL;
	size_t size = 0;
	int error = _copy_cpu_map(map_version, &cpu_map, &size);
	if (0 == error) {
		if (udst) {
			/* Copy no more than the user buffer can hold. */
			size_t copy_size = MIN(*usize, size);
			error = copyout(cpu_map, udst, copy_size);
		}
		/* Always report the full size, even on a truncated copy. */
		*usize = size;
		kmem_free(kernel_map, (vm_offset_t)cpu_map, size);
	}
	if (EINVAL == error && 0 == udst) {
		*usize = size;
		// User space only needs the size if it passes NULL.
		error = 0;
	}
	return error;
}
2064 
2065 int
kdbg_readcurthrmap(user_addr_t buffer,size_t * bufsize)2066 kdbg_readcurthrmap(user_addr_t buffer, size_t *bufsize)
2067 {
2068 	kd_threadmap *mapptr;
2069 	vm_size_t mapsize;
2070 	vm_size_t mapcount;
2071 	int ret = 0;
2072 	size_t count = *bufsize / sizeof(kd_threadmap);
2073 
2074 	*bufsize = 0;
2075 
2076 	if ((mapptr = _thread_map_create_live(count, &mapsize, &mapcount))) {
2077 		if (copyout(mapptr, buffer, mapcount * sizeof(kd_threadmap))) {
2078 			ret = EFAULT;
2079 		} else {
2080 			*bufsize = (mapcount * sizeof(kd_threadmap));
2081 		}
2082 
2083 		kfree_data(mapptr, mapsize);
2084 	} else {
2085 		ret = EINVAL;
2086 	}
2087 
2088 	return ret;
2089 }
2090 static void
_clear_thread_map(void)2091 _clear_thread_map(void)
2092 {
2093 	ktrace_assert_lock_held();
2094 
2095 	if (kd_control_trace.kdc_flags & KDBG_MAPINIT) {
2096 		assert(kd_mapptr != NULL);
2097 		kfree_data(kd_mapptr, kd_mapsize);
2098 		kd_mapptr = NULL;
2099 		kd_mapsize = 0;
2100 		kd_mapcount = 0;
2101 		kd_control_trace.kdc_flags &= ~KDBG_MAPINIT;
2102 	}
2103 }
2104 
2105 /*
2106  * Write out a version 1 header and the thread map, if it is initialized, to a
2107  * vnode.  Used by KDWRITEMAP and kdbg_dump_trace_to_file.
2108  *
2109  * Returns write errors from vn_rdwr if a write fails.  Returns ENODATA if the
2110  * thread map has not been initialized, but the header will still be written.
2111  * Returns ENOMEM if padding could not be allocated.  Returns 0 otherwise.
2112  */
static int
kdbg_write_thread_map(struct kd_dest *dest)
{
	ktrace_assert_lock_held();
	if (dest->kdd_kind != KD_DEST_VFS) {
		panic("kdebug: must write thread map to VFS");
	}

	bool map_initialized = (kd_control_trace.kdc_flags & KDBG_MAPINIT);
	/* The legacy header is written even when no thread map exists. */
	int ret = _write_legacy_header(map_initialized, dest);
	if (ret == 0) {
		if (map_initialized) {
			/* The map is consumed by writing it out -- release it. */
			_clear_thread_map();
		} else {
			ret = ENODATA;
		}
	}
	return ret;
}
2132 
2133 /*
2134  * Copy out the thread map to a user space buffer.  Used by KDTHRMAP.
2135  *
2136  * Returns copyout errors if the copyout fails.  Returns ENODATA if the thread
2137  * map has not been initialized.  Returns EINVAL if the buffer provided is not
2138  * large enough for the entire thread map.  Returns 0 otherwise.
2139  */
2140 static int
kdbg_copyout_thread_map(user_addr_t buffer,size_t * buffer_size)2141 kdbg_copyout_thread_map(user_addr_t buffer, size_t *buffer_size)
2142 {
2143 	bool map_initialized;
2144 	size_t map_size;
2145 	int ret = 0;
2146 
2147 	ktrace_assert_lock_held();
2148 	assert(buffer_size != NULL);
2149 
2150 	map_initialized = (kd_control_trace.kdc_flags & KDBG_MAPINIT);
2151 	if (!map_initialized) {
2152 		return ENODATA;
2153 	}
2154 
2155 	map_size = kd_mapcount * sizeof(kd_threadmap);
2156 	if (*buffer_size < map_size) {
2157 		return EINVAL;
2158 	}
2159 
2160 	ret = copyout(kd_mapptr, buffer, map_size);
2161 	if (ret == 0) {
2162 		_clear_thread_map();
2163 	}
2164 
2165 	return ret;
2166 }
2167 
/*
 * Record the requested trace event count in kd_buffer_trace, clamped so
 * the eventual buffer allocation stays within safe limits.
 */
static void
kdbg_set_nkdbufs_trace(unsigned int req_nkdbufs_trace)
{
	/*
	 * Only allow allocations of up to half the kernel's data range or "sane
	 * size", whichever is smaller.
	 */
	kmem_range_id_t range_id = kmem_needs_data_share_range() ?
	    KMEM_RANGE_ID_DATA_SHARED : KMEM_RANGE_ID_DATA;
	const uint64_t max_nkdbufs_trace_64 =
	    MIN(kmem_range_id_size(range_id), sane_size) / 2 /
	    sizeof(kd_buf);
	/*
	 * Can't allocate more than 2^38 (2^32 * 64) bytes of events without
	 * switching to a 64-bit event count; should be fine.
	 */
	const unsigned int max_nkdbufs_trace =
	    (unsigned int)MIN(max_nkdbufs_trace_64, UINT_MAX);

	kd_buffer_trace.kdb_event_count = MIN(req_nkdbufs_trace, max_nkdbufs_trace);
}
2189 
2190 /*
2191  * Block until there are `kd_buffer_trace.kdb_storage_threshold` storage units filled with
2192  * events or `timeout_ms` milliseconds have passed.  If `locked_wait` is true,
2193  * `ktrace_lock` is held while waiting.  This is necessary while waiting to
2194  * write events out of the buffers.
2195  *
2196  * Returns true if the threshold was reached and false otherwise.
2197  *
2198  * Called with `ktrace_lock` locked and interrupts enabled.
2199  */
static bool
kdbg_wait(uint64_t timeout_ms)
{
	int wait_result = THREAD_AWAKENED;
	uint64_t deadline_mach = 0;

	ktrace_assert_lock_held();

	if (timeout_ms != 0) {
		/* Convert the relative timeout into an absolute deadline. */
		uint64_t ns = timeout_ms * NSEC_PER_MSEC;
		nanoseconds_to_absolutetime(ns, &deadline_mach);
		clock_absolutetime_interval_to_deadline(deadline_mach, &deadline_mach);
	}

	bool s = ml_set_interrupts_enabled(false);
	if (!s) {
		panic("kdbg_wait() called with interrupts disabled");
	}
	lck_spin_lock_grp(&kd_wait_lock, &kdebug_lck_grp);

	/* drop the mutex to allow others to access trace */
	ktrace_unlock();

	while (wait_result == THREAD_AWAKENED &&
	    kd_control_trace.kdc_storage_used < kd_buffer_trace.kdb_storage_threshold) {
		kd_waiter = true;

		/* Sleeping drops kd_wait_lock and re-acquires it on wakeup. */
		if (deadline_mach) {
			wait_result = lck_spin_sleep_deadline(&kd_wait_lock, 0, &kd_waiter,
			    THREAD_ABORTSAFE, deadline_mach);
		} else {
			wait_result = lck_spin_sleep(&kd_wait_lock, 0, &kd_waiter,
			    THREAD_ABORTSAFE);
		}
	}

	/* Check the threshold under the lock before releasing it. */
	bool threshold_exceeded = (kd_control_trace.kdc_storage_used >= kd_buffer_trace.kdb_storage_threshold);

	lck_spin_unlock(&kd_wait_lock);
	ml_set_interrupts_enabled(s);

	/* Re-take the ktrace lock that was dropped above. */
	ktrace_lock();

	return threshold_exceeded;
}
2245 
2246 /*
2247  * Wakeup a thread waiting using `kdbg_wait` if there are at least
2248  * `kd_buffer_trace.kdb_storage_threshold` storage units in use.
2249  */
static void
_try_wakeup_waiter(void)
{
	bool need_kds_wakeup = false;

	/*
	 * Try to take the lock here to synchronize with the waiter entering
	 * the blocked state.  Use the try mode to prevent deadlocks caused by
	 * re-entering this routine due to various trace points triggered in the
	 * lck_spin_sleep_xxxx routines used to actually enter one of our 2 wait
	 * conditions.  No problem if we fail, there will be lots of additional
	 * events coming in that will eventually succeed in grabbing this lock.
	 */
	bool s = ml_set_interrupts_enabled(false);

	if (lck_spin_try_lock(&kd_wait_lock)) {
		if (kd_waiter &&
		    (kd_control_trace.kdc_storage_used >= kd_buffer_trace.kdb_storage_threshold)) {
			/* Clear the flag under the lock; the waiter re-checks itself. */
			kd_waiter = 0;
			need_kds_wakeup = true;
		}
		lck_spin_unlock(&kd_wait_lock);
	}

	ml_set_interrupts_enabled(s);

	if (need_kds_wakeup == true) {
		/* Issue the wakeup after dropping the spin lock. */
		wakeup(&kd_waiter);
	}
}
2280 
2281 static void
_wakeup_waiter(void)2282 _wakeup_waiter(void)
2283 {
2284 	bool was_waiting = false;
2285 	bool s = ml_set_interrupts_enabled(false);
2286 	lck_spin_lock(&kd_wait_lock);
2287 	if (kd_waiter) {
2288 		was_waiting = true;
2289 		kd_waiter = 0;
2290 	}
2291 	lck_spin_unlock(&kd_wait_lock);
2292 	ml_set_interrupts_enabled(s);
2293 
2294 	if (was_waiting) {
2295 		wakeup(&kd_waiter);
2296 	}
2297 }
2298 
/*
 * Return the storage unit `kdsp_raw` -- expected to be at the head of
 * CPU `cpu`'s list -- to the free list.  A no-op if it is no longer the
 * list head by the time the storage lock is taken (e.g. it was stolen).
 */
static void
_storage_free(struct kd_control *kd_ctrl_page, struct kd_buffer *kd_data_page, int cpu, uint32_t kdsp_raw)
{
	struct  kd_storage *kdsp_actual;
	struct kd_bufinfo *kdbp;
	union kds_ptr kdsp;

	kdbp = &kd_data_page->kdb_info[cpu];

	kdsp.raw = kdsp_raw;

	int intrs_en = kdebug_storage_lock(kd_ctrl_page);

	if (kdsp.raw == kdbp->kd_list_head.raw) {
		/*
		 * it's possible for the storage unit pointed to
		 * by kdsp to have already been stolen... so
		 * check to see if it's still the head of the list
		 * now that we're behind the lock that protects
		 * adding and removing from the queue...
		 * since we only ever release and steal units from
		 * that position, if it's no longer the head
		 * we having nothing to do in this context
		 */
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp);
		kdbp->kd_list_head = kdsp_actual->kds_next;

		/* Push the unit onto the global free list. */
		kdsp_actual->kds_next = kd_ctrl_page->kds_free_list;
		kd_ctrl_page->kds_free_list = kdsp;

		kd_ctrl_page->kdc_storage_used--;
	}

	kdebug_storage_unlock(kd_ctrl_page, intrs_en);
}
2334 
/*
 * Prepare the control page for a read pass: stash the current emit
 * filter and live flags (so `_reading_restore_flags` can put them back)
 * and set KDBG_NOWRAP so storage units are not stolen while events are
 * being inspected.
 *
 * Returns whether the buffers had wrapped before the read started.
 */
static bool
_reading_set_flags(
	struct kd_control *ctl,
	kdebug_emit_filter_t *old_emit,
	kdebug_live_flags_t *old_live)
{
	int intrs_en = kdebug_storage_lock(ctl);

	*old_emit = ctl->kdc_emit;
	*old_live = ctl->kdc_live_flags;

	bool wrapped = ctl->kdc_live_flags & KDBG_WRAPPED;
	ctl->kdc_live_flags |= KDBG_NOWRAP;

	kdebug_storage_unlock(ctl, intrs_en);

	return wrapped;
}
2353 
/*
 * Restore the emit filter and live flags stashed by `_reading_set_flags`
 * after a read pass.  Returns whether tracing was disabled while the
 * read was in progress.
 */
static bool
_reading_restore_flags(
	struct kd_control *ctl,
	kdebug_emit_filter_t old_emit,
	kdebug_live_flags_t old_live)
{
	int intrs_en = kdebug_storage_lock(ctl);
	bool disabled_during_read = !ctl->enabled;
	// The wrapped bit was handled already, by adding a lost-events event, don't
	// replace it.
	ctl->kdc_live_flags = old_live & ~KDBG_WRAPPED;
	bool was_wrapping = (old_live & KDBG_NOWRAP) == 0;
	// Only re-enable trace if the reader causes lost events if wrapping was
	// previously enabled.
	if (was_wrapping && old_emit) {
		ctl->kdc_emit = old_emit;
	}
	kdebug_storage_unlock(ctl, intrs_en);
	return disabled_during_read;
}
2374 
2375 static inline void
_clear_oldest_lostevents(void)2376 _clear_oldest_lostevents(void)
2377 {
2378 	for (unsigned int cpu = 0; cpu < kd_control_trace.kdebug_cpus; cpu++) {
2379 		struct kd_bufinfo *info = &kd_buffer_trace.kdb_info[cpu];
2380 		union kds_ptr oldest_ptr = info->kd_list_head;
2381 		if (oldest_ptr.raw != KDS_PTR_NULL) {
2382 			struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, oldest_ptr);
2383 			store->kds_lostevents = false;
2384 		}
2385 	}
2386 }
2387 
2388 static inline bool
_event_should_disable(kd_buf * event)2389 _event_should_disable(kd_buf *event)
2390 {
2391 	if ((kd_control_trace.kdc_flags & KDBG_MATCH_DISABLE) == 0) {
2392 		return false;
2393 	}
2394 	kd_event_matcher *match = &kd_control_trace.disable_event_match;
2395 	kd_event_matcher *mask = &kd_control_trace.disable_event_mask;
2396 	return (event->debugid & mask->kem_debugid) == match->kem_debugid &&
2397 	       (event->arg1 & mask->kem_args[0]) == match->kem_args[0] &&
2398 	       (event->arg2 & mask->kem_args[1]) == match->kem_args[1] &&
2399 	       (event->arg3 & mask->kem_args[2]) == match->kem_args[2] &&
2400 	       (event->arg4 & mask->kem_args[3]) == match->kem_args[3];
2401 }
2402 
/*
 * Advance the read cursor of `store` past the event just consumed.  When
 * the storage unit becomes fully read, free it and return the next
 * oldest unit on this CPU's list (updating `store_ptr` to match), or
 * NULL if the CPU has no more events.
 */
static inline struct kd_storage *
_store_read_inc(struct kd_storage *store, struct kd_bufinfo *info,
    unsigned int cpu, union kds_ptr *store_ptr)
{
	store->kds_readlast++;
	if (store->kds_readlast < kd_control_trace.kdebug_events_per_storage_unit) {
		return store;
	}
	/* This unit is exhausted -- release it and move to the next one. */
	_storage_free(&kd_control_trace, &kd_buffer_trace, cpu, store_ptr->raw);
	union kds_ptr oldest_ptr = info->kd_list_head;
	if (oldest_ptr.raw == KDS_PTR_NULL) {
		return NULL;
	}
	*store_ptr = oldest_ptr;
	return POINTER_FROM_KDS_PTR(kd_buffer_trace.kd_bufs, oldest_ptr);
}
2419 
/*
 * Return the timestamp of the earliest readable event on CPU `cpu` that
 * falls within [min, max], consuming (skipping past) any events older
 * than `min`.  Returns UINT64_MAX when no qualifying event is available.
 */
static inline uint64_t
_store_earliest_timestamp(
	struct kd_storage *store,
	uint64_t min,
	uint64_t max,
	struct kd_bufinfo *info,
	unsigned int cpu,
	union kds_ptr store_ptr)
{
	while (true) {
		uint32_t rcursor = store->kds_readlast;
		if (rcursor == store->kds_bufindx) {
			// Out of events to read on this store.
			return UINT64_MAX;
		}
		uint64_t t = store->kds_records[rcursor].timestamp;
		if (t > max) {
			return UINT64_MAX;
		} else if (__improbable(t < store->kds_timestamp)) {
			// This can only happen for coprocessors that haven't
			// finished emitting this event, it will be processed the
			// next time through.
			return UINT64_MAX;
		} else if (t >= min) {
			return t;
		}
		// Skip to the next event.
		store = _store_read_inc(store, info, cpu, &store_ptr);
		if (!store) {
			return UINT64_MAX;
		}
	}
}
2453 
2454 static int
_read_trace_events_internal(struct kd_dest * dest,size_t event_count,uint64_t barrier_max,bool wrapped,bool * should_disable,size_t * events_written)2455 _read_trace_events_internal(struct kd_dest *dest, size_t event_count,
2456     uint64_t barrier_max, bool wrapped, bool *should_disable,
2457     size_t *events_written)
2458 {
2459 	bool traced_retrograde = false;
2460 	bool out_of_events = false;
2461 	bool const wrapping_enabled = !(kd_control_trace.kdc_flags & KDBG_NOWRAP);
2462 
2463 	struct kd_bufinfo *kdbip = kd_buffer_trace.kdb_info;
2464 	struct kd_region *kd_bufs = kd_buffer_trace.kd_bufs;
2465 
2466 	event_count = MIN(event_count, kd_buffer_trace.kdb_event_count);
2467 
2468 	if (wrapped) {
2469 		// If buffers have wrapped, do not emit additional lost events for the
2470 		// oldest storage units.
2471 		_clear_oldest_lostevents();
2472 	}
2473 
2474 	uint64_t barrier_min = kd_control_trace.kdc_oldest_time;
2475 
2476 	while (event_count && !out_of_events) {
2477 		kd_buf *tempbuf = kd_buffer_trace.kdcopybuf;
2478 		uint32_t used_count = 0;
2479 
2480 		size_t avail_count = MIN(event_count, kd_control_trace.kdebug_kdcopybuf_count);
2481 		while (used_count < avail_count) {
2482 			bool lostevents = false;
2483 			int lostcpu = -1;
2484 			uint64_t earliest_time = UINT64_MAX;
2485 			int min_cpu = -1;
2486 
2487 			// Find the earliest event from all the oldest storage units.
2488 			for (unsigned int cpu = 0; cpu < kd_control_trace.kdebug_cpus; cpu++) {
2489 				struct kd_bufinfo *info = &kdbip[cpu];
2490 				union kds_ptr oldest_ptr = info->kd_list_head;
2491 				if (oldest_ptr.raw == KDS_PTR_NULL) {
2492 					continue;
2493 				}
2494 				struct kd_storage *store = POINTER_FROM_KDS_PTR(kd_bufs, oldest_ptr);
2495 
2496 				// If the storage unit was stolen, make sure to emit a lost
2497 				// events event with the earliest time to expect an event stream
2498 				// with no gaps.
2499 				if (__improbable(store->kds_lostevents)) {
2500 					store->kds_lostevents = false;
2501 					lostevents = true;
2502 					uint64_t lost_time = store->kds_records[0].timestamp;
2503 					if (kd_control_trace.kdc_oldest_time < lost_time) {
2504 						// This time is now the oldest that can be read to
2505 						// ensure an event stream with no gaps from this point
2506 						// forward.
2507 						kd_control_trace.kdc_oldest_time = barrier_min = lost_time;
2508 						lostcpu = cpu;
2509 					}
2510 					continue;
2511 				} else if (__improbable(lostevents)) {
2512 					// On lost events, just find the latest timestamp of the
2513 					// gaps.
2514 					continue;
2515 				}
2516 
2517 				uint64_t t = _store_earliest_timestamp(store, barrier_min,
2518 				    barrier_max, info, cpu, oldest_ptr);
2519 				if (t < earliest_time) {
2520 					earliest_time = t;
2521 					min_cpu = cpu;
2522 				}
2523 			}
2524 			if (lostevents) {
2525 				wrapped = false;
2526 				// Only emit a lost events event if the user allowed wrapping.
2527 				if (wrapping_enabled) {
2528 					tempbuf[used_count++] = (kd_buf){
2529 						.debugid = TRACE_LOST_EVENTS,
2530 						.timestamp = barrier_min,
2531 						.cpuid = lostcpu,
2532 						.arg1 = 1,
2533 					};
2534 				}
2535 				continue;
2536 			}
2537 			if (min_cpu == -1) {
2538 				out_of_events = true;
2539 				break;
2540 			}
2541 			if (wrapped) {
2542 				// Emit a single lost events event in the case of expected
2543 				// wrapping.
2544 				wrapped = false;
2545 				if (wrapping_enabled) {
2546 					tempbuf[used_count++] = (kd_buf){
2547 						.debugid = TRACE_LOST_EVENTS,
2548 						.timestamp = barrier_min,
2549 					};
2550 				}
2551 			}
2552 
2553 			struct kd_bufinfo *min_info = &kdbip[min_cpu];
2554 			union kds_ptr oldest_ptr = min_info->kd_list_head;
2555 			struct kd_storage *min_store = POINTER_FROM_KDS_PTR(kd_bufs, oldest_ptr);
2556 			kd_buf *earliest_event = &min_store->kds_records[min_store->kds_readlast];
2557 
2558 			if (__improbable(min_info->latest_past_event_timestamp != 0)) {
2559 				if (__improbable(kdbg_debug)) {
2560 					printf("kdebug: PAST EVENT: debugid %#8x: "
2561 					    "time %lld from CPU %u "
2562 					    "(barrier at time %lld)\n",
2563 					    earliest_event->debugid,
2564 					    min_info->latest_past_event_timestamp, min_cpu,
2565 					    barrier_min);
2566 				}
2567 				tempbuf[used_count++] = (kd_buf){
2568 					.timestamp = earliest_time,
2569 					.cpuid = min_cpu,
2570 					.arg1 = (kd_buf_argtype)min_info->latest_past_event_timestamp,
2571 					.arg2 = 0,
2572 					.arg3 = 0,
2573 					.arg4 = 0,
2574 					.debugid = TRACE_PAST_EVENTS,
2575 				};
2576 				min_info->latest_past_event_timestamp = 0;
2577 				continue;
2578 			}
2579 
2580 			if (__improbable(_event_should_disable(earliest_event))) {
2581 				*should_disable = true;
2582 			}
2583 			tempbuf[used_count] = *earliest_event;
2584 			(void)_store_read_inc(min_store, min_info, min_cpu, &oldest_ptr);
2585 			if (__improbable(earliest_time < min_info->kd_prev_timebase)) {
2586 				if (traced_retrograde) {
2587 					continue;
2588 				}
2589 				traced_retrograde = true;
2590 
2591 				if (__improbable(kdbg_debug)) {
2592 					printf("kdebug: RETRO EVENT: debugid %#8x: "
2593 					    "time %lld from CPU %u "
2594 					    "(previous earliest at time %lld)\n",
2595 					    tempbuf[used_count].debugid,
2596 					    earliest_time, min_cpu, min_info->kd_prev_timebase);
2597 				}
2598 
2599 				tempbuf[used_count] = (kd_buf){
2600 					.timestamp = min_info->kd_prev_timebase,
2601 					.cpuid = tempbuf[used_count].cpuid,
2602 					.arg1 = tempbuf->debugid,
2603 					.arg2 = (kd_buf_argtype)earliest_time,
2604 					.arg3 = 0,
2605 					.arg4 = 0,
2606 					.debugid = TRACE_RETROGRADE_EVENTS,
2607 				};
2608 			} else {
2609 				min_info->kd_prev_timebase = earliest_time;
2610 			}
2611 			used_count++;
2612 		}
2613 
2614 		if (used_count > 0) {
2615 			/*
2616 			 * Remember the latest timestamp of events that we've merged so we
2617 			 * don't think we've lost events later.
2618 			 */
2619 			uint64_t latest_time = tempbuf[used_count - 1].timestamp;
2620 			if (kd_control_trace.kdc_oldest_time < latest_time) {
2621 				kd_control_trace.kdc_oldest_time = latest_time;
2622 			}
2623 
2624 			int error = _send_events(dest, kd_buffer_trace.kdcopybuf, used_count);
2625 			if (error != 0) {
2626 				// XXX Why zero this when some events may have been written?
2627 				*events_written = 0;
2628 				return error;
2629 			}
2630 			event_count -= used_count;
2631 			*events_written += used_count;
2632 		}
2633 	}
2634 	return 0;
2635 }
2636 
2637 // Read events from kdebug storage units into a user space buffer or file.
2638 //
2639 // This code runs while events are emitted -- storage unit allocation and
2640 // deallocation will synchronize with the emitters under the storage lock.
2641 // Otherwise, mutual exclusion for this function must be provided by the caller,
2642 // typically using the ktrace lock.
static int
_read_trace_events(struct kd_dest *dest, size_t event_count, size_t *events_written)
{
	bool should_disable = false;
	// Stash the enable state so tracing can be re-enabled below if it was
	// only disabled transiently while reading.
	int const prev_kdebug_enable = kdebug_enable;
	*events_written = 0;
	if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT) || kd_buffer_trace.kdcopybuf == NULL) {
		return EINVAL;
	}
	// Avoid holding storage units (and delaying emitters) while preempted.
	thread_set_eager_preempt(current_thread());

	/*
	 * Capture the current time.  Only sort events that have occured
	 * before now.  Since the IOPs are being flushed here, it is possible
	 * that events occur on the AP while running live tracing.
	 */
	uint64_t barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;

	// Disable wrap so storage units cannot be stolen while inspecting events.
	//
	// With ktrace_lock held, no other control threads can be modifying
	// kdc_flags.  The code that emits new events could be running, but
	// acquiring new storage units requires holding the storage lock, and it
	// looks at the flags there.  The only issue is if events are being written
	// to the same chunk being read from.
	kdebug_emit_filter_t old_emit;
	kdebug_live_flags_t old_live_flags;
	bool wrapped = _reading_set_flags(&kd_control_trace, &old_emit, &old_live_flags);
	bool const no_wrapping = old_live_flags & KDBG_NOWRAP;
	int error = _read_trace_events_internal(dest, event_count, barrier_max,
	    wrapped, &should_disable, events_written);
	bool disabled_during_read = _reading_restore_flags(&kd_control_trace, old_emit,
	    old_live_flags);
	// A disable request observed mid-read in no-wrap mode is honored below.
	should_disable = should_disable || (disabled_during_read && no_wrapping);

	thread_clear_eager_preempt(current_thread());

	if (should_disable) {
		kernel_debug_disable();
	} else if (disabled_during_read && !no_wrapping && old_emit) {
		// Tracing was disabled transiently by the read itself -- restore
		// the pre-read enable state and republish it to the commpage.
		kd_control_trace.kdc_emit = old_emit;
		kdebug_enable = prev_kdebug_enable;
		kd_control_trace.enabled = 1;
		commpage_update_kdebug_state();
	}

	return error;
}
2691 
2692 static int
_read_merged_trace_events(struct kd_dest * dest,size_t event_count,size_t * events_written)2693 _read_merged_trace_events(struct kd_dest *dest, size_t event_count, size_t *events_written)
2694 {
2695 	ktrace_assert_lock_held();
2696 	if (event_count == 0 || !(kd_control_trace.kdc_flags & KDBG_BUFINIT) ||
2697 	    kd_buffer_trace.kdcopybuf == 0) {
2698 		*events_written = 0;
2699 		return EINVAL;
2700 	}
2701 
2702 	// Before merging, make sure coprocessors have provided up-to-date events.
2703 	_coproc_list_callback(KD_CALLBACK_SYNC_FLUSH, NULL);
2704 	return _read_trace_events(dest, event_count, events_written);
2705 }
2706 
/*
 * Header preceding each chunk of events in the v3 ("chunked") trace
 * stream written by `_send_event_chunk_header`.
 */
struct event_chunk_header {
	uint32_t tag;       /* chunk type, e.g. V3_RAW_EVENTS */
	uint32_t sub_tag;
	uint64_t length;    /* payload length in bytes */
	uint64_t future_events_timestamp;
};
2713 
/*
 * Write `size` bytes to the destination vnode at its current offset.
 * Returns any error from vn_rdwr.
 */
static int
_send_data_vfs(struct kd_dest *dest, const void *src, size_t size)
{
	assert(size < INT_MAX);
	assert(dest->kdd_kind == KD_DEST_VFS);
	return vn_rdwr(UIO_WRITE, dest->kdd_vnode, (caddr_t)(uintptr_t)src,
	           (int)size, dest->kdd_cur_offset, UIO_SYSSPACE, IO_NODELOCKED | IO_UNIT,
	           vfs_context_ucred(&dest->kdd_vfs_ctx), (int *) 0,
	           vfs_context_proc(&dest->kdd_vfs_ctx));
}
2724 
/*
 * Send `size` bytes to the destination -- either copied out to a user
 * buffer or written to a vnode -- advancing the destination offset on
 * success.  Returns ERANGE if a user buffer has insufficient space
 * remaining, or any copyout/write error.
 */
static int
_send_data(struct kd_dest *dest, const void *src, size_t size)
{
	int error = 0;
	switch (dest->kdd_kind) {
	case KD_DEST_COPYOUT:
		if (size > dest->kdd_user_size - dest->kdd_cur_offset) {
			return ERANGE;
		}
		error = copyout(src, dest->kdd_user_buffer + dest->kdd_cur_offset, size);
		break;
	case KD_DEST_VFS:
		error = _send_data_vfs(dest, src, size);
		// XXX Previous code flushed with `VNOP_FSYNC` every 2MB, still needed?
		break;
	default:
		panic("kdebug: unrecognized destination %d", dest->kdd_kind);
	}
	if (error == 0) {
		dest->kdd_cur_offset += size;
	}
	return error;
}
2748 
2749 static int
_send_event_chunk_header(struct kd_dest * dest,size_t event_count)2750 _send_event_chunk_header(struct kd_dest *dest, size_t event_count)
2751 {
2752 	struct event_chunk_header header = {
2753 		.tag = V3_RAW_EVENTS,
2754 		.sub_tag = 1,
2755 		.length = event_count * sizeof(kd_buf),
2756 	};
2757 
2758 	return _send_data(dest, &header, sizeof(header));
2759 }
2760 
2761 int
_send_events(struct kd_dest * dest,const void * src,size_t event_count)2762 _send_events(struct kd_dest *dest, const void *src, size_t event_count)
2763 {
2764 	if (dest->kdd_chunk_format) {
2765 		int error = _send_event_chunk_header(dest, event_count);
2766 		if (error != 0) {
2767 			return error;
2768 		}
2769 	}
2770 	return _send_data(dest, src, event_count * sizeof(kd_buf));
2771 }
2772 
2773 static int
_write_legacy_header(bool write_thread_map,struct kd_dest * dest)2774 _write_legacy_header(bool write_thread_map, struct kd_dest *dest)
2775 {
2776 	uint32_t pad_size;
2777 	uint32_t extra_thread_count = 0;
2778 	uint32_t cpumap_size;
2779 	size_t map_size = 0;
2780 	uint32_t map_count = 0;
2781 
2782 	if (write_thread_map) {
2783 		assert(kd_control_trace.kdc_flags & KDBG_MAPINIT);
2784 		if (kd_mapcount > UINT32_MAX) {
2785 			return ERANGE;
2786 		}
2787 		map_count = (uint32_t)kd_mapcount;
2788 		if (os_mul_overflow(map_count, sizeof(kd_threadmap), &map_size)) {
2789 			return ERANGE;
2790 		}
2791 		if (map_size >= INT_MAX) {
2792 			return ERANGE;
2793 		}
2794 	}
2795 
2796 	/*
2797 	 * Without the buffers initialized, we cannot construct a CPU map or a
2798 	 * thread map, and cannot write a header.
2799 	 */
2800 	if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT)) {
2801 		return EINVAL;
2802 	}
2803 
2804 	/*
2805 	 * To write a RAW_VERSION1+ file, we must embed a cpumap in the
2806 	 * "padding" used to page align the events following the threadmap. If
2807 	 * the threadmap happens to not require enough padding, we artificially
2808 	 * increase its footprint until it needs enough padding.
2809 	 */
2810 
2811 	pad_size = 16384 - ((sizeof(RAW_header) + map_size) & PAGE_MASK);
2812 	cpumap_size = sizeof(kd_cpumap_header) + kd_control_trace.kdebug_cpus * sizeof(kd_cpumap);
2813 
2814 	if (cpumap_size > pad_size) {
2815 		/* If the cpu map doesn't fit in the current available pad_size,
2816 		 * we increase the pad_size by 16K. We do this so that the event
2817 		 * data is always  available on a page aligned boundary for both
2818 		 * 4k and 16k systems. We enforce this alignment for the event
2819 		 * data so that we can take advantage of optimized file/disk writes.
2820 		 */
2821 		pad_size += 16384;
2822 	}
2823 
2824 	/* The way we are silently embedding a cpumap in the "padding" is by artificially
2825 	 * increasing the number of thread entries. However, we'll also need to ensure that
2826 	 * the cpumap is embedded in the last 4K page before when the event data is expected.
2827 	 * This way the tools can read the data starting the next page boundary on both
2828 	 * 4K and 16K systems preserving compatibility with older versions of the tools
2829 	 */
2830 	if (pad_size > 4096) {
2831 		pad_size -= 4096;
2832 		extra_thread_count = (pad_size / sizeof(kd_threadmap)) + 1;
2833 	}
2834 
2835 	int error = 0;
2836 	do {
2837 		clock_sec_t secs;
2838 		clock_usec_t usecs;
2839 		clock_get_calendar_microtime(&secs, &usecs);
2840 		RAW_header header = {
2841 			.version_no = RAW_VERSION1,
2842 			.thread_count = map_count + extra_thread_count,
2843 			.TOD_secs = secs,
2844 			.TOD_usecs = usecs,
2845 		};
2846 		error = _send_data(dest, &header, sizeof(header));
2847 		if (error != 0) {
2848 			break;
2849 		}
2850 
2851 		if (write_thread_map) {
2852 			error = _send_data(dest, kd_mapptr, map_size);
2853 			if (error != 0) {
2854 				break;
2855 			}
2856 		}
2857 
2858 		if (extra_thread_count) {
2859 			pad_size = extra_thread_count * sizeof(kd_threadmap);
2860 			void *pad_buf = kalloc_data(pad_size, Z_WAITOK | Z_ZERO);
2861 			if (!pad_buf) {
2862 				error = ENOMEM;
2863 				break;
2864 			}
2865 			error = _send_data(dest, pad_buf, pad_size);
2866 			if (error != 0) {
2867 				break;
2868 			}
2869 		}
2870 
2871 		pad_size = PAGE_SIZE - (dest->kdd_cur_offset & PAGE_MASK);
2872 		if (pad_size) {
2873 			void *pad_buf = kalloc_data(pad_size, Z_WAITOK | Z_ZERO);
2874 			if (!pad_buf) {
2875 				error = ENOMEM;
2876 				break;
2877 			}
2878 
2879 			/*
2880 			 * Embed the CPU map in the padding bytes -- old code will skip it,
2881 			 * while newer code knows it's there.
2882 			 */
2883 			size_t temp = pad_size;
2884 			(void)_copy_cpu_map(RAW_VERSION1, &pad_buf, &temp);
2885 			error = _send_data(dest, pad_buf, pad_size);
2886 			kfree_data(pad_buf, pad_size);
2887 			if (error != 0) {
2888 				break;
2889 			}
2890 		}
2891 	} while (false);
2892 
2893 	return error;
2894 }
2895 
2896 #pragma mark - User space interface
2897 
/*
 * Execute one KERN_KDEBUG sysctl operation.  Called with the ktrace lock
 * held by kdebug_sysctl().
 *
 * op:    KERN_KD* operation selector.
 * value: per-operation scalar argument (flag bits, buffer count, or fd).
 * where: user-space buffer address for copyin/copyout operations.
 * sizep: in/out size of the user buffer; updated with bytes produced.
 *
 * Returns 0 on success or an errno.
 */
static int
_kd_sysctl_internal(int op, int value, user_addr_t where, size_t *sizep)
{
	size_t size = *sizep;
	kd_regtype kd_Reg;

	/* Read-only ops only need read permission; all others reconfigure ktrace. */
	bool read_only = (op == KERN_KDGETBUF || op == KERN_KDREADCURTHRMAP);
	int perm_error = read_only ? ktrace_read_check() :
	    ktrace_configure(KTRACE_KDEBUG);
	if (perm_error != 0) {
		return perm_error;
	}

	switch (op) {
	case KERN_KDGETBUF:;
		/* Report the current buffer configuration and owning process. */
		pid_t owning_pid = ktrace_get_owning_pid();
		const kbufinfo_t info = {
			.nkdbufs = kd_buffer_trace.kdb_event_count,
			.nkdthreads = (int)MIN(kd_mapcount, INT_MAX),
			.nolog = kd_control_trace.kdc_emit == KDEMIT_DISABLE,
			.flags = kd_control_trace.kdc_flags | kd_control_trace.kdc_live_flags | KDBG_LP64,
			.bufid = owning_pid ?: -1,
		};
		/* Allow short reads so older user space structs still work. */
		size = MIN(size, sizeof(info));
		return copyout(&info, where, size);
	case KERN_KDREADCURTHRMAP:
		return kdbg_readcurthrmap(where, sizep);
	case KERN_KDEFLAGS:
		/* Set user-controllable flag bits only. */
		value &= KDBG_USERFLAGS;
		kd_control_trace.kdc_flags |= value;
		return 0;
	case KERN_KDDFLAGS:
		/* Clear user-controllable flag bits only. */
		value &= KDBG_USERFLAGS;
		kd_control_trace.kdc_flags &= ~value;
		return 0;
	case KERN_KDENABLE:
		if (value) {
			/* Buffers must be initialized and the mode must be valid. */
			if (!(kd_control_trace.kdc_flags & KDBG_BUFINIT) ||
			    !(value == KDEBUG_ENABLE_TRACE || value == KDEBUG_ENABLE_PPT)) {
				return EINVAL;
			}
			_threadmap_init();

			kdbg_set_tracing_enabled(true, value);
		} else {
			/* Disabling when already disabled is a no-op. */
			if (!kdebug_enable) {
				return 0;
			}

			kernel_debug_disable();
		}
		return 0;
	case KERN_KDSETBUF:
		kdbg_set_nkdbufs_trace(value);
		return 0;
	case KERN_KDSETUP:
		return kdbg_reinit(EXTRA_COPROC_COUNT);
	case KERN_KDREMOVE:
		ktrace_reset(KTRACE_KDEBUG);
		return 0;
	case KERN_KDSETREG:
		if (size < sizeof(kd_regtype)) {
			return EINVAL;
		}
		if (copyin(where, &kd_Reg, sizeof(kd_regtype))) {
			return EINVAL;
		}
		return kdbg_setreg(&kd_Reg);
	case KERN_KDGETREG:
		/* Reading the register ranges back is unsupported. */
		return EINVAL;
	case KERN_KDREADTR: {
		/* Copy merged trace events directly out to user space. */
		struct kd_dest copy_dest = kd_dest_copyout(where, *sizep);
		size_t event_count = *sizep / sizeof(kd_buf);
		size_t events_written = 0;
		int error = _read_merged_trace_events(&copy_dest, event_count, &events_written);
		*sizep = events_written;
		return error;
	}
	case KERN_KDWRITETR:
	case KERN_KDWRITETR_V3:
	case KERN_KDWRITEMAP: {
		/* For these ops, `value` carries a file descriptor to write to. */
		struct kd_dest write_dest = {};
		int fd = value;

		if (op == KERN_KDWRITETR || op == KERN_KDWRITETR_V3) {
			(void)kdbg_wait(size);
			// Re-check whether this process can configure ktrace, since waiting
			// will drop the ktrace lock.
			int no_longer_owner_error = ktrace_configure(KTRACE_KDEBUG);
			if (no_longer_owner_error != 0) {
				return no_longer_owner_error;
			}
		}

		struct  fileproc *fp;
		int error = kd_dest_init_write(&write_dest, fd, &fp);
		if (error != 0) {
			return error;
		}
		if (op == KERN_KDWRITETR || op == KERN_KDWRITETR_V3) {
			size_t event_count = kd_buffer_trace.kdb_event_count;
			size_t events_written = 0;
			/* V3 wraps events in chunk headers for newer tools. */
			if (op == KERN_KDWRITETR_V3) {
				write_dest.kdd_chunk_format = true;
			}

			KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_START);
			error = _read_merged_trace_events(&write_dest, event_count,
			    &events_written);
			KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_END, events_written);
			*sizep = events_written;
		} else {
			error = kdbg_write_thread_map(&write_dest);
			if (error == 0) {
				*sizep = kd_mapcount * sizeof(kd_threadmap);
			}
		}
		kd_dest_finish_write(&write_dest, fp, fd);
		return error;
	}
	case KERN_KDBUFWAIT:
		/* Block until the buffers reach the requested fill level. */
		*sizep = kdbg_wait(size);
		return 0;
	case KERN_KDPIDTR:
		if (size < sizeof(kd_regtype)) {
			return EINVAL;
		}
		if (copyin(where, &kd_Reg, sizeof(kd_regtype))) {
			return EINVAL;
		}
		return kdbg_setpid(&kd_Reg);
	case KERN_KDPIDEX:
		if (size < sizeof(kd_regtype)) {
			return EINVAL;
		}
		if (copyin(where, &kd_Reg, sizeof(kd_regtype))) {
			return EINVAL;
		}
		return kdbg_setpidex(&kd_Reg);
	case KERN_KDCPUMAP:
		return _copyout_cpu_map(RAW_VERSION1, where, sizep);
	case KERN_KDCPUMAP_EXT:
		return _copyout_cpu_map(1, where, sizep);
	case KERN_KDTHRMAP:
		return kdbg_copyout_thread_map(where, sizep);
	case KERN_KDSET_TYPEFILTER:
		return kdbg_copyin_typefilter(where, size);
	case KERN_KDSET_EDM:
		return _copyin_event_disable_mask(where, size);
	case KERN_KDGET_EDM:
		return _copyout_event_disable_mask(where, size);
#if DEVELOPMENT || DEBUG
	case KERN_KDTEST:
		return kdbg_test(size);
#endif // DEVELOPMENT || DEBUG

	default:
		return ENOTSUP;
	}
}
3058 
3059 static int
3060 kdebug_sysctl SYSCTL_HANDLER_ARGS
3061 {
3062 	int *names = arg1;
3063 	int name_count = arg2;
3064 	user_addr_t udst = req->oldptr;
3065 	size_t *usize = &req->oldlen;
3066 	int value = 0;
3067 
3068 	if (name_count == 0) {
3069 		return ENOTSUP;
3070 	}
3071 
3072 	int op = names[0];
3073 
3074 	// Some operations have an argument stuffed into the next OID argument.
3075 	switch (op) {
3076 	case KERN_KDWRITETR:
3077 	case KERN_KDWRITETR_V3:
3078 	case KERN_KDWRITEMAP:
3079 	case KERN_KDEFLAGS:
3080 	case KERN_KDDFLAGS:
3081 	case KERN_KDENABLE:
3082 	case KERN_KDSETBUF:
3083 		if (name_count < 2) {
3084 			return EINVAL;
3085 		}
3086 		value = names[1];
3087 		break;
3088 	default:
3089 		break;
3090 	}
3091 
3092 	ktrace_lock();
3093 	int ret = _kd_sysctl_internal(op, value, udst, usize);
3094 	ktrace_unlock();
3095 	if (0 == ret) {
3096 		req->oldidx += req->oldlen;
3097 	}
3098 	return ret;
3099 }
/* kern.kdebug: node-style sysctl routing all KERN_KD* operations. */
SYSCTL_PROC(_kern, KERN_KDEBUG, kdebug,
    CTLTYPE_NODE | CTLFLAG_RD | CTLFLAG_LOCKED, 0, 0, kdebug_sysctl, NULL, "");
3102 
3103 #pragma mark - Tests
3104 
3105 #if DEVELOPMENT || DEBUG
3106 
/* Coprocessor IDs assigned by the test registration flavors; 0 = unregistered. */
static int test_coproc = 0;
static int sync_flush_coproc = 0;

/* Debug IDs emitted by the test cases below. */
#define KDEBUG_TEST_CODE(code) BSDDBG_CODE(DBG_BSD_KDEBUG_TEST, (code))
3111 
3112 /*
3113  * A test IOP for the SYNC_FLUSH callback.
3114  */
3115 
3116 static void
sync_flush_callback(void * __unused context,kd_callback_type reason,void * __unused arg)3117 sync_flush_callback(void * __unused context, kd_callback_type reason,
3118     void * __unused arg)
3119 {
3120 	assert(sync_flush_coproc > 0);
3121 
3122 	if (reason == KD_CALLBACK_SYNC_FLUSH) {
3123 		kernel_debug_enter(sync_flush_coproc, KDEBUG_TEST_CODE(0xff),
3124 		    kdebug_timestamp(), 0, 0, 0, 0, 0);
3125 	}
3126 }
3127 
/* Registration record for the test sync-flush IOP. */
static struct kd_callback sync_flush_kdcb = {
	.func = sync_flush_callback,
	.iop_name = "test_sf",
};

/* Sentinel context pointer used to verify callback plumbing. */
#define TEST_COPROC_CTX 0xabadcafe
3134 
/*
 * Callback for the test coprocessor: only checks that the context pointer
 * registered with kdebug_register_coproc() is passed back intact.
 */
static void
test_coproc_cb(__assert_only void *context, kd_callback_type __unused reason,
    void * __unused arg)
{
	assert((uintptr_t)context == TEST_COPROC_CTX);
}
3141 
/*
 * Handler for the KERN_KDTEST sysctl (DEVELOPMENT/DEBUG kernels only).
 * Each `flavor` exercises a different tracing path: the logging macros,
 * coprocessor timestamp validation, or test IOP/coprocessor registration.
 * Called with the ktrace lock held; returns 0 or ENOTSUP for an unknown
 * flavor.
 */
static int
kdbg_test(size_t flavor)
{
	int code = 0;
	int dummy_iop = 0;

	switch (flavor) {
	case KDTEST_KERNEL_MACROS:
		/* try each macro */
		KDBG(KDEBUG_TEST_CODE(code)); code++;
		KDBG(KDEBUG_TEST_CODE(code), 1); code++;
		KDBG(KDEBUG_TEST_CODE(code), 1, 2); code++;
		KDBG(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
		KDBG(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;

		KDBG_RELEASE(KDEBUG_TEST_CODE(code)); code++;
		KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1); code++;
		KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2); code++;
		KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
		KDBG_RELEASE(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;

		KDBG_FILTERED(KDEBUG_TEST_CODE(code)); code++;
		KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1); code++;
		KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2); code++;
		KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
		KDBG_FILTERED(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;

		KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code)); code++;
		KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1); code++;
		KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2); code++;
		KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
		KDBG_RELEASE_NOPROCFILT(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;

		KDBG_DEBUG(KDEBUG_TEST_CODE(code)); code++;
		KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1); code++;
		KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2); code++;
		KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2, 3); code++;
		KDBG_DEBUG(KDEBUG_TEST_CODE(code), 1, 2, 3, 4); code++;
		break;

	case KDTEST_OLD_TIMESTAMP:
		if (kd_control_trace.kdc_coprocs) {
			/* avoid the assertion in kernel_debug_enter for a valid IOP */
			dummy_iop = kd_control_trace.kdc_coprocs[0].cpu_id;
		}

		/* ensure old timestamps are not emitted from kernel_debug_enter */
		kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
		    100 /* very old timestamp */, 0, 0, 0, 0, 0);
		code++;
		kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
		    kdebug_timestamp(), 0, 0, 0, 0, 0);
		code++;
		break;

	case KDTEST_FUTURE_TIMESTAMP:
		if (kd_control_trace.kdc_coprocs) {
			dummy_iop = kd_control_trace.kdc_coprocs[0].cpu_id;
		}
		/* deliberately emit a timestamp far in the future */
		kernel_debug_enter(dummy_iop, KDEBUG_TEST_CODE(code),
		    kdebug_timestamp() * 2 /* !!! */, 0, 0, 0, 0, 0);
		break;

	case KDTEST_SETUP_IOP:
		if (!sync_flush_coproc) {
			/*
			 * Registration can block, so drop the ktrace lock and
			 * re-check under the lock afterwards in case another
			 * thread registered first.
			 */
			ktrace_unlock();
			int new_sync_flush_coproc = kernel_debug_register_callback(
				sync_flush_kdcb);
			assert(new_sync_flush_coproc > 0);
			ktrace_lock();
			if (!sync_flush_coproc) {
				sync_flush_coproc = new_sync_flush_coproc;
			}
		}
		break;

	case KDTEST_SETUP_COPROCESSOR:
		if (!test_coproc) {
			/* Same lock-drop/re-check pattern as KDTEST_SETUP_IOP. */
			ktrace_unlock();
			int new_test_coproc = kdebug_register_coproc("test_coproc",
			    KDCP_CONTINUOUS_TIME, test_coproc_cb, (void *)TEST_COPROC_CTX);
			assert(new_test_coproc > 0);
			ktrace_lock();
			if (!test_coproc) {
				test_coproc = new_test_coproc;
			}
		}
		break;

	case KDTEST_ABSOLUTE_TIMESTAMP:;
		/* emit an absolute-time event, stashing the time in the args */
		uint64_t atime = mach_absolute_time();
		kernel_debug_enter(sync_flush_coproc, KDEBUG_TEST_CODE(0),
		    atime, (uintptr_t)atime, (uintptr_t)(atime >> 32), 0, 0, 0);
		break;

	case KDTEST_CONTINUOUS_TIMESTAMP:;
		/* emit a continuous-time event via the test coprocessor */
		uint64_t ctime = mach_continuous_time();
		kernel_debug_enter(test_coproc, KDEBUG_TEST_CODE(1),
		    ctime, (uintptr_t)ctime, (uintptr_t)(ctime >> 32), 0, 0, 0);
		break;

	case KDTEST_PAST_EVENT:;
		/* an ancient timestamp followed by a current one */
		uint64_t old_time = 1;
		kernel_debug_enter(test_coproc, KDEBUG_TEST_CODE(1), old_time, 0, 0, 0,
		    0, 0);
		kernel_debug_enter(test_coproc, KDEBUG_TEST_CODE(1), kdebug_timestamp(),
		    0, 0, 0, 0, 0);
		break;

	default:
		return ENOTSUP;
	}

	return 0;
}
3257 
3258 #undef KDEBUG_TEST_CODE
3259 
3260 #endif /* DEVELOPMENT || DEBUG */
3261 
/*
 * MPSC daemon-queue handler that delivers deferred notifications to a
 * newly-registered coprocessor: the current typefilter (when typefilter
 * emission is active) and whether tracing is currently enabled.
 */
static void
_deferred_coproc_notify(mpsc_queue_chain_t e, mpsc_daemon_queue_t queue __unused)
{
	struct kd_coproc *coproc = mpsc_queue_element(e, struct kd_coproc, chain);
	if (kd_control_trace.kdc_emit == KDEMIT_TYPEFILTER) {
		coproc->callback.func(coproc->callback.context,
		    KD_CALLBACK_TYPEFILTER_CHANGED, kdbg_typefilter);
	}
	if (kdebug_enable) {
		/*
		 * NOTE(review): the typefilter pointer is also passed as the arg
		 * for KD_CALLBACK_KDEBUG_ENABLED -- confirm callbacks expect it
		 * here rather than NULL.
		 */
		coproc->callback.func(coproc->callback.context,
		    KD_CALLBACK_KDEBUG_ENABLED, kdbg_typefilter);
	}
}
3275 
/*
 * One-time kdebug initialization during BSD bootstrap: create the global
 * typefilter and its shared memory entry, set up the deferred coprocessor
 * notification queue, and start boot tracing with `n_events` buffers
 * filtered by `filter_desc` (may be empty; must not be NULL).
 */
void
kdebug_init(unsigned int n_events, char *filter_desc, enum kdebug_opts opts)
{
	assert(filter_desc != NULL);

	kdbg_typefilter = typefilter_create();
	assert(kdbg_typefilter != NULL);
	kdbg_typefilter_memory_entry = typefilter_create_memory_entry(kdbg_typefilter);
	assert(kdbg_typefilter_memory_entry != MACH_PORT_NULL);

	(void)mpsc_daemon_queue_init_with_thread_call(&_coproc_notify_queue,
	    _deferred_coproc_notify, THREAD_CALL_PRIORITY_KERNEL,
	    MPSC_DAEMON_INIT_NONE);

	kdebug_trace_start(n_events, filter_desc, opts);
}
3292 
3293 static void
kdbg_set_typefilter_string(const char * filter_desc)3294 kdbg_set_typefilter_string(const char *filter_desc)
3295 {
3296 	char *end = NULL;
3297 
3298 	ktrace_assert_lock_held();
3299 
3300 	assert(filter_desc != NULL);
3301 
3302 	typefilter_reject_all(kdbg_typefilter);
3303 	typefilter_allow_class(kdbg_typefilter, DBG_TRACE);
3304 
3305 	/* if the filter description starts with a number, assume it's a csc */
3306 	if (filter_desc[0] >= '0' && filter_desc[0] <= '9') {
3307 		unsigned long csc = strtoul(filter_desc, NULL, 0);
3308 		if (filter_desc != end && csc <= KDBG_CSC_MAX) {
3309 			typefilter_allow_csc(kdbg_typefilter, (uint16_t)csc);
3310 		}
3311 		return;
3312 	}
3313 
3314 	while (filter_desc[0] != '\0') {
3315 		unsigned long allow_value;
3316 
3317 		char filter_type = filter_desc[0];
3318 		if (filter_type != 'C' && filter_type != 'S') {
3319 			printf("kdebug: unexpected filter type `%c'\n", filter_type);
3320 			return;
3321 		}
3322 		filter_desc++;
3323 
3324 		allow_value = strtoul(filter_desc, &end, 0);
3325 		if (filter_desc == end) {
3326 			printf("kdebug: cannot parse `%s' as integer\n", filter_desc);
3327 			return;
3328 		}
3329 
3330 		switch (filter_type) {
3331 		case 'C':
3332 			if (allow_value > KDBG_CLASS_MAX) {
3333 				printf("kdebug: class 0x%lx is invalid\n", allow_value);
3334 				return;
3335 			}
3336 			printf("kdebug: C 0x%lx\n", allow_value);
3337 			typefilter_allow_class(kdbg_typefilter, (uint8_t)allow_value);
3338 			break;
3339 		case 'S':
3340 			if (allow_value > KDBG_CSC_MAX) {
3341 				printf("kdebug: class-subclass 0x%lx is invalid\n", allow_value);
3342 				return;
3343 			}
3344 			printf("kdebug: S 0x%lx\n", allow_value);
3345 			typefilter_allow_csc(kdbg_typefilter, (uint16_t)allow_value);
3346 			break;
3347 		default:
3348 			__builtin_unreachable();
3349 		}
3350 
3351 		/* advance to next filter entry */
3352 		filter_desc = end;
3353 		if (filter_desc[0] == ',') {
3354 			filter_desc++;
3355 		}
3356 	}
3357 }
3358 
3359 uint64_t
kdebug_wake(void)3360 kdebug_wake(void)
3361 {
3362 	if (!wake_nkdbufs) {
3363 		return 0;
3364 	}
3365 	uint64_t start = mach_absolute_time();
3366 	kdebug_trace_start(wake_nkdbufs, NULL, trace_wrap ? KDOPT_WRAPPING : 0);
3367 	return mach_absolute_time() - start;
3368 }
3369 
/*
 * This function is meant to be called from the bootstrap thread or kdebug_wake.
 *
 * Allocates `n_events` trace buffers, optionally applies a typefilter from
 * `filter_desc`, and enables tracing.  With KDOPT_ATBOOT, the static early
 * event buffer is flushed into the real buffers; without KDOPT_WRAPPING,
 * the buffers are configured not to wrap so the earliest events survive.
 */
void
kdebug_trace_start(unsigned int n_events, const char *filter_desc,
    enum kdebug_opts opts)
{
	if (!n_events) {
		/* nothing to trace into -- early tracing is finished */
		kd_early_done = true;
		return;
	}

	ktrace_start_single_threaded();

	ktrace_kernel_configure(KTRACE_KDEBUG);

	kdbg_set_nkdbufs_trace(n_events);

	kernel_debug_string_early("start_kern_tracing");

	int error = kdbg_reinit(EXTRA_COPROC_COUNT_BOOT);
	if (error != 0) {
		printf("kdebug: allocation failed, kernel tracing not started: %d\n",
		    error);
		kd_early_done = true;
		goto out;
	}

	/*
	 * Wrapping is disabled because boot and wake tracing is interested in
	 * the earliest events, at the expense of later ones.
	 */
	if ((opts & KDOPT_WRAPPING) == 0) {
		kd_control_trace.kdc_flags |= KDBG_NOWRAP;
	}

	if (filter_desc && filter_desc[0] != '\0') {
		kdbg_set_typefilter_string(filter_desc);
		kdbg_enable_typefilter();
	}

	/*
	 * Hold off interrupts between getting a thread map and enabling trace
	 * and until the early traces are recorded.
	 */
	bool s = ml_set_interrupts_enabled(false);

	if (!(opts & KDOPT_ATBOOT)) {
		_threadmap_init();
	}

	kdbg_set_tracing_enabled(true, KDEBUG_ENABLE_TRACE);

	if ((opts & KDOPT_ATBOOT)) {
		/*
		 * Transfer all very early events from the static buffer into the real
		 * buffers.
		 */
		kernel_debug_early_end();
	}

	ml_set_interrupts_enabled(s);

	printf("kernel tracing started with %u events, filter = %s\n", n_events,
	    filter_desc ?: "none");

out:
	ktrace_end_single_threaded();
}
3439 
/*
 * Dump the current trace buffers to `filename` (e.g. at shutdown or on
 * demand).  Tracing is disabled while writing; when `reenable` is true the
 * previous enable state is restored afterwards.  If another process owns
 * ktrace, tracing is only disabled (to prevent wrapping) and nothing is
 * written.
 */
void
kdbg_dump_trace_to_file(const char *filename, bool reenable)
{
	vfs_context_t ctx;
	vnode_t vp;
	int ret;
	int reenable_trace = 0;

	ktrace_lock();

	if (!(kdebug_enable & KDEBUG_ENABLE_TRACE)) {
		goto out;
	}

	if (ktrace_get_owning_pid() != 0) {
		/*
		 * Another process owns ktrace and is still active, disable tracing to
		 * prevent wrapping.
		 */
		kdebug_enable = 0;
		kd_control_trace.enabled = 0;
		commpage_update_kdebug_state();
		goto out;
	}

	KDBG_RELEASE(TRACE_WRITING_EVENTS | DBG_FUNC_START);

	/* remember the enable bits so they can be restored at `out` */
	reenable_trace = reenable ? kdebug_enable : 0;
	kdebug_enable = 0;
	kd_control_trace.enabled = 0;
	commpage_update_kdebug_state();

	ctx = vfs_context_kernel();
	if (vnode_open(filename, (O_CREAT | FWRITE | O_NOFOLLOW), 0600, 0, &vp, ctx)) {
		goto out;
	}
	struct kd_dest file_dest = {
		.kdd_kind = KD_DEST_VFS,
		.kdd_vnode = vp,
		.kdd_vfs_ctx = *ctx,
	};

	/* best effort -- the thread map write result is intentionally ignored */
	kdbg_write_thread_map(&file_dest);

	size_t events_written = 0;
	ret = _read_merged_trace_events(&file_dest, kd_buffer_trace.kdb_event_count,
	    &events_written);
	if (ret) {
		goto out_close;
	}

	/*
	 * Wait to synchronize the file to capture the I/O in the
	 * TRACE_WRITING_EVENTS interval.
	 */
	ret = VNOP_FSYNC(vp, MNT_WAIT, ctx);
	if (ret == KERN_SUCCESS) {
		ret = VNOP_IOCTL(vp, F_FULLFSYNC, (caddr_t)NULL, 0, ctx);
	}

	/*
	 * Balance the starting TRACE_WRITING_EVENTS tracepoint manually.
	 */
	kd_buf end_event = {
		.debugid = TRACE_WRITING_EVENTS | DBG_FUNC_END,
		.arg1 = events_written,
		.arg2 = ret,
		.arg5 = (kd_buf_argtype)thread_tid(current_thread()),
		.timestamp = kdebug_timestamp(),
		.cpuid = cpu_number(),
	};
	/* this is best effort -- ignore any errors */
	(void)_send_data_vfs(&file_dest, &end_event, sizeof(kd_buf));

out_close:
	vnode_close(vp, FWRITE, ctx);
	sync(current_proc(), (void *)NULL, (int *)NULL);

out:
	if (reenable_trace != 0) {
		kdebug_enable = reenable_trace;
		kd_control_trace.enabled = 1;
		commpage_update_kdebug_state();
	}

	ktrace_unlock();
}
3527 
/* kern.kdbg: parent node for miscellaneous kdebug controls. */
SYSCTL_NODE(_kern, OID_AUTO, kdbg, CTLFLAG_RD | CTLFLAG_LOCKED, 0,
    "kdbg");

/* kern.kdbg.debug: read/write toggle for verbose kdebug diagnostics. */
SYSCTL_INT(_kern_kdbg, OID_AUTO, debug,
    CTLFLAG_RW | CTLFLAG_LOCKED,
    &kdbg_debug, 0, "Set kdebug debug mode");

/* kern.kdbg.oldest_time: read-only oldest timestamp still in the buffers. */
SYSCTL_QUAD(_kern_kdbg, OID_AUTO, oldest_time,
    CTLTYPE_QUAD | CTLFLAG_RD | CTLFLAG_LOCKED,
    &kd_control_trace.kdc_oldest_time,
    "Find the oldest timestamp still in trace");
3539