xref: /xnu-11417.140.69/osfmk/kern/trap_telemetry.c (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2024 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #include <kern/trap_telemetry.h>
30 #include <libkern/coreanalytics/coreanalytics.h>
31 #include <kern/percpu.h>
32 #include <libkern/tree.h>
33 #include <kern/locks.h>
34 #include <kern/thread_call.h>
35 #include <kern/kalloc.h>
36 #include <kern/cpu_data.h>
37 #include <kern/telemetry.h>
38 #include <kern/assert.h>
39 #include <kern/backtrace.h>
40 #include <machine/machine_routines.h>
41 #include <libkern/OSKextLibPrivate.h>
42 #include <libkern/kernel_mach_header.h>
43 
44 #define TAG "[trap_telemetry] "
45 
46 /* ~* Module Configuration *~ */
47 
48 /**
49  * Maximum number of backtrace frames to attempt to report.
50  *
51  * Some reporting destinations may use fewer frames than this due to
52  * encoding/space restrictions.
53  */
54 #define TRAP_TELEMETRY_BT_FRAMES  (15)
55 
56 /** Static length of the various CA telemetry events' backtrace strings */
57 #define TRAP_TELEMETRY_BT_STR_LEN CA_UBSANBUF_LEN
58 
59 /**
60  * Entry count of the record submission buffer (RSB).
61  *
62  * Larger sizes support a higher event volume and can help avoid dropping
63  * events under load.
64  */
65 #define RECORD_SUBMISSION_BUFFER_LENGTH (16)
66 
67 /** Number of last events per-CPU to remember and reject. */
68 #define DEBOUNCE_RECORD_COUNT (2)
69 
70 /**
71  * When true, trap telemetry will not report events to CoreAnalytics.
72  *
73  * Local reporting (via trap_telemetry_dump_event) is not impacted.
74  */
75 static TUNABLE(bool, trap_telemetry_disable_ca, "trap_telemetry_disable_ca", false);
76 
77 /**
78  * Disable all trap telemetry reporting (including local reporting)
79  */
80 static TUNABLE(bool, trap_telemetry_disable_all, "trap_telemetry_disable_all", false);
81 
82 /**
83  * Print matching events to the console. Set to -1 to disable.
84  * Setting a type while leaving the code disabled matches all codes of that type.
85  */
86 static TUNABLE(uint32_t, trap_telemetry_dump_type, "trap_telemetry_dump_type",
87     -1);
88 static TUNABLE(uint64_t, trap_telemetry_dump_code, "trap_telemetry_dump_code",
89     -1);
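/*
 * For example, the following boot-args are a typical way to use these tunables
 * (the numeric type/code values below are placeholders for illustration; valid
 * values come from trap_telemetry_type_t and the per-type trap codes):
 *
 *   trap_telemetry_dump_type=2
 *       -> dump every trap of type 2, regardless of code
 *   trap_telemetry_dump_type=2 trap_telemetry_dump_code=0x11
 *       -> dump only traps of type 2 with code 0x11
 */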
90 
91 /* ~* Data Structures *~ */
92 
93 typedef struct match_record {
94 	/** Slid address at which the exception was thrown */
95 	uintptr_t fault_pc;
96 
97 	/** The trap type or "class" for the record. */
98 	trap_telemetry_type_t trap_type;
99 
100 	/** The trap code disambiguates traps within a class. */
101 	uint64_t trap_code;
102 } match_record_s;
103 
104 typedef struct rsb_entry {
105 	match_record_s record;
106 	trap_telemetry_options_s options;
107 	size_t bt_frames_count;
108 	uintptr_t bt_frames[TRAP_TELEMETRY_BT_FRAMES];
109 } rsb_entry_s;
110 
111 typedef struct trap_telemetry_tree_entry {
112 	SPLAY_ENTRY(trap_telemetry_tree_entry) link;
113 	match_record_s record;
114 } trap_telemetry_tree_entry_s;
115 
116 typedef struct trap_debounce_buffer {
117 	/**
118 	 * Storage array for trap records used to debounce.
119 	 *
120 	 * We don't have valid bits for entries but rather use zero to implicitly
121 	 * indicate an invalid entry (as they should never naturally match any real
122 	 * trap).
123 	 */
124 	match_record_s records[DEBOUNCE_RECORD_COUNT];
125 
126 	/** The index of the entry to replace next (the oldest entry is replaced first) */
127 	size_t tail;
128 } trap_debounce_buffer_s;
129 
130 /* ~* Core Analytics *~ */
131 CA_EVENT(kernel_breakpoint_event,
132     CA_INT, brk_type,
133     CA_INT, brk_code,
134     CA_INT, faulting_address,
135     CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
136     CA_STATIC_STRING(CA_UUID_LEN), uuid);
137 
138 CA_EVENT(trap_telemetry_internal,
139     CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
140     CA_INT, trap_code,
141     CA_INT, trap_offset,
142     CA_INT, trap_type,
143     CA_STATIC_STRING(CA_UUID_LEN), trap_uuid);
144 
145 /* ~* Splay tree *~ */
146 static int
147 match_record_compare(match_record_s *r1,
148     match_record_s *r2)
149 {
150 	if (r1->fault_pc < r2->fault_pc) {
151 		return 1;
152 	} else if (r1->fault_pc > r2->fault_pc) {
153 		return -1;
154 	}
155 
156 	if (r1->trap_type < r2->trap_type) {
157 		return 1;
158 	} else if (r1->trap_type > r2->trap_type) {
159 		return -1;
160 	}
161 
162 	if (r1->trap_code < r2->trap_code) {
163 		return 1;
164 	} else if (r1->trap_code > r2->trap_code) {
165 		return -1;
166 	}
167 
168 	/* Records match */
169 	return 0;
170 }
171 
172 static int
173 trap_telemetry_tree_entry_compare(trap_telemetry_tree_entry_s *r1,
174     trap_telemetry_tree_entry_s *r2)
175 {
176 	return match_record_compare(&r1->record, &r2->record);
177 }
178 
179 SPLAY_HEAD(trap_telemetry_tree, trap_telemetry_tree_entry);
180 /* These functions generated by SPLAY_PROTOTYPE but are currently unused */
181 __unused static struct trap_telemetry_tree_entry *
182 trap_telemetry_tree_SPLAY_NEXT(struct trap_telemetry_tree *head,
183     struct trap_telemetry_tree_entry *elm);
184 __unused static struct trap_telemetry_tree_entry *
185 trap_telemetry_tree_SPLAY_SEARCH(struct trap_telemetry_tree *head,
186     struct trap_telemetry_tree_entry *elm);
187 __unused static struct trap_telemetry_tree_entry *
188 trap_telemetry_tree_SPLAY_MIN_MAX(struct trap_telemetry_tree *head, int val);
189 SPLAY_PROTOTYPE(trap_telemetry_tree,
190     trap_telemetry_tree_entry,
191     link,
192     trap_telemetry_tree_entry_compare);
193 SPLAY_GENERATE(trap_telemetry_tree,
194     trap_telemetry_tree_entry,
195     link,
196     trap_telemetry_tree_entry_compare);
197 
198 /* ~* Globals *~ */
199 /* Lock which protects the event submission queue */
200 static LCK_GRP_DECLARE(trap_telemetry_lock_grp, "trap_telemetry_lock");
201 static LCK_SPIN_DECLARE(trap_telemetry_lock, &trap_telemetry_lock_grp);
202 
203 /*
204  * Since traps are, naturally, caught in an exception context, it is not safe to
205  * allocate. To solve this, we use a short submission ring buffer which collects
206  * records for processing on a submission thread (which can allocate).
207  *
208  * This ring buffer and all its associated control fields are locked by
209  * TRAP_TELEMETRY_LOCK.
210  */
211 static rsb_entry_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH];
212 static size_t rsb_rd_idx;
213 static size_t rsb_wr_idx;
214 static size_t rsb_count;
215 static bool rsb_is_draining;
216 
217 /**
218  * For deduplication, we store hit records in a splay tree.
219  * We use a splay here for performance reasons since traps tend to exhibit a
220  * degree of temporal locality.
221  */
222 static struct trap_telemetry_tree telemetry_splay_tree;
223 
224 /**
225  * Flag indicating whether this CPU is currently trying to acquire the telemetry
226  * lock or has already acquired the lock.
227  * This is used as a deadlock avoidance mechanism.
228  */
229 static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked);
230 
231 /**
232  * In order to avoid reporting the same event many times in quick succession
233  * (especially when report_once_per_site=false) and overwhelming both the trap
234  * telemetry module and CoreAnalytics, we "debounce" all events on a per-CPU
235  * basis. This is done through a small buffer which tracks the last
236  * DEBOUNCE_RECORD_COUNT trap records.
237  */
238 static trap_debounce_buffer_s PERCPU_DATA(per_cpu_trap_debounce_buffer);
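/*
 * As a concrete sketch of the debounce behavior with DEBOUNCE_RECORD_COUNT == 2:
 * if one CPU reports the records A, B, A, B in quick succession, only the first
 * A and B are forwarded, because the repeats still sit in that CPU's buffer.
 * Reporting A, B, C, A forwards all four, since inserting C evicts the oldest
 * entry (A) before A recurs.
 */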
239 
240 /**
241  * Thread which is responsible for clearing the submission buffer by submitting
242  * to CoreAnalytics and the local tree.
243  */
244 static struct thread_call *drain_record_submission_buffer_callout;
245 
246 #if DEVELOPMENT || DEBUG
247 /**
248  * sysctl debug.trap_telemetry_reported_events
249  *
250  * Counts the number of events which were successfully reported (either locally
251  * or to CoreAnalytics). This does not include events which were ignored,
252  * debounced, or discarded as a duplicate.
253  */
254 unsigned long trap_telemetry_reported_events = 0;
255 
256 /**
257  * sysctl debug.trap_telemetry_capacity_dropped_events
258  *
259  * Counts the number of events which, if not for the RSB being full, would have
260  * been reported successfully. Events in this count indicate telemetry loss.
261  */
262 unsigned long trap_telemetry_capacity_dropped_events = 0;
263 #endif /* DEVELOPMENT || DEBUG */
264 
265 /* ~* Implementation *~ */
266 
267 /**
268  * Try and acquire a spin lock in an interrupt-deadlock safe way.
269  *
270  * This function differs from the standard lck_spin_try_lock function in that it
271  * will block if the lock is expected to be acquired *eventually* but will not
272  * block if it detects that the lock will never be acquired (such as when the
273  * current CPU owns the lock, which can happen if a trap is taken while handling
274  * a telemetry operation under the lock).
275  */
276 static inline bool OS_WARN_RESULT
277 safe_telemetry_lock_try_lock(void)
278 {
279 	uint8_t *telemetry_lock_blocked = NULL;
280 
281 	/*
282 	 * Disable preemption to ensure that our block signal always corresponds
283 	 * to the CPU we're actually running on.
284 	 *
285 	 * If we didn't disable preemption, there is a case where we may mark that
286 	 * we are trying to acquire the lock on core A, get approved, get preempted,
287 	 * get rescheduled on core B, and then take the lock there. If we then take
288 	 * another exception on core B while handling the original exception (ex. we
289 	 * take an IRQ and a telemetry exception is generated there), we may
290 	 * re-enter on core B, (incorrectly) see that we are not blocked, try to
291 	 * acquire the lock, and ultimately deadlock.
292 	 */
293 	disable_preemption();
294 
295 	/*
296 	 * Since we are preemption disabled, we'll get the desired behavior even if
297 	 * we take a telemetry trap in the middle of this sequence because the
298 	 * interrupting context will never return here while holding the telemetry
299 	 * lock.
300 	 */
301 	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
302 	if (*telemetry_lock_blocked) {
303 		/*
304 		 * This CPU has already acquired/is blocked on the telemetry lock.
305 		 * Attempting to acquire again on this CPU will deadlock. Refuse the
306 		 * operation.
307 		 */
308 		enable_preemption();
309 		return false;
310 	}
311 
312 	*telemetry_lock_blocked = 1;
313 
314 	/* We've been approved to acquire the lock on this core! */
315 	lck_spin_lock(&trap_telemetry_lock);
316 	return true;
317 }
318 
319 /**
320  * Attempts to acquire the telemetry lock and panics if it cannot be acquired.
321  */
322 static void
323 safe_telemetry_lock_lock(void)
324 {
325 	if (!safe_telemetry_lock_try_lock()) {
326 		panic("Unexpectedly could not acquire telemetry lock "
327 		    "(nested acquire will deadlock)");
328 	}
329 }
330 
331 /**
332  * Unlock the telemetry lock after it was locked with safe_telemetry_lock_try_lock
333  */
334 static inline void
335 safe_telemetry_lock_unlock(void)
336 {
337 	uint8_t *telemetry_lock_blocked = NULL;
338 
339 	lck_spin_unlock(&trap_telemetry_lock);
340 
341 	/*
342 	 * Clear the block only AFTER having dropped the lock so that we can't
343 	 * hit a really narrow deadlock race where we get interrupted between
344 	 * clearing the block and dropping the lock.
345 	 */
346 	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
347 	os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed);
348 
349 	/* Finally, reenable preemption as this thread is now safe to move */
350 	enable_preemption();
351 }
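/*
 * A minimal usage sketch of the wrappers above: trap-time paths take the try
 * variant and simply drop their event if the lock is unavailable (see
 * rsb_enqueue_if_needed below), while the drain thread uses the panicking
 * variant because a failed acquire there indicates a bug:
 *
 *     if (safe_telemetry_lock_try_lock()) {
 *         // ... enqueue into the record submission buffer ...
 *         safe_telemetry_lock_unlock();
 *     }
 */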
352 
353 /**
354  * Enqueue RSB_E into the record submission buffer.
355  * Returns true if successful, false otherwise.
356  * TRAP_TELEMETRY_LOCK must be held during this operation.
357  */
358 static bool
359 rsb_enqueue_locked(rsb_entry_s *rsb_e)
360 {
361 	if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) {
362 		/* We're full. */
363 		return false;
364 	}
365 
366 	/* Write the new entry at the write head */
367 	rsb_entry_s *dst = record_submission_buffer + rsb_wr_idx;
368 	*dst = *rsb_e;
369 
370 	/* Update pointers */
371 	rsb_count += 1;
372 	rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH;
373 
374 	return true;
375 }
376 
377 /**
378  * Enter RECORD into this CPU's debounce buffer, thereby preventing it from
379  * being reported again until it falls off. Records are removed from the
380  * debounce buffer automatically as newer records are inserted.
381  */
382 static bool
383 trap_debounce_buffer_enter(match_record_s *record)
384 {
385 	trap_debounce_buffer_s *debounce = NULL;
386 	bool match = false;
387 
388 	/*
389 	 * Since we don't lock the debounce buffers and instead rely on them being
390 	 * per-CPU for synchronization, we need to disable preemption to ensure that
391 	 * we only access the correct debounce buffer.
392 	 */
393 	disable_preemption();
394 	debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);
395 
396 	/*
397 	 * Enter the record.
398 	 * We do this by overwriting the oldest entry, which naturally gives us a
399 	 * FIFO replacement policy.
400 	 */
401 	debounce->records[debounce->tail] = *record;
402 	debounce->tail = (debounce->tail + 1) % DEBOUNCE_RECORD_COUNT;
403 
404 	enable_preemption();
405 
406 	return match;
407 }
408 
409 
410 /**
411  * Search for RECORD in the per-CPU debounce buffer.
412  *
413  * This is useful for determining if a trap has triggered recently.
414  */
415 static bool
416 trap_debounce_buffer_has_match(match_record_s *record)
417 {
418 	trap_debounce_buffer_s *debounce = NULL;
419 	bool match = false;
420 
421 	disable_preemption();
422 	debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);
423 
424 	for (size_t i = 0; i < DEBOUNCE_RECORD_COUNT; i++) {
425 		if (match_record_compare(debounce->records + i, record) == 0) {
426 			match = true;
427 			break;
428 		}
429 	}
430 
431 	enable_preemption();
432 
433 	return match;
434 }
435 
436 /**
437  * Should the given trap be dumped to the console for debug?
438  */
439 static inline bool
440 should_dump_trap(
441 	trap_telemetry_type_t trap_type,
442 	uint64_t trap_code)
443 {
444 	if (trap_telemetry_dump_type == -1 /* type match disabled */ ||
445 	    trap_telemetry_dump_type != (uint32_t)trap_type) {
446 		/* No match on type */
447 		return false;
448 	}
449 
450 	if (trap_telemetry_dump_code != -1 /* code match is enabled */ &&
451 	    /* but it doesn't match the trap code */
452 	    trap_telemetry_dump_code != trap_code) {
453 		return false;
454 	}
455 
456 	/* Matching type and, if applicable, code. */
457 	return true;
458 }
459 
460 /**
461  * Get the UUID and __TEXT_EXEC based offset of ADDR into its respective binary
462  * image. The caller is not responsible for managing the UUID
463  * memory (i.e. it is not owned by the caller).
464  *
465  * Returns negative on error.
466  *
467  * Acquires a sleeping lock, do not call while interrupts are disabled.
468  */
469 static int
470 get_uuid_and_text_offset_for_addr(
471 	uintptr_t addr, uuid_t **uuid_out, uint64_t *offset_out)
472 {
473 	kernel_mach_header_t *mh = NULL;
474 	kernel_segment_command_t *seg_text = NULL;
475 	void *mh_uuid = NULL;
476 	unsigned long mh_uuid_len = 0;
477 #if __arm64__
478 	const char *text_segment_label = "__TEXT_EXEC";
479 #else
480 	const char *text_segment_label = "__TEXT";
481 #endif
482 
483 	if (!(mh = OSKextKextForAddress((void *)addr))) {
484 		return -1;
485 	}
486 
487 	if (!(seg_text = getsegbynamefromheader(mh, text_segment_label))) {
488 		return -2;
489 	}
490 
491 	if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) {
492 		return -3;
493 	}
494 
495 	if (mh_uuid_len != sizeof(**uuid_out)) {
496 		return -4;
497 	}
498 
499 	*uuid_out = (uuid_t *)(mh_uuid);
500 	*offset_out = addr - seg_text->vmaddr;
501 
502 	return 0;
503 }
504 
505 /**
506  * If it does not already exist, inserts UUID into UUID_CACHE (described by
507  * CACHE_LEN). In either case, return the index of the UUID in the cache through
508  * *IDX_OUT and set *IS_NEW_OUT to indicate whether UUID was newly inserted.
509  *
510  */
511 static void
512 uuid_cache_get_or_insert(uuid_t *uuid, uuid_t **uuid_cache, size_t cache_len,
513     uint32_t *idx_out, bool *is_new_out)
514 {
515 	for (uint32_t i = 0; i < cache_len; i++) {
516 		if (uuid_cache[i] == uuid) {
517 			/* Hit on existing entry */
518 			*idx_out = i;
519 			*is_new_out = false;
520 			return;
521 		} else if (uuid_cache[i] == NULL) {
522 			/*
523 			 * Reached the end of the valid entries without finding our UUID.
524 			 * Insert it now.
525 			 */
526 			uuid_cache[i] = uuid;
527 			*idx_out = i;
528 			*is_new_out = true;
529 			return;
530 		}
531 
532 		/* No match yet, but there might be more entries. Keep going. */
533 	}
534 
535 	/*
536 	 * We didn't find the UUID but we also couldn't insert it because we never
537 	 * found a free space. This shouldn't happen if the UUID cache is correctly
538 	 * sized.
539 	 */
540 	panic("Could not find UUID in cache but cache was full");
541 }
542 
543 /**
544  * Convert an array of backtrace addresses in FRAMES into an offset backtrace
545  * string in BUF.
546  *
547  * This backtrace scheme has records delimited by newline characters. Each
548  * record is either a backtrace entry or a UUID entry. A backtrace entry is
549  * identified by the presence of an `@` character in the record. Any other
550  * record is a UUID entry.
551  *
552  * Example:
553  *
554  * 14760@0\n
555  * 2B417DFA-7964-3EBF-97EE-FC94D26FFABD\n
556  * 9f18@1\n
557  * F9EFB7CA-8F23-3990-8E57-A7DAD698D494\n
558  * 87c974@2\n
559  * 8686ED81-CAA9-358D-B162-1F2F97334C65\n
560  * 87cce4@2\n
561  * 874f64@2\n
562  *
563  * Structurally, this example is equivalent to:
564  *
565  * <text offset>@<uuid entry idx=0>\n
566  * <uuid entry 0>\n
567  * <text offset>@<uuid entry idx=1>\n
568  * <uuid entry 1>\n
569  * <text offset>@<uuid entry idx=2>\n
570  * <uuid entry 2>\n
571  * <text offset>@<uuid entry idx=2>\n
572  * <text offset>@<uuid entry idx=2>\n
573  *
574  * The first record here is a backtrace entry. Backtrace entries encode program
575  * location as a hex offset into the __TEXT/__TEXT_EXEC segment of the enclosing
576  * binary. The enclosing binary is identified by a hex encoded, zero-indexed
577  * UUID entry ID which follows after the `@` in a backtrace entry.
578  *
579  * The second record is a UUID entry. UUID entries are simply records which
580  * contain nothing but the UUID. UUID entries are implicitly assigned IDs,
581  * starting from zero, in the order they appear in the record stream. Entries
582  * may be referenced before they are used.
583  * may be referenced before they are defined.
584  * Given a 256 byte buffer, we can fit up to ten backtrace entries (assuming
585  * each binary is no larger than 256MB and we have no more than four unique
586  * UUIDs in the backtrace).
587  *
588  * If the encoder runs out of space (for example, because we have more than four
589  * unique UUIDs), the later records will truncate abruptly. In order to provide
590  * as much information as possible, UUIDs are encoded immediately after they are
591  * used. This means that if the encoder does run out of space, all backtrace
592  * entries but the last will always decode correctly.
593  */
594 static void
595 backtrace_to_offset_bt_string(
596 	char *buf,
597 	size_t buf_len,
598 	const uintptr_t *frames,
599 	size_t frames_len)
600 {
601 	size_t written = 0;
602 	const size_t uuid_cache_count = TRAP_TELEMETRY_BT_FRAMES;
603 	/*
604 	 * The UUID cache relies on NULL entries to represent free slots, so clear
605 	 * it before use.
606 	 */
607 	uuid_t *uuid_cache[uuid_cache_count] = {0};
608 	assert(frames_len <= uuid_cache_count);
609 
610 	/* Add all frames and store unique UUIDs into the cache */
611 	for (size_t frame_i = 0; frame_i < frames_len; frame_i++) {
612 		uuid_t *uuid = NULL;
613 		uint64_t offset = 0;
614 
615 		if (get_uuid_and_text_offset_for_addr(
616 			    frames[frame_i], &uuid, &offset) == 0) {
617 			/* Success! Insert (or reuse) the UUID and then print the entry. */
618 			uint32_t uuid_i;
619 			bool is_new;
620 			uuid_cache_get_or_insert(
621 				uuid, uuid_cache, uuid_cache_count,
622 				&uuid_i, &is_new);
623 
624 			/* Write backtrace record */
625 			written += scnprintf(buf + written, buf_len - written,
626 			    "%llx@%x\n",
627 			    offset, uuid_i);
628 
629 			/* Write UUID record, if needed. */
630 			if (is_new) {
631 				uuid_string_t uuid_str;
632 				uuid_unparse(*uuid, uuid_str);
633 
634 				written += scnprintf(buf + written, buf_len - written,
635 				    "%s\n",
636 				    uuid_str);
637 			}
638 		} else {
639 			/*
640 			 * Could not find an image for the target?
641 			 * Just emit the offset into the executable region with an error
642 			 * UUID ref, as it's better than nothing.
643 			 */
644 			written += scnprintf(buf + written, buf_len - written,
645 			    "%lx@!\n",
646 			    frames[frame_i] - vm_kernel_stext);
647 		}
648 	}
649 }
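/*
 * A minimal user-space decoding sketch for the string format described above,
 * operating on a writable copy of the backtrace string (illustrative only;
 * uuid_for_index() and remember_uuid() are hypothetical helpers that maintain
 * the implicitly-indexed UUID table):
 *
 *     char *brk = NULL;
 *     for (char *line = strtok_r(buf, "\n", &brk); line != NULL;
 *         line = strtok_r(NULL, "\n", &brk)) {
 *         char *at = strchr(line, '@');
 *         if (at != NULL) {
 *             // backtrace entry: <hex text offset>@<hex uuid entry index>
 *             uint64_t offset = strtoull(line, NULL, 16);
 *             uint32_t uuid_idx = (uint32_t)strtoul(at + 1, NULL, 16);
 *             printf("%s + 0x%llx\n", uuid_for_index(uuid_idx),
 *                 (unsigned long long)offset);
 *         } else {
 *             // UUID entry: assign it the next implicit index
 *             remember_uuid(line);
 *         }
 *     }
 */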
650 
651 
652 /**
653  * Print RSB_E to the console in a human friendly way.
654  */
655 static void
656 rsb_entry_dump(rsb_entry_s *rsb_e)
657 {
658 	printf(TAG "Triggered trap at PC=0x%08lx "
659 	    "(type=%u, code=0x%04llx). Backtrace:\n",
660 	    rsb_e->record.fault_pc,
661 	    (uint32_t)rsb_e->record.trap_type, rsb_e->record.trap_code);
662 
663 	for (size_t frame_i = 0; frame_i < rsb_e->bt_frames_count; frame_i++) {
664 		printf(TAG "\t0x%08lx\n", rsb_e->bt_frames[frame_i]);
665 	}
666 }
667 
668 /**
669  * Submit RSB_E to CoreAnalytics (or another backing event provider as
670  * appropriate).
671  */
672 static void
673 rsb_entry_submit(rsb_entry_s *rsb_e)
674 {
675 	trap_telemetry_options_s options = rsb_e->options;
676 
677 	bool matched_dump_bootarg = should_dump_trap(
678 		rsb_e->record.trap_type, rsb_e->record.trap_code);
679 	if (matched_dump_bootarg) {
680 		rsb_entry_dump(rsb_e);
681 	}
682 
683 	ca_event_t ca_event = NULL;
684 	switch (options.telemetry_ca_event) {
685 	case TRAP_TELEMETRY_CA_EVENT_NONE: {
686 		/*
687 		 * Unless the event matches the dump boot-arg, we should never see
688 		 * unreported events in the backend. Instead, we expect these events
689 		 * to be dropped in the frontend without ever being submitted.
690 		 */
691 		assert(matched_dump_bootarg);
692 		break;
693 	}
694 
695 	case TRAP_TELEMETRY_CA_EVENT_KERNEL_BRK: {
696 		ca_event = CA_EVENT_ALLOCATE(kernel_breakpoint_event);
697 		CA_EVENT_TYPE(kernel_breakpoint_event) * event = ca_event->data;
698 
699 		/*
700 		 * The BRK telemetry format is somewhat less dense, so to avoid
701 		 * truncating (and to maintain the historical backtrace count) report
702 		 * five or fewer frames.
703 		 */
704 		uint32_t reported_bt_count =
705 		    MIN((uint32_t)rsb_e->bt_frames_count, 5);
706 		telemetry_backtrace_to_string(
707 			/* buf      */ event->backtrace,
708 			/* buf_size */ TRAP_TELEMETRY_BT_STR_LEN,
709 			/* tot      */ reported_bt_count,
710 			/* frames   */ rsb_e->bt_frames);
711 
712 		event->brk_type = (uint32_t)rsb_e->record.trap_type;
713 		event->brk_code = (uint64_t)rsb_e->record.trap_code;
714 		event->faulting_address = rsb_e->record.fault_pc - vm_kernel_stext;
715 		strlcpy(event->uuid, kernel_uuid_string, CA_UUID_LEN);
716 		break;
717 	}
718 
719 	case TRAP_TELEMETRY_CA_EVENT_INTERNAL: {
720 		int result;
721 		uuid_t *uuid = NULL;
722 		uint64_t offset = 0;
723 
724 		ca_event = CA_EVENT_ALLOCATE(trap_telemetry_internal);
725 		CA_EVENT_TYPE(trap_telemetry_internal) * event = ca_event->data;
726 
727 		backtrace_to_offset_bt_string(
728 			/* buf */ event->backtrace,
729 			/* buf_len */ TRAP_TELEMETRY_BT_STR_LEN,
730 			rsb_e->bt_frames,
731 			rsb_e->bt_frames_count);
732 
733 		/*
734 		 * Internal events report the UUID of the binary containing the
735 		 * fault PC and offset of the fault PC into the executable region of
736 		 * that binary (__TEXT_EXEC).
737 		 */
738 		if ((result = get_uuid_and_text_offset_for_addr(
739 			    rsb_e->record.fault_pc, &uuid, &offset)) == 0) {
740 			/* Success! */
741 			event->trap_offset = offset;
742 			uuid_unparse(*uuid, event->trap_uuid);
743 		} else {
744 			/*
745 			 * We couldn't get the required data for symbolication for some
746 			 * odd reason.
747 			 * Report the offset into the executable region and the error as
748 			 * the UUID instead.
749 			 */
750 			event->trap_offset = rsb_e->record.fault_pc - vm_kernel_stext;
751 			(void)scnprintf(event->trap_uuid, CA_UUID_LEN, "error:%d\n",
752 			    result);
753 		}
754 
755 		event->trap_type = (uint32_t)rsb_e->record.trap_type;
756 		event->trap_code = rsb_e->record.trap_code;
757 		break;
758 	}
759 
760 	default: {
761 		panic("Unexpected telemetry CA event: %u\n",
762 		    options.telemetry_ca_event);
763 	}
764 	}
765 
766 	if (ca_event) {
767 		CA_EVENT_SEND(ca_event);
768 	}
769 }
770 
771 /**
772  * Thread call which drains the record submission buffer.
773  * There must be no more than one instance of this thread running at a time.
774  */
775 static void
776 drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,
777     __unused thread_call_param_t p1)
778 {
779 	size_t drain_count = 0;
780 	size_t drain_rd_idx = 0;
781 	trap_telemetry_tree_entry_s *tree_records[RECORD_SUBMISSION_BUFFER_LENGTH];
782 
783 	/*
784 	 * We never expect the submission thread to be scheduled while another
785 	 * thread which is attempting to enqueue is suspended above it (acquiring
786 	 * disables preemption) or while another submission thread is suspended
787 	 * above it (only one submission thread should ever be running).
788 	 *
789 	 * Thus, failing to acquire the lock anywhere in this function indicates
790 	 * that something is seriously wrong.
791 	 */
792 	safe_telemetry_lock_lock();
793 
794 	/*
795 	 * If we're already draining, that means we either forgot to update
796 	 * rsb_is_draining or we have another thread draining (which should never
797 	 * happen).
798 	 */
799 	assert(!rsb_is_draining);
800 	rsb_is_draining = true;
801 
802 	/*
803 	 * Iteratively drain the submission queue until no entries remain.
804 	 * Drops and reacquires the telemetry lock.
805 	 */
806 	while ((drain_count = rsb_count)) {
807 		/* LOCKED IN */
808 		drain_rd_idx = rsb_rd_idx;
809 		safe_telemetry_lock_unlock();
810 
811 		/*
812 		 * It is safe to read these entries based on snapshots of DRAIN_COUNT
813 		 * and DRAIN_RD_IDX without holding the lock because all of the records'
814 		 * writes will have already become visible due to the lock's store
815 		 * release on the enqueue side. RSB entries are guaranteed to survive
816 		 * even when we aren't holding the lock so long as DRAIN_RD_IDX doesn't
817 		 * pass them. Since we are the only agent updating it, if we sequence
818 		 * the DRAIN_RD_IDX write after, we're fine.
819 		 *
820 		 * We may miss some records in this pass if other CPUs enqueue after the
821 		 * snapshot but we'll just pick them up in the next loop iteration.
822 		 * Additionally, since only one instance of this function will be
823 		 * running at a time, we don't need to worry about duplicate
824 		 * allocations/work.
825 		 */
826 
827 		for (size_t i = 0; i < drain_count; i++) {
828 			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
829 			rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;
830 
831 			/* Finish processing the entry and submit it as needed. */
832 			rsb_entry_submit(rsb_e);
833 
834 			if (rsb_e->options.report_once_per_site) {
835 				/*
836 				 * Though we don't insert it yet since we aren't holding the
837 				 * lock, create our tree record from the RSB entry.
838 				 */
839 				trap_telemetry_tree_entry_s *new_tree_record = kalloc_type(
840 					trap_telemetry_tree_entry_s, Z_WAITOK | Z_NOFAIL);
841 
842 				new_tree_record->record = rsb_e->record;
843 				tree_records[i] = new_tree_record;
844 			} else {
845 				tree_records[i] = NULL;
846 			}
847 		}
848 
849 		safe_telemetry_lock_lock();
850 		/* Insert draining entries into the splay as needed */
851 		for (size_t i = 0; i < drain_count; i++) {
852 			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
853 			rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;
854 
855 			if (rsb_e->options.report_once_per_site) {
856 				trap_telemetry_tree_entry_s *duplicate = SPLAY_INSERT(
857 					trap_telemetry_tree,
858 					&telemetry_splay_tree,
859 					tree_records[i]);
860 
861 				/*
862 				 * Since we scan both the RSB and the splay tree before
863 				 * submitting a report-once-per-site record, we structurally should never
864 				 * have multiple instances of any such record.
865 				 */
866 				(void)duplicate;
867 				assert(!duplicate);
868 			}
869 		}
870 
871 		/* Dequeue the submitted entries from the RSB */
872 		rsb_rd_idx =
873 		    (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH;
874 		rsb_count -= drain_count;
875 		/* LOCKED OUT */
876 	}
877 
878 	/* Done for now; if submitters still have entries, they'll need to call again. */
879 	rsb_is_draining = false;
880 	safe_telemetry_lock_unlock();
881 }
882 
883 __startup_func
884 void
885 trap_telemetry_init(void)
886 {
887 	printf(TAG "trap_telemetry_init\n");
888 	SPLAY_INIT(&telemetry_splay_tree);
889 
890 	drain_record_submission_buffer_callout = thread_call_allocate_with_options(
891 		drain_record_submission_buffer_thread_call, NULL,
892 		THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
893 
894 	if (!drain_record_submission_buffer_callout) {
895 		panic("Failed to allocate drain callout!");
896 	}
897 
898 	{
899 		/* Ensure that all telemetry events can be encoded in the bitfield */
900 		trap_telemetry_options_s opt = (trap_telemetry_options_s) {0};
901 		uint8_t last_event = TRAP_TELEMETRY_CA_EVENT_COUNT - 1;
902 		opt.telemetry_ca_event = last_event;
903 		assert(opt.telemetry_ca_event == last_event);
904 	}
905 }
906 
907 /**
908  * Submit RSB_E to the record submission queue if it needs to be submitted.
909  * Returns TRUE if the record was accepted (either enqueued or dupe'd), FALSE
910  * otherwise.
911  */
912 static bool
913 rsb_enqueue_if_needed(rsb_entry_s *rsb_e)
914 {
915 	bool record_accepted = true;
916 	bool should_flush_submission_buffer = false;
917 	trap_telemetry_tree_entry_s *splay_found_entry = NULL;
918 	trap_telemetry_tree_entry_s find_tree_e = {0};
919 
920 	if (trap_debounce_buffer_has_match(&rsb_e->record)) {
921 		/* debounce dupe */
922 		return true;
923 	}
924 
925 	if (!safe_telemetry_lock_try_lock()) {
926 		/*
927 		 * Failed to acquire the lock!
928 		 * We're likely in a nested exception. Since we can't safely do anything
929 		 * else with the record, just drop it.
930 		 */
931 		return false;
932 	}
933 
934 	if (rsb_e->options.report_once_per_site) {
935 		/* First, scan the submission queue for matching, queued records */
936 		for (size_t i = 0; i < rsb_count; i++) {
937 			size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
938 			rsb_entry_s *rsb_e_i = record_submission_buffer + rsb_i;
939 			if (match_record_compare(&rsb_e->record, &rsb_e_i->record) == 0) {
940 				/* Match, no need to report again. */
941 				goto DONE_LOCKED;
942 			}
943 		}
944 
945 		/* Next, try for a record in the splay */
946 		find_tree_e.record = rsb_e->record;
947 		splay_found_entry = SPLAY_FIND(trap_telemetry_tree,
948 		    &telemetry_splay_tree,
949 		    &find_tree_e);
950 		if (splay_found_entry) {
951 			/* Match, no need to report again. */
952 			goto DONE_LOCKED;
953 		}
954 	}
955 
956 
957 	/*
958 	 * If we haven't hit any disqualifying conditions, this means we have a new
959 	 * entry which needs to be enqueued for reporting.
960 	 */
961 	record_accepted = rsb_enqueue_locked(rsb_e);
962 	should_flush_submission_buffer = record_accepted && !rsb_is_draining;
963 
964 	if (record_accepted) {
965 		/* We've handled the record, so mark it for debouncing */
966 		trap_debounce_buffer_enter(&rsb_e->record);
967 #if DEVELOPMENT || DEBUG
968 		os_atomic_inc(&trap_telemetry_reported_events, relaxed);
969 #endif /* DEVELOPMENT || DEBUG */
970 	} else {
971 		/*
972 		 * Failed to enqueue. Since we have no better options, drop the event.
973 		 */
974 #if DEVELOPMENT || DEBUG
975 		os_atomic_inc(&trap_telemetry_capacity_dropped_events, relaxed);
976 #endif /* DEVELOPMENT || DEBUG */
977 	}
978 
979 DONE_LOCKED:
980 	safe_telemetry_lock_unlock();
981 
982 	if (should_flush_submission_buffer &&
983 	    startup_phase >= STARTUP_SUB_THREAD_CALL) {
984 		/*
985 		 * We submitted a new entry while the drain thread was either exiting or
986 		 * not running. Queue a new flush. Multiple calls here before the drain
987 		 * starts running will not result in multiple calls being queued due to
988 		 * THREAD_CALL_OPTIONS_ONCE.
989 		 */
990 		thread_call_enter(drain_record_submission_buffer_callout);
991 	}
992 
993 	return record_accepted;
994 }
995 
996 /**
997  * Should a given trap be ignored/not reported?
998  */
999 static bool
1000 should_ignore_trap(
1001 	trap_telemetry_type_t trap_type,
1002 	uint64_t trap_code,
1003 	trap_telemetry_options_s options)
1004 {
1005 	if (trap_telemetry_disable_all) {
1006 		/* Telemetry is disabled, drop all events. */
1007 		return true;
1008 	}
1009 
1010 	if ((options.telemetry_ca_event == TRAP_TELEMETRY_CA_EVENT_NONE ||
1011 	    trap_telemetry_disable_ca) &&
1012 	    !should_dump_trap(trap_type, trap_code)) {
1013 		/* Trap won't be reported anywhere, so it can be dropped. */
1014 		return true;
1015 	}
1016 
1017 	return false;
1018 }
1019 
1020 bool
1021 trap_telemetry_report_exception(
1022 	trap_telemetry_type_t trap_type,
1023 	uint64_t trap_code,
1024 	trap_telemetry_options_s options,
1025 	void *saved_state)
1026 {
1027 	if (should_ignore_trap(trap_type, trap_code, options)) {
1028 		/*
1029 		 * Don't bother reporting the trap. Since this is not an error, report
1030 		 * that we handled the trap as expected.
1031 		 */
1032 		return true;
1033 	}
1034 
1035 #if __arm64__
1036 	arm_saved_state_t *state = (arm_saved_state_t *)saved_state;
1037 
1038 	uintptr_t faulting_address = get_saved_state_pc(state);
1039 	uintptr_t saved_fp = get_saved_state_fp(state);
1040 #else
1041 	x86_saved_state64_t *state = (x86_saved_state64_t *)saved_state;
1042 
1043 	uintptr_t faulting_address = state->isf.rip;
1044 	uintptr_t saved_fp = state->rbp;
1045 #endif
1046 
1047 	struct backtrace_control ctl = {
1048 		.btc_frame_addr = (uintptr_t)saved_fp,
1049 	};
1050 
1051 	rsb_entry_s submission_e = { 0 };
1052 	submission_e.record.trap_type = trap_type;
1053 	submission_e.record.trap_code = trap_code;
1054 	submission_e.record.fault_pc = faulting_address;
1055 	submission_e.options = options;
1056 	submission_e.bt_frames_count = backtrace(
1057 		submission_e.bt_frames, TRAP_TELEMETRY_BT_FRAMES, &ctl, NULL);
1058 
1059 	return rsb_enqueue_if_needed(&submission_e);
1060 }
1061 
1062 __attribute__((noinline))
1063 bool
1064 trap_telemetry_report_simulated_trap(
1065 	trap_telemetry_type_t trap_type,
1066 	uint64_t trap_code,
1067 	trap_telemetry_options_s options)
1068 {
1069 	if (should_ignore_trap(trap_type, trap_code, options)) {
1070 		/*
1071 		 * Don't bother reporting the trap. Since this is not an error, report
1072 		 * that we did handle the trap as expected.
1073 		 */
1074 		return true;
1075 	}
1076 
1077 	/*
1078 	 * We want to provide a backtrace as if a trap occurred at the callsite of
1079 	 * the simulated trap. Doing this safely is somewhat awkward as
1080 	 * __builtin_frame_address with a non-zero argument can itself fault (if our
1081 	 * caller's frame pointer is invalid), so instead we take a backtrace starting
1082 	 * in our own frame and chop it up as expected.
1083 	 */
1084 
1085 	const size_t frames_count = TRAP_TELEMETRY_BT_FRAMES + 1;
1086 	uintptr_t frames[frames_count];
1087 
1088 	struct backtrace_control ctl = {
1089 		.btc_frame_addr = (uintptr_t)__builtin_frame_address(0),
1090 	};
1091 
1092 	size_t frames_valid_count = backtrace(frames, frames_count, &ctl, NULL);
1093 	if (frames_valid_count) {
1094 		/*
1095 		 * Take the first backtrace entry as the fault address and then place
1096 		 * all other entries into the backtrace. The first backtrace entry is our
1097 		 * caller (due to the noinline attribute), which gives us the fault
1098 		 * address as the call site (as desired).
1099 		 */
1100 		return trap_telemetry_report_simulated_trap_with_backtrace(
1101 			trap_type,
1102 			trap_code,
1103 			options,
1104 			/* fault_pc */ frames[0],
1105 			/* frames */ frames + 1,
1106 			/* frames_valid_count */ frames_valid_count - 1);
1107 	} else {
1108 		/* Failed to take a backtrace? Report just the return address then. */
1109 		return trap_telemetry_report_simulated_trap_with_backtrace(
1110 			trap_type,
1111 			trap_code,
1112 			options,
1113 			/* fault_pc */ (uintptr_t)__builtin_return_address(0),
1114 			/* frames */ NULL,
1115 			/* frames_valid_count */ 0);
1116 	}
1117 }
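/*
 * An illustrative call site for the simulated-trap path (a sketch; the trap
 * type and code values are placeholders, not constants defined in this file):
 *
 *     trap_telemetry_options_s opts = {
 *         .telemetry_ca_event = TRAP_TELEMETRY_CA_EVENT_INTERNAL,
 *         .report_once_per_site = true,
 *     };
 *     trap_telemetry_report_simulated_trap(my_trap_type, my_trap_code, opts);
 */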
1118 
1119 bool
1120 trap_telemetry_report_simulated_trap_with_backtrace(
1121 	trap_telemetry_type_t trap_type,
1122 	uint64_t trap_code,
1123 	trap_telemetry_options_s options,
1124 	uintptr_t fault_pc,
1125 	uintptr_t *frames,
1126 	size_t frames_valid_count)
1127 {
1128 	if (should_ignore_trap(trap_type, trap_code, options)) {
1129 		/*
1130 		 * Don't bother reporting the trap. Since this is not an error, report
1131 		 * that we did handle the trap as expected.
1132 		 */
1133 		return true;
1134 	}
1135 
1136 	rsb_entry_s submission_e = { 0 };
1137 	submission_e.record.trap_type = trap_type;
1138 	submission_e.record.trap_code = trap_code;
1139 	submission_e.options = options;
1140 
1141 	// only copy up to TRAP_TELEMETRY_BT_FRAMES frames
1142 	if (frames_valid_count >= TRAP_TELEMETRY_BT_FRAMES) {
1143 		frames_valid_count = TRAP_TELEMETRY_BT_FRAMES;
1144 	}
1145 
1146 	submission_e.bt_frames_count = frames_valid_count;
1147 	submission_e.record.fault_pc = fault_pc;
1148 
1149 	memcpy(submission_e.bt_frames, frames, frames_valid_count * sizeof(*frames));
1150 
1151 	return rsb_enqueue_if_needed(&submission_e);
1152 }
1153