/*
 * Copyright (c) 2024 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#include <kern/trap_telemetry.h>
#include <libkern/coreanalytics/coreanalytics.h>
#include <kern/percpu.h>
#include <libkern/tree.h>
#include <kern/locks.h>
#include <kern/thread_call.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <kern/telemetry.h>
#include <kern/assert.h>
#include <kern/backtrace.h>
#include <machine/machine_routines.h>
#include <libkern/OSKextLibPrivate.h>
#include <libkern/kernel_mach_header.h>

#define TAG "[trap_telemetry] "

/* ~* Module Configuration *~ */

/**
 * Maximum number of backtrace frames to attempt to report.
 *
 * Some reporting destinations may use fewer frames than this due to
 * encoding/space restrictions.
 */
#define TRAP_TELEMETRY_BT_FRAMES (15)

/** Static length of the various CA telemetry events' backtrace strings */
#define TRAP_TELEMETRY_BT_STR_LEN CA_UBSANBUF_LEN

/**
 * Entry count of the record submission buffer (RSB).
 *
 * Larger sizes support a higher event volume and can help avoid dropping
 * events under load.
 */
#define RECORD_SUBMISSION_BUFFER_LENGTH (16)

/** Number of last events per-CPU to remember and reject. */
#define DEBOUNCE_RECORD_COUNT (2)

/**
 * When true, trap telemetry will not report events to CoreAnalytics.
 *
 * Local reporting (via trap_telemetry_dump_event) is not impacted.
 */
static TUNABLE(bool, trap_telemetry_disable_ca, "trap_telemetry_disable_ca", false);

/**
 * Disable all trap telemetry reporting (including local reporting).
 */
static TUNABLE(bool, trap_telemetry_disable_all, "trap_telemetry_disable_all", false);

/**
 * Print matching events to the console. Set to -1 to disable.
 * Setting a type while leaving the code disabled matches all codes of the
 * given type.
 */
static TUNABLE(uint32_t, trap_telemetry_dump_type, "trap_telemetry_dump_type",
    -1);
static TUNABLE(uint64_t, trap_telemetry_dump_code, "trap_telemetry_dump_code",
    -1);
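
/*
 * Illustrative boot-arg example (the type/code values are hypothetical; real
 * values must match the trap_telemetry_type_t values and codes in use).
 * Booting with
 *
 *     trap_telemetry_dump_type=2
 *
 * dumps every event of type 2 regardless of code (code matching disabled),
 * while
 *
 *     trap_telemetry_dump_type=2 trap_telemetry_dump_code=0x10
 *
 * additionally restricts dumping to events of type 2 with code 0x10.
 */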

/* ~* Data Structures *~ */

typedef struct match_record {
	/** Slid address at which the exception was thrown */
	uintptr_t fault_pc;

	/** The trap type or "class" for the record. */
	trap_telemetry_type_t trap_type;

	/** The trap code disambiguates traps within a class. */
	uint64_t trap_code;
} match_record_s;

typedef struct rsb_entry {
	match_record_s record;
	trap_telemetry_options_s options;
	size_t bt_frames_count;
	uintptr_t bt_frames[TRAP_TELEMETRY_BT_FRAMES];
} rsb_entry_s;

typedef struct trap_telemetry_tree_entry {
	SPLAY_ENTRY(trap_telemetry_tree_entry) link;
	match_record_s record;
} trap_telemetry_tree_entry_s;

typedef struct trap_debounce_buffer {
	/**
	 * Storage array for trap records used to debounce.
	 *
	 * We don't have valid bits for entries but rather use zero to implicitly
	 * indicate an invalid entry (as they should never naturally match any real
	 * trap).
	 */
	match_record_s records[DEBOUNCE_RECORD_COUNT];

	/** The index of the entry to replace next (FIFO) */
	size_t tail;
} trap_debounce_buffer_s;

/* ~* Core Analytics *~ */
CA_EVENT(kernel_breakpoint_event,
    CA_INT, brk_type,
    CA_INT, brk_code,
    CA_INT, faulting_address,
    CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
    CA_STATIC_STRING(CA_UUID_LEN), uuid);

CA_EVENT(trap_telemetry_internal,
    CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
    CA_INT, trap_code,
    CA_INT, trap_offset,
    CA_INT, trap_type,
    CA_STATIC_STRING(CA_UUID_LEN), trap_uuid);

/* ~* Splay tree *~ */
static int
match_record_compare(match_record_s *r1,
    match_record_s *r2)
{
	if (r1->fault_pc < r2->fault_pc) {
		return 1;
	} else if (r1->fault_pc > r2->fault_pc) {
		return -1;
	}

	if (r1->trap_type < r2->trap_type) {
		return 1;
	} else if (r1->trap_type > r2->trap_type) {
		return -1;
	}

	if (r1->trap_code < r2->trap_code) {
		return 1;
	} else if (r1->trap_code > r2->trap_code) {
		return -1;
	}

	/* Records match */
	return 0;
}

static int
trap_telemetry_tree_entry_compare(trap_telemetry_tree_entry_s *r1,
    trap_telemetry_tree_entry_s *r2)
{
	return match_record_compare(&r1->record, &r2->record);
}

SPLAY_HEAD(trap_telemetry_tree, trap_telemetry_tree_entry);
/* These functions are generated by SPLAY_PROTOTYPE but are currently unused */
__unused static struct trap_telemetry_tree_entry *
trap_telemetry_tree_SPLAY_NEXT(struct trap_telemetry_tree *head,
    struct trap_telemetry_tree_entry *elm);
__unused static struct trap_telemetry_tree_entry *
trap_telemetry_tree_SPLAY_SEARCH(struct trap_telemetry_tree *head,
    struct trap_telemetry_tree_entry *elm);
__unused static struct trap_telemetry_tree_entry *
trap_telemetry_tree_SPLAY_MIN_MAX(struct trap_telemetry_tree *head, int val);
SPLAY_PROTOTYPE(trap_telemetry_tree,
    trap_telemetry_tree_entry,
    link,
    trap_telemetry_tree_entry_compare);
SPLAY_GENERATE(trap_telemetry_tree,
    trap_telemetry_tree_entry,
    link,
    trap_telemetry_tree_entry_compare);

/* ~* Globals *~ */
/* Lock which protects the event submission queue */
static LCK_GRP_DECLARE(trap_telemetry_lock_grp, "trap_telemetry_lock");
static LCK_SPIN_DECLARE(trap_telemetry_lock, &trap_telemetry_lock_grp);

/*
 * Since traps are, naturally, caught in an exception context, it is not safe to
 * allocate. To solve this, we use a short submission ring buffer which collects
 * records for processing on a submission thread (which can allocate).
 *
 * This ring buffer and all its associated control fields are locked by
 * TRAP_TELEMETRY_LOCK.
 */
static rsb_entry_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH];
static size_t rsb_rd_idx;
static size_t rsb_wr_idx;
static size_t rsb_count;
static bool rsb_is_draining;

/**
 * For deduplication, we store hit records in a splay tree.
 * We use a splay here for performance reasons since traps tend to exhibit a
 * degree of temporal locality.
 */
static struct trap_telemetry_tree telemetry_splay_tree;

/**
 * Flag indicating whether this CPU is currently trying to acquire the telemetry
 * lock or has already acquired the lock.
 * This is used as a deadlock avoidance mechanism.
 */
static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked);

/**
 * In order to avoid reporting the same event many times in quick succession
 * (especially when report_once_per_site=false) and overwhelming both the trap
 * telemetry module and CoreAnalytics, we "debounce" all events on a per-CPU
 * basis. This is done through a buffer which tracks the last
 * DEBOUNCE_RECORD_COUNT trap records.
 */
static trap_debounce_buffer_s PERCPU_DATA(per_cpu_trap_debounce_buffer);

/**
 * Thread which is responsible for clearing the submission buffer by submitting
 * to CoreAnalytics and the local tree.
 */
static struct thread_call *drain_record_submission_buffer_callout;

#if DEVELOPMENT || DEBUG
/**
 * sysctl debug.trap_telemetry_reported_events
 *
 * Counts the number of events which were successfully reported (either locally
 * or to CoreAnalytics). This does not include events which were ignored,
 * debounced, or discarded as a duplicate.
 */
unsigned long trap_telemetry_reported_events = 0;

/**
 * sysctl debug.trap_telemetry_capacity_dropped_events
 *
 * Counts the number of events which, if not for the RSB being full, would have
 * been reported successfully. Events in this count indicate telemetry loss.
 */
unsigned long trap_telemetry_capacity_dropped_events = 0;
#endif /* DEVELOPMENT || DEBUG */

/* ~* Implementation *~ */

/**
 * Try to acquire a spin lock in an interrupt-deadlock safe way.
 *
 * This function differs from the standard lck_spin_try_lock function in that it
 * will block if the lock is expected to be acquired *eventually* but will not
 * block if it detects that the lock will never be acquired (such as when the
 * current CPU owns the lock, which can happen if a trap is taken while handling
 * a telemetry operation under the lock).
 */
static inline bool OS_WARN_RESULT
safe_telemetry_lock_try_lock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	/*
	 * Disable preemption to ensure that our block signal always corresponds
	 * to the CPU we're actually running on.
	 *
	 * If we didn't disable preemption, there is a case where we may mark that
	 * we are trying to acquire the lock on core A, get approved, get preempted,
	 * get rescheduled on core B, and then take the lock there. If we then take
	 * another exception on core B while handling the original exception (ex. we
	 * take an IRQ and a telemetry exception is generated there), we may
	 * re-enter on core B, (incorrectly) see that we are not blocked, try to
	 * acquire the lock, and ultimately deadlock.
	 */
	disable_preemption();

	/*
	 * Since preemption is disabled, we'll get the desired behavior even if
	 * we take a telemetry trap in the middle of this sequence because the
	 * interrupting context will never return here while holding the telemetry
	 * lock.
	 */
	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	if (*telemetry_lock_blocked) {
		/*
		 * This CPU has already acquired/is blocked on the telemetry lock.
		 * Attempting to acquire again on this CPU will deadlock. Refuse the
		 * operation.
		 */
		enable_preemption();
		return false;
	}

	*telemetry_lock_blocked = 1;

	/* We've been approved to acquire the lock on this core! */
	lck_spin_lock(&trap_telemetry_lock);
	return true;
}

/**
 * Attempts to acquire the telemetry lock, panicking if it cannot be acquired.
 */
static void
safe_telemetry_lock_lock(void)
{
	if (!safe_telemetry_lock_try_lock()) {
		panic("Unexpectedly could not acquire telemetry lock "
		    "(nested acquire will deadlock)");
	}
}

/**
 * Unlock the telemetry lock after it was acquired with
 * safe_telemetry_lock_try_lock.
 */
static inline void
safe_telemetry_lock_unlock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	lck_spin_unlock(&trap_telemetry_lock);

	/*
	 * Clear the block only AFTER having dropped the lock so that we can't
	 * hit a really narrow deadlock race where we get interrupted between
	 * clearing the block and dropping the lock.
	 */
	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed);

	/* Finally, reenable preemption as this thread is now safe to move */
	enable_preemption();
}

/**
 * Enqueue RSB_E into the record submission buffer.
 * Returns true if successful, false otherwise.
 * TRAP_TELEMETRY_LOCK must be held during this operation.
 */
static bool
rsb_enqueue_locked(rsb_entry_s *rsb_e)
{
	if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) {
		/* We're full. */
		return false;
	}

	/* Write the new entry at the write head */
	rsb_entry_s *dst = record_submission_buffer + rsb_wr_idx;
	*dst = *rsb_e;

	/* Update pointers */
	rsb_count += 1;
	rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH;

	return true;
}

/**
 * Enter RECORD into this CPU's debounce buffer, thereby preventing it from
 * being reported again until it falls off. Records are removed from the
 * debounce buffer automatically as newer records are inserted.
 */
static void
trap_debounce_buffer_enter(match_record_s *record)
{
	trap_debounce_buffer_s *debounce = NULL;

	/*
	 * Since we don't lock the debounce buffers and instead rely on them being
	 * per-CPU for synchronization, we need to disable preemption to ensure that
	 * we only access the correct debounce buffer.
	 */
	disable_preemption();
	debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);

	/*
	 * Enter the record.
	 * We do this by overwriting the oldest entry, which naturally gives us a
	 * FIFO replacement policy.
	 */
	debounce->records[debounce->tail] = *record;
	debounce->tail = (debounce->tail + 1) % DEBOUNCE_RECORD_COUNT;

	enable_preemption();
}

/**
 * Search for RECORD in the per-CPU debounce buffer.
 *
 * This is useful for determining if a trap has triggered recently.
 */
static bool
trap_debounce_buffer_has_match(match_record_s *record)
{
	trap_debounce_buffer_s *debounce = NULL;
	bool match = false;

	disable_preemption();
	debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);

	for (size_t i = 0; i < DEBOUNCE_RECORD_COUNT; i++) {
		if (match_record_compare(debounce->records + i, record) == 0) {
			match = true;
			break;
		}
	}

	enable_preemption();

	return match;
}

/**
 * Should the given trap be dumped to the console for debug?
 */
static inline bool
should_dump_trap(
	trap_telemetry_type_t trap_type,
	uint64_t trap_code)
{
	if (trap_telemetry_dump_type == -1 /* type match disabled */ ||
	    trap_telemetry_dump_type != (uint32_t)trap_type) {
		/* No match on type */
		return false;
	}

	if (trap_telemetry_dump_code != -1 /* code match is enabled */ &&
	    /* but it doesn't match the trap code */
	    trap_telemetry_dump_code != trap_code) {
		return false;
	}

	/* Matching type and, if applicable, code. */
	return true;
}

/**
 * Get the UUID and __TEXT_EXEC based offset of ADDR into its respective binary
 * image. The caller is not responsible for managing the UUID memory (i.e. it
 * is not owned by the caller).
 *
 * Returns negative on error.
 *
 * Acquires a sleeping lock; do not call while interrupts are disabled.
 */
static int
get_uuid_and_text_offset_for_addr(
	uintptr_t addr, uuid_t **uuid_out, uint64_t *offset_out)
{
	kernel_mach_header_t *mh = NULL;
	kernel_segment_command_t *seg_text = NULL;
	void *mh_uuid = NULL;
	unsigned long mh_uuid_len = 0;
#if __arm64__
	const char *text_segment_label = "__TEXT_EXEC";
#else
	const char *text_segment_label = "__TEXT";
#endif

	if (!(mh = OSKextKextForAddress((void *)addr))) {
		return -1;
	}

	if (!(seg_text = getsegbynamefromheader(mh, text_segment_label))) {
		return -2;
	}

	if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) {
		return -3;
	}

	if (mh_uuid_len != sizeof(**uuid_out)) {
		return -4;
	}

	*uuid_out = (uuid_t *)(mh_uuid);
	*offset_out = addr - seg_text->vmaddr;

	return 0;
}

/**
 * If it does not already exist, inserts UUID into UUID_CACHE (described by
 * CACHE_LEN). In either case, returns the index of the UUID in the cache
 * through *IDX_OUT and sets *IS_NEW_OUT if UUID was inserted.
 */
static void
uuid_cache_get_or_insert(uuid_t *uuid, uuid_t **uuid_cache, size_t cache_len,
    uint32_t *idx_out, bool *is_new_out)
{
	for (uint32_t i = 0; i < cache_len; i++) {
		if (uuid_cache[i] == uuid) {
			/* Hit on existing entry */
			*idx_out = i;
			*is_new_out = false;
			return;
		} else if (uuid_cache[i] == NULL) {
			/*
			 * Reached the end of the valid entries without finding our UUID.
			 * Insert it now.
			 */
			uuid_cache[i] = uuid;
			*idx_out = i;
			*is_new_out = true;
			return;
		}

		/* No match yet, but there might be more entries. Keep going. */
	}

	/*
	 * We didn't find the UUID but we also couldn't insert it because we never
	 * found a free space. This shouldn't happen if the UUID cache is correctly
	 * sized.
	 */
	panic("Could not find UUID in cache but cache was full");
}

/**
 * Convert an array of backtrace addresses in FRAMES into an offset backtrace
 * string in BUF.
 *
 * This backtrace scheme has records delimited by newline characters. Each
 * record is either a backtrace entry or a UUID entry. A backtrace entry is
 * identified by the presence of an `@` character in the record. Any other
 * record is a UUID entry.
 *
 * Example:
 *
 * 14760@0\n
 * 2B417DFA-7964-3EBF-97EE-FC94D26FFABD\n
 * 9f18@1\n
 * F9EFB7CA-8F23-3990-8E57-A7DAD698D494\n
 * 87c974@2\n
 * 8686ED81-CAA9-358D-B162-1F2F97334C65\n
 * 87cce4@2\n
 * 874f64@2\n
 *
 * Structurally, this example is equivalent to:
 *
 * <text offset>@<uuid entry idx=0>\n
 * <uuid entry 0>\n
 * <text offset>@<uuid entry idx=1>\n
 * <uuid entry 1>\n
 * <text offset>@<uuid entry idx=2>\n
 * <uuid entry 2>\n
 * <text offset>@<uuid entry idx=2>\n
 * <text offset>@<uuid entry idx=2>\n
 *
 * The first record here is a backtrace entry. Backtrace entries encode program
 * location as a hex offset into the __TEXT/__TEXT_EXEC segment of the enclosing
 * binary. The enclosing binary is identified by a hex encoded, zero-indexed
 * UUID entry ID which follows after the `@` in a backtrace entry.
 *
 * The second record is a UUID entry. UUID entries are simply records which
 * contain nothing but the UUID. UUID entries are implicitly assigned IDs,
 * starting from zero, in the order they appear in the record stream. Entries
 * may be referenced before they are defined.
 *
 * Given a 256 byte buffer, we can fit up to ten backtrace entries (assuming
 * each binary is no larger than 256MB and we have no more than four unique
 * UUIDs in the backtrace).
 *
 * If the encoder runs out of space (for example, because we have more than four
 * unique UUIDs), the later records will truncate abruptly. In order to provide
 * as much information as possible, UUIDs are encoded immediately after they are
 * first used. This means that if the encoder does run out of space, all
 * backtrace entries but the last will always decode correctly.
 */
static void
backtrace_to_offset_bt_string(
	char *buf,
	size_t buf_len,
	const uintptr_t *frames,
	size_t frames_len)
{
	size_t written = 0;
	const size_t uuid_cache_count = TRAP_TELEMETRY_BT_FRAMES;
	/*
	 * The UUID cache relies on NULL entries to represent free slots, so clear
	 * it before use.
	 */
	uuid_t *uuid_cache[TRAP_TELEMETRY_BT_FRAMES] = {0};
	assert(frames_len <= uuid_cache_count);

	/* Add all frames and store unique UUIDs into the cache */
	for (size_t frame_i = 0; frame_i < frames_len; frame_i++) {
		uuid_t *uuid = NULL;
		uint64_t offset = 0;

		if (get_uuid_and_text_offset_for_addr(
			    frames[frame_i], &uuid, &offset) == 0) {
			/* Success! Insert (or reuse) the UUID and then print the entry. */
			uint32_t uuid_i;
			bool is_new;
			uuid_cache_get_or_insert(
				uuid, uuid_cache, uuid_cache_count,
				&uuid_i, &is_new);

			/* Write backtrace record */
			written += scnprintf(buf + written, buf_len - written,
			    "%llx@%x\n",
			    offset, uuid_i);

			/* Write UUID record, if needed. */
			if (is_new) {
				uuid_string_t uuid_str;
				uuid_unparse(*uuid, uuid_str);

				written += scnprintf(buf + written, buf_len - written,
				    "%s\n",
				    uuid_str);
			}
		} else {
			/*
			 * Could not find an image for the target?
			 * Just return the offset into the executable region with an error
			 * UUID ref as it's better than nothing.
			 */
			written += scnprintf(buf + written, buf_len - written,
			    "%lx@!\n",
			    frames[frame_i] - vm_kernel_stext);
		}
	}
}

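#if 0
/*
 * Illustrative decoder sketch for the offset backtrace string format above.
 * Not part of the kernel build (hosted C for clarity); the function name and
 * MAX_UUIDS limit are hypothetical. Records are newline delimited: a record
 * containing '@' is a backtrace entry "<hex offset>@<hex uuid idx>" (or
 * "<hex offset>@!" when the encoder could not find an image); any other
 * record is a UUID entry which implicitly takes the next sequential index.
 * Because a UUID may be referenced before it is defined, a full decoder would
 * collect every record before resolving references.
 */
#include <stdio.h>
#include <string.h>

#define MAX_UUIDS 16

static void
decode_offset_bt_string(const char *buf)
{
	char uuids[MAX_UUIDS][64];
	size_t uuid_count = 0;

	while (*buf != '\0') {
		const char *nl = strchr(buf, '\n');
		size_t len = (nl != NULL) ? (size_t)(nl - buf) : strlen(buf);
		char record[128];

		if (len >= sizeof(record)) {
			return; /* malformed or truncated record */
		}
		memcpy(record, buf, len);
		record[len] = '\0';

		unsigned long long offset;
		unsigned int uuid_idx;
		if (strchr(record, '@') != NULL) {
			/* Backtrace entry: text offset plus a UUID reference */
			if (sscanf(record, "%llx@%x", &offset, &uuid_idx) == 2) {
				printf("frame: offset=0x%llx uuid_ref=%u\n",
				    offset, uuid_idx);
			} else {
				/* "<offset>@!" error form (no enclosing image) */
				printf("frame (no image): %s\n", record);
			}
		} else if (uuid_count < MAX_UUIDS) {
			/* UUID entry: implicitly assigned the next index */
			snprintf(uuids[uuid_count], sizeof(uuids[0]), "%s", record);
			printf("uuid[%zu] = %s\n", uuid_count, record);
			uuid_count++;
		}

		buf = (nl != NULL) ? (nl + 1) : (buf + len);
	}
}
#endif
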
/**
 * Print RSB_E to the console in a human friendly way.
 */
static void
rsb_entry_dump(rsb_entry_s *rsb_e)
{
	printf(TAG "Triggered trap at PC=0x%08lx "
	    "(type=%u, code=0x%04llx). Backtrace:\n",
	    rsb_e->record.fault_pc,
	    (uint32_t)rsb_e->record.trap_type, rsb_e->record.trap_code);

	for (size_t frame_i = 0; frame_i < rsb_e->bt_frames_count; frame_i++) {
		printf(TAG "\t0x%08lx\n", rsb_e->bt_frames[frame_i]);
	}
}

/**
 * Submit RSB_E to CoreAnalytics (or another backing event provider as
 * appropriate).
 */
static void
rsb_entry_submit(rsb_entry_s *rsb_e)
{
	trap_telemetry_options_s options = rsb_e->options;

	bool matched_dump_bootarg = should_dump_trap(
		rsb_e->record.trap_type, rsb_e->record.trap_code);
	if (matched_dump_bootarg) {
		rsb_entry_dump(rsb_e);
	}

	ca_event_t ca_event = NULL;
	switch (options.telemetry_ca_event) {
	case TRAP_TELEMETRY_CA_EVENT_NONE: {
		/*
		 * Unless the event matches the dump boot-arg, we should never see
		 * unreported events in the backend. Instead, we expect these events
		 * to be dropped in the frontend without ever being submitted.
		 */
		assert(matched_dump_bootarg);
		break;
	}

	case TRAP_TELEMETRY_CA_EVENT_KERNEL_BRK: {
		ca_event = CA_EVENT_ALLOCATE(kernel_breakpoint_event);
		CA_EVENT_TYPE(kernel_breakpoint_event) * event = ca_event->data;

		/*
		 * The BRK telemetry format is somewhat less dense, so to avoid
		 * truncating (and to maintain the historical backtrace count) report
		 * five or fewer frames.
		 */
		uint32_t reported_bt_count =
		    MIN((uint32_t)rsb_e->bt_frames_count, 5);
		telemetry_backtrace_to_string(
			/* buf */ event->backtrace,
			/* buf_size */ TRAP_TELEMETRY_BT_STR_LEN,
			/* tot */ reported_bt_count,
			/* frames */ rsb_e->bt_frames);

		event->brk_type = (uint32_t)rsb_e->record.trap_type;
		event->brk_code = (uint64_t)rsb_e->record.trap_code;
		event->faulting_address = rsb_e->record.fault_pc - vm_kernel_stext;
		strlcpy(event->uuid, kernel_uuid_string, CA_UUID_LEN);
		break;
	}

	case TRAP_TELEMETRY_CA_EVENT_INTERNAL: {
		int result;
		uuid_t *uuid = NULL;
		uint64_t offset = 0;

		ca_event = CA_EVENT_ALLOCATE(trap_telemetry_internal);
		CA_EVENT_TYPE(trap_telemetry_internal) * event = ca_event->data;

		backtrace_to_offset_bt_string(
			/* buf */ event->backtrace,
			/* buf_len */ TRAP_TELEMETRY_BT_STR_LEN,
			rsb_e->bt_frames,
			rsb_e->bt_frames_count);

		/*
		 * Internal events report the UUID of the binary containing the
		 * fault PC and the offset of the fault PC into the executable region
		 * of that binary (__TEXT_EXEC).
		 */
		if ((result = get_uuid_and_text_offset_for_addr(
			    rsb_e->record.fault_pc, &uuid, &offset)) == 0) {
			/* Success! */
			event->trap_offset = offset;
			uuid_unparse(*uuid, event->trap_uuid);
		} else {
			/*
			 * We couldn't get the required data for symbolication for some
			 * odd reason.
			 * Report the offset into the executable region and the error as
			 * the UUID instead.
			 */
			event->trap_offset = rsb_e->record.fault_pc - vm_kernel_stext;
			(void)scnprintf(event->trap_uuid, CA_UUID_LEN, "error:%d\n",
			    result);
		}

		event->trap_type = (uint32_t)rsb_e->record.trap_type;
		event->trap_code = rsb_e->record.trap_code;
		break;
	}

	default: {
		panic("Unexpected telemetry CA event: %u\n",
		    options.telemetry_ca_event);
	}
	}

	if (ca_event) {
		CA_EVENT_SEND(ca_event);
	}
}

/**
 * Thread call which drains the record submission buffer.
 * There must be no more than one instance of this thread running at a time.
 */
static void
drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,
    __unused thread_call_param_t p1)
{
	size_t drain_count = 0;
	size_t drain_rd_idx = 0;
	trap_telemetry_tree_entry_s *tree_records[RECORD_SUBMISSION_BUFFER_LENGTH];

	/*
	 * We never expect the submission thread to be scheduled while another
	 * thread which is attempting to enqueue is suspended above it (acquiring
	 * disables preemption) or while another submission thread is suspended
	 * above it (only one submission thread should ever be running).
	 *
	 * Thus, failing to acquire the lock anywhere in this function indicates
	 * that something is seriously wrong.
	 */
	safe_telemetry_lock_lock();

	/*
	 * If we're already draining, that means we either forgot to update
	 * rsb_is_draining or we have another thread draining (which should never
	 * happen).
	 */
	assert(!rsb_is_draining);
	rsb_is_draining = true;

	/*
	 * Iteratively drain the submission queue until no entries remain.
	 * Drops and reacquires the telemetry lock.
	 */
	while ((drain_count = rsb_count)) {
		/* LOCKED IN */
		drain_rd_idx = rsb_rd_idx;
		safe_telemetry_lock_unlock();

		/*
		 * It is safe to read these entries based on snapshots of DRAIN_COUNT
		 * and DRAIN_RD_IDX without holding the lock because all of the records'
		 * writes will have already become visible due to the lock's store
		 * release on the enqueue side. RSB entries are guaranteed to survive
		 * even when we aren't holding the lock so long as DRAIN_RD_IDX doesn't
		 * pass them. Since we are the only agent updating it, if we sequence
		 * the DRAIN_RD_IDX write after, we're fine.
		 *
		 * We may miss some records in this pass if other CPUs enqueue after the
		 * snapshot but we'll just pick them up in the next loop iteration.
		 * Additionally, since only one instance of this function will be
		 * running at a time, we don't need to worry about duplicate
		 * allocations/work.
		 */

		for (size_t i = 0; i < drain_count; i++) {
			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
			rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;

			/* Finish processing the entry and submit it as needed. */
			rsb_entry_submit(rsb_e);

			if (rsb_e->options.report_once_per_site) {
				/*
				 * Though we don't insert it yet since we aren't holding the
				 * lock, create our tree record from the RSB entry.
				 */
				trap_telemetry_tree_entry_s *new_tree_record = kalloc_type(
					trap_telemetry_tree_entry_s, Z_WAITOK | Z_NOFAIL);

				new_tree_record->record = rsb_e->record;
				tree_records[i] = new_tree_record;
			} else {
				tree_records[i] = NULL;
			}
		}

		safe_telemetry_lock_lock();
		/* Insert draining entries into the splay as needed */
		for (size_t i = 0; i < drain_count; i++) {
			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
			rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;

			if (rsb_e->options.report_once_per_site) {
				trap_telemetry_tree_entry_s *duplicate = SPLAY_INSERT(
					trap_telemetry_tree,
					&telemetry_splay_tree,
					tree_records[i]);

				/*
				 * Since we scan both the RSB and the splay tree before
				 * submitting a report-once record, we structurally should
				 * never have multiple instances of any such record.
				 */
				(void)duplicate;
				assert(!duplicate);
			}
		}

		/* Dequeue the submitted entries from the RSB */
		rsb_rd_idx =
		    (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH;
		rsb_count -= drain_count;
		/* LOCKED OUT */
	}

	/* Done for now; if submitters have entries, they'll need to call again. */
	rsb_is_draining = false;
	safe_telemetry_lock_unlock();
}

__startup_func
void
trap_telemetry_init(void)
{
	printf(TAG "trap_telemetry_init\n");
	SPLAY_INIT(&telemetry_splay_tree);

	drain_record_submission_buffer_callout = thread_call_allocate_with_options(
		drain_record_submission_buffer_thread_call, NULL,
		THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);

	if (!drain_record_submission_buffer_callout) {
		panic("Failed to allocate drain callout!");
	}

	{
		/* Ensure that all telemetry events can be encoded in the bitfield */
		trap_telemetry_options_s opt = (trap_telemetry_options_s) {0};
		uint8_t last_event = TRAP_TELEMETRY_CA_EVENT_COUNT - 1;
		opt.telemetry_ca_event = last_event;
		assert(opt.telemetry_ca_event == last_event);
	}
}

/**
 * Submit RSB_E to the record submission queue if it needs to be submitted.
 * Returns true if the record was accepted (either enqueued or treated as a
 * duplicate), false otherwise.
 */
static bool
rsb_enqueue_if_needed(rsb_entry_s *rsb_e)
{
	bool record_accepted = true;
	bool should_flush_submission_buffer = false;
	trap_telemetry_tree_entry_s *splay_found_entry = NULL;
	trap_telemetry_tree_entry_s find_tree_e = {0};

	if (trap_debounce_buffer_has_match(&rsb_e->record)) {
		/* debounce dupe */
		return true;
	}

	if (!safe_telemetry_lock_try_lock()) {
		/*
		 * Failed to acquire the lock!
		 * We're likely in a nested exception. Since we can't safely do anything
		 * else with the record, just drop it.
		 */
		return false;
	}

	if (rsb_e->options.report_once_per_site) {
		/* First, scan the submission queue for matching, queued records */
		for (size_t i = 0; i < rsb_count; i++) {
			size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
			rsb_entry_s *rsb_e_i = record_submission_buffer + rsb_i;
			if (match_record_compare(&rsb_e->record, &rsb_e_i->record) == 0) {
				/* Match, no need to report again. */
				goto DONE_LOCKED;
			}
		}

		/* Next, try for a record in the splay */
		find_tree_e.record = rsb_e->record;
		splay_found_entry = SPLAY_FIND(trap_telemetry_tree,
		    &telemetry_splay_tree,
		    &find_tree_e);
		if (splay_found_entry) {
			/* Match, no need to report again. */
			goto DONE_LOCKED;
		}
	}

	/*
	 * If we haven't hit any disqualifying conditions, this means we have a new
	 * entry which needs to be enqueued for reporting.
	 */
	record_accepted = rsb_enqueue_locked(rsb_e);
	should_flush_submission_buffer = record_accepted && !rsb_is_draining;

	if (record_accepted) {
		/* We've handled the record, so mark it for debouncing */
		trap_debounce_buffer_enter(&rsb_e->record);
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&trap_telemetry_reported_events, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	} else {
		/*
		 * Failed to enqueue. Since we have no better options, drop the event.
		 */
#if DEVELOPMENT || DEBUG
		os_atomic_inc(&trap_telemetry_capacity_dropped_events, relaxed);
#endif /* DEVELOPMENT || DEBUG */
	}

DONE_LOCKED:
	safe_telemetry_lock_unlock();

	if (should_flush_submission_buffer &&
	    startup_phase >= STARTUP_SUB_THREAD_CALL) {
		/*
		 * We submitted a new entry while the drain thread was either exiting or
		 * not running. Queue a new flush. Multiple calls here before the drain
		 * starts running will not result in multiple calls being queued due to
		 * THREAD_CALL_OPTIONS_ONCE.
		 */
		thread_call_enter(drain_record_submission_buffer_callout);
	}

	return record_accepted;
}

/**
 * Should a given trap be ignored/not reported?
 */
static bool
should_ignore_trap(
	trap_telemetry_type_t trap_type,
	uint64_t trap_code,
	trap_telemetry_options_s options)
{
	if (trap_telemetry_disable_all) {
		/* Telemetry is disabled, drop all events. */
		return true;
	}

	if ((options.telemetry_ca_event == TRAP_TELEMETRY_CA_EVENT_NONE ||
	    trap_telemetry_disable_ca) &&
	    !should_dump_trap(trap_type, trap_code)) {
		/* Trap won't be reported anywhere, so it can be dropped. */
		return true;
	}

	return false;
}

bool
trap_telemetry_report_exception(
	trap_telemetry_type_t trap_type,
	uint64_t trap_code,
	trap_telemetry_options_s options,
	void *saved_state)
{
	if (should_ignore_trap(trap_type, trap_code, options)) {
		/*
		 * Don't bother reporting the trap. Since this is not an error, report
		 * that we handled the trap as expected.
		 */
		return true;
	}

#if __arm64__
	arm_saved_state_t *state = (arm_saved_state_t *)saved_state;

	uintptr_t faulting_address = get_saved_state_pc(state);
	uintptr_t saved_fp = get_saved_state_fp(state);
#else
	x86_saved_state64_t *state = (x86_saved_state64_t *)saved_state;

	uintptr_t faulting_address = state->isf.rip;
	uintptr_t saved_fp = state->rbp;
#endif

	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)saved_fp,
	};

	rsb_entry_s submission_e = { 0 };
	submission_e.record.trap_type = trap_type;
	submission_e.record.trap_code = trap_code;
	submission_e.record.fault_pc = faulting_address;
	submission_e.options = options;
	submission_e.bt_frames_count = backtrace(
		submission_e.bt_frames, TRAP_TELEMETRY_BT_FRAMES, &ctl, NULL);

	return rsb_enqueue_if_needed(&submission_e);
}

__attribute__((noinline))
bool
trap_telemetry_report_simulated_trap(
	trap_telemetry_type_t trap_type,
	uint64_t trap_code,
	trap_telemetry_options_s options)
{
	if (should_ignore_trap(trap_type, trap_code, options)) {
		/*
		 * Don't bother reporting the trap. Since this is not an error, report
		 * that we did handle the trap as expected.
		 */
		return true;
	}

	/*
	 * We want to provide a backtrace as if a trap occurred at the callsite of
	 * the simulated trap. Doing this safely is somewhat awkward as
	 * __builtin_frame_address with a non-zero argument can itself fault (if our
	 * caller's frame pointer is invalid), so instead we take a backtrace
	 * starting in our own frame and chop it up as expected.
	 */

	const size_t frames_count = TRAP_TELEMETRY_BT_FRAMES + 1;
	uintptr_t frames[frames_count];

	struct backtrace_control ctl = {
		.btc_frame_addr = (uintptr_t)__builtin_frame_address(0),
	};

	size_t frames_valid_count = backtrace(frames, frames_count, &ctl, NULL);
	if (frames_valid_count) {
		/*
		 * Take the first backtrace entry as the fault address and then place
		 * all other entries into the backtrace. The first backtrace entry is
		 * our caller (due to the noinline attribute), which gives us the fault
		 * address as the call site (as desired).
		 */
		return trap_telemetry_report_simulated_trap_with_backtrace(
			trap_type,
			trap_code,
			options,
			/* fault_pc */ frames[0],
			/* frames */ frames + 1,
			/* frames_valid_count */ frames_valid_count - 1);
	} else {
		/* Failed to take a backtrace? Report just the return address then. */
		return trap_telemetry_report_simulated_trap_with_backtrace(
			trap_type,
			trap_code,
			options,
			/* fault_pc */ (uintptr_t)__builtin_return_address(0),
			/* frames */ NULL,
			/* frames_valid_count */ 0);
	}
}

bool
trap_telemetry_report_simulated_trap_with_backtrace(
	trap_telemetry_type_t trap_type,
	uint64_t trap_code,
	trap_telemetry_options_s options,
	uintptr_t fault_pc,
	uintptr_t *frames,
	size_t frames_valid_count)
{
	if (should_ignore_trap(trap_type, trap_code, options)) {
		/*
		 * Don't bother reporting the trap. Since this is not an error, report
		 * that we did handle the trap as expected.
		 */
		return true;
	}

	rsb_entry_s submission_e = { 0 };
	submission_e.record.trap_type = trap_type;
	submission_e.record.trap_code = trap_code;
	submission_e.options = options;

	/* Only copy up to TRAP_TELEMETRY_BT_FRAMES frames */
	if (frames_valid_count >= TRAP_TELEMETRY_BT_FRAMES) {
		frames_valid_count = TRAP_TELEMETRY_BT_FRAMES;
	}

	submission_e.bt_frames_count = frames_valid_count;
	submission_e.record.fault_pc = fault_pc;

	memcpy(submission_e.bt_frames, frames, frames_valid_count * sizeof(*frames));

	return rsb_enqueue_if_needed(&submission_e);
}

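#if 0
/*
 * Illustrative usage sketch (not compiled): how a hypothetical client might
 * report a recoverable condition through this module. The trap type and code
 * values below are placeholders; real callers pass the constants defined for
 * their trap class in trap_telemetry.h.
 */
static void
example_report_recoverable_condition(void)
{
	trap_telemetry_options_s options = {
		/* Report each unique (pc, type, code) site at most once */
		.report_once_per_site = true,
		/* Route the event to the internal CoreAnalytics format */
		.telemetry_ca_event = TRAP_TELEMETRY_CA_EVENT_INTERNAL,
	};

	/*
	 * The fault PC attributed to the event will be this call site because
	 * trap_telemetry_report_simulated_trap is noinline and takes its first
	 * backtrace entry as the fault address.
	 */
	(void)trap_telemetry_report_simulated_trap(
		(trap_telemetry_type_t)0 /* placeholder type */,
		0x1 /* placeholder code */,
		options);
}
#endif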