1 /*
2 * Copyright (c) 2024 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #include <kern/trap_telemetry.h>
30 #include <libkern/coreanalytics/coreanalytics.h>
31 #include <kern/percpu.h>
32 #include <libkern/tree.h>
33 #include <kern/locks.h>
34 #include <kern/thread_call.h>
35 #include <kern/kalloc.h>
36 #include <kern/cpu_data.h>
37 #include <kern/telemetry.h>
38 #include <kern/assert.h>
39 #include <kern/backtrace.h>
40 #include <machine/machine_routines.h>
41 #include <libkern/OSKextLibPrivate.h>
42 #include <libkern/kernel_mach_header.h>
43 #if __arm64__
44 #include <pexpert/arm64/platform.h>
45 #endif
46
47 #define TAG "[trap_telemetry] "
48
49 /* ~* Module Configuration *~ */
50
51 /**
52 * Maximum number of backtrace frames to attempt to report.
53 *
54 * Some reporting destinations may use fewer frames than this due to
55 * encoding/space restrictions.
56 */
57 #define TRAP_TELEMETRY_BT_FRAMES (15)
58
59 /** Static length of various CA telemetry event's backtrace string */
60 #define TRAP_TELEMETRY_BT_STR_LEN CA_UBSANBUF_LEN
61
62 /**
63 * Entry count of the RSB.
64 *
65 * Larger sizes support a higher event volume/can help avoid dropping events
66 * under load.
67 */
68 #define RECORD_SUBMISSION_BUFFER_LENGTH (16)
69
70 /** Number of last events per-CPU to remember and reject. */
71 #define DEBOUNCE_RECORD_COUNT (2)
72
73 /** Length of the kernel_platform string (eg t8132). */
74 #define KERNEL_PLATFORM_STR_LEN 12
75
76 /**
77 * When true, trap telemetry will not report events to CoreAnalytics.
78 *
79 * Local reporting (via trap_telemetry_dump_event) is not impacted.
80 */
81 static TUNABLE(bool, trap_telemetry_disable_ca, "trap_telemetry_disable_ca", false);
82
83 /**
84 * Disable all trap telemetry reporting (including local reporting)
85 */
86 static TUNABLE(bool, trap_telemetry_disable_all, "trap_telemetry_disable_all", false);
87
88 /**
89 * Print matching events to the console. Set to -1 to disable.
90 * Setting type but disabling code will match all codes of the given type.
91 */
92 static TUNABLE(uint32_t, trap_telemetry_dump_type, "trap_telemetry_dump_type",
93 -1);
94 static TUNABLE(uint64_t, trap_telemetry_dump_code, "trap_telemetry_dump_code",
95 -1);
96
97 /* ~* Data Structures *~ */
98
99 typedef struct match_record {
100 /** Slid address at which the exception was thrown */
101 uintptr_t fault_pc;
102
103 /** The trap type or "class" for the record. */
104 trap_telemetry_type_t trap_type;
105
106 /** The trap code disambiguates traps within a class. */
107 uint64_t trap_code;
108 } match_record_s;
109
110 typedef struct rsb_entry {
111 match_record_s record;
112 trap_telemetry_extra_data_u extra_data;
113 trap_telemetry_options_s options;
114 size_t bt_frames_count;
115 uintptr_t bt_frames[TRAP_TELEMETRY_BT_FRAMES];
116 } rsb_entry_s;
117
118 typedef struct trap_telemetry_tree_entry {
119 SPLAY_ENTRY(trap_telemetry_tree_entry) link;
120 match_record_s record;
121 } trap_telemetry_tree_entry_s;
122
123 typedef struct trap_debounce_buffer {
124 /**
125 * Storage array for trap records used to debounce.
126 *
127 * We don't have valid bits for entries but rather use zero to implicitly
128 * indicate an invalid entry (as they should never naturally match any real
129 * trap).
130 */
131 match_record_s records[DEBOUNCE_RECORD_COUNT];
132
133 /** The index of the entry to replace next (LIFO) */
134 size_t tail;
135 } trap_debounce_buffer_s;
136
137 /* ~* Core Analytics *~ */
138 CA_EVENT(kernel_breakpoint_event,
139 CA_INT, brk_type,
140 CA_INT, brk_code,
141 CA_INT, faulting_address,
142 CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
143 CA_STATIC_STRING(CA_UUID_LEN), uuid);
144
145 CA_EVENT(trap_telemetry_internal,
146 CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
147 CA_STATIC_STRING(KERNEL_PLATFORM_STR_LEN), kernel_platform,
148 CA_INT, trap_code,
149 CA_INT, trap_offset,
150 CA_INT, trap_type,
151 CA_STATIC_STRING(CA_UUID_LEN), trap_uuid);
152
153 CA_EVENT(latency_violations,
154 CA_STATIC_STRING(TRAP_TELEMETRY_BT_STR_LEN), backtrace,
155 CA_STATIC_STRING(KERNEL_PLATFORM_STR_LEN), kernel_platform,
156 CA_STATIC_STRING(CA_UUID_LEN), uuid,
157 CA_INT, violation_code,
158 CA_INT, violation_cpi,
159 CA_STATIC_STRING(2), violation_cpu_type,
160 CA_INT, violation_duration,
161 CA_INT, violation_freq,
162 CA_INT, violation_payload,
163 CA_INT, violation_threshold);
164
165 /* ~* Splay tree *~ */
166 static int
match_record_compare(match_record_s * r1,match_record_s * r2)167 match_record_compare(match_record_s *r1,
168 match_record_s *r2)
169 {
170 if (r1->fault_pc < r2->fault_pc) {
171 return 1;
172 } else if (r1->fault_pc > r2->fault_pc) {
173 return -1;
174 }
175
176 if (r1->trap_type < r2->trap_type) {
177 return 1;
178 } else if (r1->trap_type > r2->trap_type) {
179 return -1;
180 }
181
182 if (r1->trap_code < r2->trap_code) {
183 return 1;
184 } else if (r1->trap_code > r2->trap_code) {
185 return -1;
186 }
187
188 /* Records match */
189 return 0;
190 }
191
192 static int
trap_telemetry_tree_entry_compare(trap_telemetry_tree_entry_s * r1,trap_telemetry_tree_entry_s * r2)193 trap_telemetry_tree_entry_compare(trap_telemetry_tree_entry_s *r1,
194 trap_telemetry_tree_entry_s *r2)
195 {
196 return match_record_compare(&r1->record, &r2->record);
197 }
198
199 SPLAY_HEAD(trap_telemetry_tree, trap_telemetry_tree_entry);
200 /* These functions generated by SPLAY_PROTOTYPE but are currently unused */
201 __unused static struct trap_telemetry_tree_entry *
202 trap_telemetry_tree_SPLAY_NEXT(struct trap_telemetry_tree *head,
203 struct trap_telemetry_tree_entry *elm);
204 __unused static struct trap_telemetry_tree_entry *
205 trap_telemetry_tree_SPLAY_SEARCH(struct trap_telemetry_tree *head,
206 struct trap_telemetry_tree_entry *elm);
207 __unused static struct trap_telemetry_tree_entry *
208 trap_telemetry_tree_SPLAY_MIN_MAX(struct trap_telemetry_tree *head, int val);
209 SPLAY_PROTOTYPE(trap_telemetry_tree,
210 trap_telemetry_tree_entry,
211 link,
212 trap_telemetry_tree_entry_compare);
213 SPLAY_GENERATE(trap_telemetry_tree,
214 trap_telemetry_tree_entry,
215 link,
216 trap_telemetry_tree_entry_compare);
217
218 /* ~* Globals *~ */
219 /* Lock which protects the event submission queue */
220 static LCK_GRP_DECLARE(trap_telemetry_lock_grp, "trap_telemetry_lock");
221 static LCK_SPIN_DECLARE(trap_telemetry_lock, &trap_telemetry_lock_grp);
222
223 /*
224 * Since traps are, naturally, caught in an exception context, it is not safe to
225 * allocate. To solve this, we use a short submission ring buffer which collects
226 * records for processing on a submission thread (which can allocate).
227 *
228 * This ring buffer and all its associated control fields are locked by
229 * TRAP_TELEMETRY_LOCK.
230 */
231 static rsb_entry_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH];
232 static size_t rsb_rd_idx;
233 static size_t rsb_wr_idx;
234 static size_t rsb_count;
235 static bool rsb_is_draining;
236
237 /**
238 * For deduplication, we store hit records in a splay tree.
239 * We use a splay here for performance reasons since traps tend to exhibit a
240 * degree of temporal locality.
241 */
242 static struct trap_telemetry_tree telemetry_splay_tree;
243
244 /**
245 * Flag indicating whether this CPU is currently trying to acquire the telemetry
246 * lock or has already acquired the lock.
247 * This is used as a deadlock avoidance mechanism.
248 */
249 static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked);
250
251 /**
252 * In order to avoid reporting the same event many times in quick succession
253 * (especially when report_once_per_site=false) and overwhelming both the trap
254 * telemetry module and CoreAnalytics, we "debounce" all events on a per-CPU
255 * basis. This is done through a buffer which tracks the LIFO
256 * DEBOUNCE_ENTRY_COUNT trap PCs.
257 */
258 static trap_debounce_buffer_s PERCPU_DATA(per_cpu_trap_debounce_buffer);
259
260 /**
261 * Thread which is responsible for clearing the submission buffer by submitting
262 * to CoreAnalytics and the local tree.
263 */
264 static struct thread_call *drain_record_submission_buffer_callout;
265
266 #if DEVELOPMENT || DEBUG
267 /**
268 * sysctl debug.trap_telemetry_reported_events
269 *
270 * Counts the number of events which were successfully reported (either locally
271 * or to CoreAnalytics). This does not include events which were ignored,
272 * debounced, or discarded as a duplicate.
273 */
274 unsigned long trap_telemetry_reported_events = 0;
275
276 /**
277 * sysctl debug.trap_telemetry_capacity_dropped_events
278 *
279 * Counts the number of events which, if not for the RSB being full, would have
280 * been reported successfully. Events in this count indicate telemetry loss.
281 */
282 unsigned long trap_telemetry_capacity_dropped_events = 0;
283 #endif /* DEVELOPMENT || DEBUG */
284
285 /* ~* Implementation *~ */
286
287 /**
288 * Try and acquire a spin lock in an interrupt-deadlock safe way.
289 *
290 * This function differs from the standard lck_spin_try_lock function in that it
291 * will block if the lock is expected to be acquired *eventually* but will not
292 * block if it detects that the lock will never be acquired (such as when the
293 * current CPU owns the lock, which can happen if a trap is taken while handling
294 * a telemetry operation under the lock).
295 */
296 static inline bool OS_WARN_RESULT
safe_telemetry_lock_try_lock(void)297 safe_telemetry_lock_try_lock(void)
298 {
299 uint8_t *telemetry_lock_blocked = NULL;
300
301 /*
302 * Disable preemption to ensure that our block signal always corresponds
303 * to the CPU we're actually running on.
304 *
305 * If we didn't disable preemption, there is a case where we may mark that
306 * we are trying to acquire the lock on core A, get approved, get preempted,
307 * get rescheduled on core B, and then take the lock there. If we then take
308 * another exception on core B while handling the original exception (ex. we
309 * take an IRQ and a telemetry exception is generated there), we may
310 * re-enter on core B, (incorrectly) see that we are not blocked, try to
311 * acquire the lock, and ultimately deadlock.
312 */
313 disable_preemption();
314
315 /*
316 * Since we are preemption disabled, we'll get the desired behavior even if
317 * we take a telemetry trap in the middle of this sequence because the
318 * interrupting context will never return here while holding the telemetry
319 * lock.
320 */
321 telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
322 if (*telemetry_lock_blocked) {
323 /*
324 * This CPU has already acquired/is blocked on the telemetry lock.
325 * Attempting to acquire again on this CPU will deadlock. Refuse the
326 * operation.
327 */
328 enable_preemption();
329 return false;
330 }
331
332 *telemetry_lock_blocked = 1;
333
334 /* We've been approved to acquire the lock on this core! */
335 lck_spin_lock(&trap_telemetry_lock);
336 return true;
337 }
338
339 /**
340 * Attempts to acquire the telemetry lock and panic if it cannot be acquired.
341 */
342 static void
safe_telemetry_lock_lock(void)343 safe_telemetry_lock_lock(void)
344 {
345 if (!safe_telemetry_lock_try_lock()) {
346 panic("Unexpectedly could not acquire telemetry lock "
347 "(nested acquire will deadlock)");
348 }
349 }
350
351 /**
352 * Unlock telemetry lock after being locked with safe_telemetry_lock_try_lock
353 */
354 static inline void
safe_telemetry_lock_unlock(void)355 safe_telemetry_lock_unlock(void)
356 {
357 uint8_t *telemetry_lock_blocked = NULL;
358
359 lck_spin_unlock(&trap_telemetry_lock);
360
361 /*
362 * Clear the block only AFTER having dropped the lock so that we can't
363 * hit a really narrow deadlock race where we get interrupted between
364 * clearing the block and dropping the lock.
365 */
366 telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
367 os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed);
368
369 /* Finally, reenable preemption as this thread is now safe to move */
370 enable_preemption();
371 }
372
373 /**
374 * Enqueue SRC into the record submission buffer.
375 * Returns TRUE if successful, false otherwise.
376 * TRAP_TELEMETRY_LOCK must be held during this operation.
377 */
378 static bool
rsb_enqueue_locked(rsb_entry_s * rsb_e)379 rsb_enqueue_locked(rsb_entry_s *rsb_e)
380 {
381 if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) {
382 /* We're full. */
383 return false;
384 }
385
386 /* Write the new entry at the write head */
387 rsb_entry_s *dst = record_submission_buffer + rsb_wr_idx;
388 *dst = *rsb_e;
389
390 /* Update pointers */
391 rsb_count += 1;
392 rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH;
393
394 return true;
395 }
396
397 /**
398 * Enter RECORD into this CPU's debounce buffer, thereby preventing it from
399 * being reported again until it falls off. Records are removed from the
400 * debounce buffer automatically as newer records are inserted.
401 */
402 static bool
trap_debounce_buffer_enter(match_record_s * record)403 trap_debounce_buffer_enter(match_record_s *record)
404 {
405 trap_debounce_buffer_s *debounce = NULL;
406 bool match = false;
407
408 /*
409 * Since we don't lock the debounce buffers and instead rely on them being
410 * per-CPU for synchronization, we need to disable preemption to ensure that
411 * we only access the correct debounce buffer.
412 */
413 disable_preemption();
414 debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);
415
416 /*
417 * Enter the record.
418 * We do this by overwriting the oldest entry, which naturally gives us a
419 * LIFO replacement policy.
420 */
421 debounce->records[debounce->tail] = *record;
422 debounce->tail = (debounce->tail + 1) % DEBOUNCE_RECORD_COUNT;
423
424 enable_preemption();
425
426 return match;
427 }
428
429
430 /**
431 * Search for RECORD in the per-CPU debounce buffer.
432 *
433 * This is useful for determining if a trap has triggered recently.
434 */
435 static bool
trap_debounce_buffer_has_match(match_record_s * record)436 trap_debounce_buffer_has_match(match_record_s *record)
437 {
438 trap_debounce_buffer_s *debounce = NULL;
439 bool match = false;
440
441 disable_preemption();
442 debounce = PERCPU_GET(per_cpu_trap_debounce_buffer);
443
444 for (size_t i = 0; i < DEBOUNCE_RECORD_COUNT; i++) {
445 if (match_record_compare(debounce->records + i, record) == 0) {
446 match = true;
447 break;
448 }
449 }
450
451 enable_preemption();
452
453 return match;
454 }
455
456 /**
457 * Should the given trap be dumped to the console for debug?
458 */
459 static inline bool
should_dump_trap(trap_telemetry_type_t trap_type,uint64_t trap_code)460 should_dump_trap(
461 trap_telemetry_type_t trap_type,
462 uint64_t trap_code)
463 {
464 if (trap_telemetry_dump_type == -1 /* type match disabled */ ||
465 trap_telemetry_dump_type != (uint32_t)trap_type) {
466 /* No match on type */
467 return false;
468 }
469
470 if (trap_telemetry_dump_code != -1 /* code match is enabled */ &&
471 /* but it doesn't match the trap code */
472 trap_telemetry_dump_code != trap_code) {
473 return false;
474 }
475
476 /* Matching type and, if applicable, code. */
477 return true;
478 }
479
480 /**
481 * Get the UUID and __TEXT_EXEC based offset of ADDR into its respective binary
482 * image. Caller is not responsible for managing the the UUID
483 * memory (i.e. it is not owned by the caller).
484 *
485 * Returns negative on error.
486 *
487 * Acquires a sleeping lock, do not call while interrupts are disabled.
488 */
489 static int
get_uuid_and_text_offset_for_addr(uintptr_t addr,uuid_t ** uuid_out,uint64_t * offset_out)490 get_uuid_and_text_offset_for_addr(
491 uintptr_t addr, uuid_t **uuid_out, uint64_t *offset_out)
492 {
493 kernel_mach_header_t *mh = NULL;
494 kernel_segment_command_t *seg_text = NULL;
495 void *mh_uuid = NULL;
496 unsigned long mh_uuid_len = 0;
497 #if __arm64__
498 const char *text_segment_label = "__TEXT_EXEC";
499 #else
500 const char *text_segment_label = "__TEXT";
501 #endif
502
503 if (!(mh = OSKextKextForAddress((void *)addr))) {
504 return -1;
505 }
506
507 if (!(seg_text = getsegbynamefromheader(mh, text_segment_label))) {
508 return -2;
509 }
510
511 if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) {
512 return -3;
513 }
514
515 if (mh_uuid_len != sizeof(**uuid_out)) {
516 return -4;
517 }
518
519 *uuid_out = (uuid_t *)(mh_uuid);
520 *offset_out = addr - seg_text->vmaddr;
521
522 return 0;
523 }
524
525 /**
526 * If it does not already exist, inserts UUID into UUID_CACHE (described by
527 * CACHE_LEN). In either case, return the index of the UUID in the cache through
528 * *IDX_OUT and set *IS_NEW_OUT if UUID was inserted.
529 *
530 */
531 static void
uuid_cache_get_or_insert(uuid_t * uuid,uuid_t ** uuid_cache,size_t cache_len,uint32_t * idx_out,bool * is_new_out)532 uuid_cache_get_or_insert(uuid_t *uuid, uuid_t **uuid_cache, size_t cache_len,
533 uint32_t *idx_out, bool *is_new_out)
534 {
535 for (uint32_t i = 0; i < cache_len; i++) {
536 if (uuid_cache[i] == uuid) {
537 /* Hit on existing entry */
538 *idx_out = i;
539 *is_new_out = false;
540 return;
541 } else if (uuid_cache[i] == NULL) {
542 /*
543 * Reached the end of the valid entries without finding our UUID.
544 * Insert it now.
545 */
546 uuid_cache[i] = uuid;
547 *idx_out = i;
548 *is_new_out = true;
549 return;
550 }
551
552 /* No match yet, but there might be more entries. Keep going. */
553 }
554
555 /*
556 * We didn't find the UUID but we also couldn't insert it because we never
557 * found a free space. This shouldn't happen if the UUID cache is correctly
558 * sized.
559 */
560 panic("Could not find UUID in cache but cache was full");
561 }
562
563 /**
564 * Convert an array of backtrace addresses in FRAMES into an offset backtrace
565 * string in BUF.
566 *
567 * This backtrace scheme has records deliminated by newline characters. Each
568 * record is either a backtrace entry or a UUID entry. A backtrace entry is
569 * identified by the presence of an `@` character in the record. Any other
570 * record is a UUID entry.
571 *
572 * Example:
573 *
574 * 14760@0\n
575 * 2B417DFA-7964-3EBF-97EE-FC94D26FFABD\n
576 * 9f18@1\n
577 * F9EFB7CA-8F23-3990-8E57-A7DAD698D494\n
578 * 87c974@2\n
579 * 8686ED81-CAA9-358D-B162-1F2F97334C65\n
580 * 87cce4@2\n
581 * 874f64@2\n
582 *
583 * Structurally, this example is equivalent to:
584 *
585 * <text offset>@<uuid entry idx=0>\n
586 * <uuid entry 0>\n
587 * <text offset>@<uuid entry idx=1>\n
588 * <uuid entry 1>\n
589 * <text offset>@<uuid entry idx=2>\n
590 * <uuid entry 2>\n
591 * <text offset>@<uuid entry idx=2>\n
592 * <text offset>@<uuid entry idx=2>\n
593 *
594 * The first record here is a backtrace entry. Backtrace entries encode program
595 * location as a hex offset into the __TEXT/__TEXT_EXEC segment of the enclosing
596 * binary. The enclosing binary is identified by a hex encoded, zero-indexed
597 * UUID entry ID which follows after the `@` in a backtrace entry.
598 *
599 * The second record is a UUID entry. UUID entries are simply records which
600 * contain nothing but the UUID. UUID entries are implicitly assigned IDs,
601 * starting from zero, in the order they appear in the record stream. Entries
602 * may be referenced before they are used.
603 *
604 * Given a 256 byte buffer, we can fit up to ten backtrace entries (assuming
605 * each binary is no larger than 256MB and we have no more than four unique
606 * UUIDs in the backtrace).
607 *
608 * If the encoder runs out of space (for example, because we have more than four
609 * unique UUIDs), the later records will truncate abruptly. In order to provide
610 * as much information as possible, UUIDs are encoded immediately after they are
611 * used. This means that if the encoder does run out of space, all backtrace
612 * entries but the last will always decode correctly.
613 */
614 static void
backtrace_to_offset_bt_string(char * buf,size_t buf_len,const uintptr_t * frames,size_t frames_len)615 backtrace_to_offset_bt_string(
616 char *buf,
617 size_t buf_len,
618 const uintptr_t *frames,
619 size_t frames_len)
620 {
621 size_t written = 0;
622 const size_t uuid_cache_count = TRAP_TELEMETRY_BT_FRAMES;
623 /*
624 * The UUID cache relies on NULL entries to represent free slots, so clear
625 * it before use.
626 */
627 uuid_t *uuid_cache[uuid_cache_count] = {0};
628 assert(frames_len <= uuid_cache_count);
629
630 /* Add all frames and store unique UUIDs into the cache */
631 for (size_t frame_i = 0; frame_i < frames_len; frame_i++) {
632 uuid_t *uuid = NULL;
633 uint64_t offset = 0;
634
635 if (get_uuid_and_text_offset_for_addr(
636 frames[frame_i], &uuid, &offset) == 0) {
637 /* Success! Insert (or reuse) the UUID and then print the entry. */
638 uint32_t uuid_i;
639 bool is_new;
640 uuid_cache_get_or_insert(
641 uuid, uuid_cache, uuid_cache_count,
642 &uuid_i, &is_new);
643
644 /* Write backtrace record */
645 written += scnprintf(buf + written, buf_len - written,
646 "%llx@%x\n",
647 offset, uuid_i);
648
649 /* Write UUID record, if needed. */
650 if (is_new) {
651 uuid_string_t uuid_str;
652 uuid_unparse(*uuid, uuid_str);
653
654 written += scnprintf(buf + written, buf_len - written,
655 "%s\n",
656 uuid_str);
657 }
658 } else {
659 /*
660 * Could not find an image for the target?
661 * Just return the offset into the executable region with an error
662 * UUID ref as it's better than nothing.
663 */
664 written += scnprintf(buf + written, buf_len - written,
665 "%lx@!\n",
666 frames[frame_i] - vm_kernel_stext);
667 }
668 }
669 }
670
671
672 /**
673 * Print RSB_E to the console in a human friendly way.
674 */
675 static void
rsb_entry_dump(rsb_entry_s * rsb_e)676 rsb_entry_dump(rsb_entry_s *rsb_e)
677 {
678 printf(TAG "Triggered trap at PC=0x%08lx "
679 "(type=%u, code=0x%04llx). Backtrace:\n",
680 rsb_e->record.fault_pc,
681 (uint32_t)rsb_e->record.trap_type, rsb_e->record.trap_code);
682
683 for (size_t frame_i = 0; frame_i < rsb_e->bt_frames_count; frame_i++) {
684 printf(TAG "\t0x%08lx\n", rsb_e->bt_frames[frame_i]);
685 }
686 }
687
688 /**
689 * Submit RSB_E to CoreAnalytics (or another backing event provider as
690 * appropriate).
691 */
692 static void
rsb_entry_submit(rsb_entry_s * rsb_e)693 rsb_entry_submit(rsb_entry_s *rsb_e)
694 {
695 trap_telemetry_options_s options = rsb_e->options;
696
697 bool matched_dump_bootarg = should_dump_trap(
698 rsb_e->record.trap_type, rsb_e->record.trap_code);
699 if (matched_dump_bootarg) {
700 rsb_entry_dump(rsb_e);
701 }
702
703 ca_event_t ca_event = NULL;
704 switch (options.telemetry_ca_event) {
705 case TRAP_TELEMETRY_CA_EVENT_NONE: {
706 /*
707 * Unless the event matches the dump boot-arg, we should never see
708 * unreported events in the backend. Instead, we expect these events
709 * to be dropped in the frontend without ever being submitted.
710 */
711 assert(matched_dump_bootarg);
712 break;
713 }
714
715 case TRAP_TELEMETRY_CA_EVENT_KERNEL_BRK: {
716 ca_event = CA_EVENT_ALLOCATE(kernel_breakpoint_event);
717 CA_EVENT_TYPE(kernel_breakpoint_event) * event = ca_event->data;
718
719 /*
720 * The BRK telemetry format is somewhat less dense, so to avoid
721 * truncating (and to maintain the historical backtrace count) report
722 * five or fewer frames.
723 */
724 uint32_t reported_bt_count =
725 MIN((uint32_t)rsb_e->bt_frames_count, 5);
726 telemetry_backtrace_to_string(
727 /* buf */ event->backtrace,
728 /* buf_size */ TRAP_TELEMETRY_BT_STR_LEN,
729 /* tot */ reported_bt_count,
730 /* frames */ rsb_e->bt_frames);
731
732 event->brk_type = (uint32_t)rsb_e->record.trap_type;
733 event->brk_code = (uint64_t)rsb_e->record.trap_code;
734 event->faulting_address = rsb_e->record.fault_pc - vm_kernel_stext;
735 strlcpy(event->uuid, kernel_uuid_string, CA_UUID_LEN);
736 break;
737 }
738
739 case TRAP_TELEMETRY_CA_EVENT_INTERNAL: {
740 int result;
741 uuid_t *uuid = NULL;
742 uint64_t offset = 0;
743
744 ca_event = CA_EVENT_ALLOCATE(trap_telemetry_internal);
745 CA_EVENT_TYPE(trap_telemetry_internal) * event = ca_event->data;
746
747 backtrace_to_offset_bt_string(
748 /* buf */ event->backtrace,
749 /* buf_len */ TRAP_TELEMETRY_BT_STR_LEN,
750 rsb_e->bt_frames,
751 rsb_e->bt_frames_count);
752
753 #if __arm64__
754 /*
755 * We want the value of ARM64_SOC_NAME define as a string, so we need
756 * to do a two level indirection of macros to get to it.
757 */
758 #define tostr(s) __STRINGIFY(s)
759 strlcpy(event->kernel_platform, tostr(ARM64_SOC_NAME), KERNEL_PLATFORM_STR_LEN);
760 #undef tostr
761 #endif
762
763 /*
764 * Internal events report the UUID of the binary containing the
765 * fault PC and offset of the fault PC into the executable region of
766 * that binary (__TEXT_EXEC).
767 */
768 if ((result = get_uuid_and_text_offset_for_addr(
769 rsb_e->record.fault_pc, &uuid, &offset)) == 0) {
770 /* Success! */
771 event->trap_offset = offset;
772 uuid_unparse(*uuid, event->trap_uuid);
773 } else {
774 /*
775 * We couldn't get the required data for symbolication for some
776 * odd reason.
777 * Report the offset into the executable region and the error as
778 * the UUID instead.
779 */
780 event->trap_offset = rsb_e->record.fault_pc - vm_kernel_stext;
781 (void)scnprintf(event->trap_uuid, CA_UUID_LEN, "error:%d\n",
782 result);
783 }
784
785 event->trap_type = (uint32_t)rsb_e->record.trap_type;
786 event->trap_code = rsb_e->record.trap_code;
787
788 break;
789 }
790
791 case TRAP_TELEMETRY_CA_EVENT_LATENCY: {
792 ca_event = CA_EVENT_ALLOCATE(latency_violations);
793 CA_EVENT_TYPE(latency_violations) * event = ca_event->data;
794
795 backtrace_to_offset_bt_string(
796 /* buf */ event->backtrace,
797 /* buf_len */ TRAP_TELEMETRY_BT_STR_LEN,
798 rsb_e->bt_frames,
799 rsb_e->bt_frames_count);
800
801 #if __arm64__
802 /*
803 * We want the value of ARM64_SOC_NAME define as a string, so we need
804 * to do a two level indirection of macros to get to it.
805 */
806 #define tostr(s) __STRINGIFY(s)
807 strlcpy(event->kernel_platform, tostr(ARM64_SOC_NAME), KERNEL_PLATFORM_STR_LEN);
808 #undef tostr
809 #endif
810 strlcpy(event->uuid, kernel_uuid_string, CA_UUID_LEN);
811 (void)scnprintf(event->violation_cpu_type, 2, "%c",
812 rsb_e->extra_data.latency_data.violation_cpu_type);
813
814 event->violation_code = (uint32_t)rsb_e->record.trap_type;
815 event->violation_cpi = rsb_e->extra_data.latency_data.violation_cpi;
816 event->violation_freq = rsb_e->extra_data.latency_data.violation_freq;
817 event->violation_duration = rsb_e->extra_data.latency_data.violation_duration;
818 event->violation_threshold = rsb_e->extra_data.latency_data.violation_threshold;
819 event->violation_payload = rsb_e->extra_data.latency_data.violation_payload;
820
821 break;
822 }
823 default: {
824 panic("Unexpected telemetry CA event: %u\n",
825 options.telemetry_ca_event);
826 }
827 }
828
829 if (ca_event) {
830 CA_EVENT_SEND(ca_event);
831 }
832 }
833
834 /**
835 * Thread call which drains the record submission buffer.
836 * There must be no more than one instance of this thread running at a time.
837 */
838 static void
drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,__unused thread_call_param_t p1)839 drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,
840 __unused thread_call_param_t p1)
841 {
842 size_t drain_count = 0;
843 size_t drain_rd_idx = 0;
844 trap_telemetry_tree_entry_s *tree_records[RECORD_SUBMISSION_BUFFER_LENGTH];
845
846 /*
847 * We never expect for the submission thread to be scheduled while another
848 * thread which is attempting to enqueue is suspended above it (acquiring
849 * disables preemption) or while another submission thread is suspended
850 * above it (only one submission thread should ever be running).
851 *
852 * Thus, failing to acquire the lock anywhere in this function indicates
853 * that something is seriously wrong.
854 */
855 safe_telemetry_lock_lock();
856
857 /*
858 * If we're already draining, that means we either forgot to update
859 * rsb_is_draining or we have another thread draining (which should never
860 * happen).
861 */
862 assert(!rsb_is_draining);
863 rsb_is_draining = true;
864
865 /*
866 * Iteratively drain the submission queue until no entries remain.
867 * Drops and reacquires the telemetry lock.
868 */
869 while ((drain_count = rsb_count)) {
870 /* LOCKED IN */
871 drain_rd_idx = rsb_rd_idx;
872 safe_telemetry_lock_unlock();
873
874 /*
875 * It is safe to read these entries based on snapshots of DRAIN_COUNT
876 * and DRAIN_RD_IDX without holding the lock because all of the records'
877 * writes will have already become visible due to the lock's store
878 * release on the enqueue side. RSB entries are guaranteed to survive
879 * even when we aren't holding the lock so long as DRAIN_RD_IDX doesn't
880 * pass them. Since we are the only agent updating it, if we sequence
881 * the DRAIN_RD_IDX write after, we're fine.
882 *
883 * We may miss some records in this pass if other CPUs enqueue after the
884 * snapshot but we'll just pick them up in the next loop iteration.
885 * Additionally, since only one instance of this function will be
886 * running at a time, we don't need to worry about duplicate
887 * allocations/work.
888 */
889
890 for (size_t i = 0; i < drain_count; i++) {
891 size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
892 rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;
893
894 /* Finish processing the entry and submit it as needed. */
895 rsb_entry_submit(rsb_e);
896
897 if (rsb_e->options.report_once_per_site) {
898 /*
899 * Though we don't insert it yet since we aren't holding the
900 * lock, create our tree record from the RSB entry.
901 */
902 trap_telemetry_tree_entry_s *new_tree_record = kalloc_type(
903 trap_telemetry_tree_entry_s, Z_WAITOK | Z_NOFAIL);
904
905 new_tree_record->record = rsb_e->record;
906 tree_records[i] = new_tree_record;
907 } else {
908 tree_records[i] = NULL;
909 }
910 }
911
912 safe_telemetry_lock_lock();
913 /* Insert draining entries into the splay as needed */
914 for (size_t i = 0; i < drain_count; i++) {
915 size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
916 rsb_entry_s *rsb_e = record_submission_buffer + rsb_i;
917
918 if (rsb_e->options.report_once_per_site) {
919 trap_telemetry_tree_entry_s *duplicate = SPLAY_INSERT(
920 trap_telemetry_tree,
921 &telemetry_splay_tree,
922 tree_records[i]);
923
924 /*
925 * Since we scan both the RSB and the splay tree before
926 * submitting a report once record, we structurally should never
927 * have multiple instances of any such record.
928 */
929 (void)duplicate;
930 assert(!duplicate);
931 }
932 }
933
934 /* Dequeue the submitted entries from the RSB */
935 rsb_rd_idx =
936 (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH;
937 rsb_count -= drain_count;
938 /* LOCKED OUT */
939 }
940
941 /* Done for now, if submitters have entries they'll need to call again. */
942 rsb_is_draining = false;
943 safe_telemetry_lock_unlock();
944 }
945
946 __startup_func
947 void
trap_telemetry_init(void)948 trap_telemetry_init(void)
949 {
950 printf(TAG "trap_telemetry_init\n");
951 SPLAY_INIT(&telemetry_splay_tree);
952
953 drain_record_submission_buffer_callout = thread_call_allocate_with_options(
954 drain_record_submission_buffer_thread_call, NULL,
955 THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);
956
957 if (!drain_record_submission_buffer_callout) {
958 panic("Failed to allocate drain callout!");
959 }
960
961 {
962 /* Ensure that all telemetry events can be encoded in the bitfield */
963 trap_telemetry_options_s opt = (trap_telemetry_options_s) {0};
964 uint8_t last_event = TRAP_TELEMETRY_CA_EVENT_COUNT - 1;
965 opt.telemetry_ca_event = last_event;
966 assert(opt.telemetry_ca_event == last_event);
967 }
968 }
969
970 /**
971 * Submit RSB_E to the record submission queue if it needs to be submitted.
972 * Returns TRUE if the record was accepted (either enqueued or dupe'd), FALSE
973 * otherwise.
974 */
975 static bool
rsb_enqueue_if_needed(rsb_entry_s * rsb_e)976 rsb_enqueue_if_needed(rsb_entry_s *rsb_e)
977 {
978 bool record_accepted = true;
979 bool should_flush_submission_buffer = false;
980 trap_telemetry_tree_entry_s *splay_found_entry = NULL;
981 trap_telemetry_tree_entry_s find_tree_e = {0};
982
983 if (trap_debounce_buffer_has_match(&rsb_e->record)) {
984 /* debounce dupe */
985 return true;
986 }
987
988 if (!safe_telemetry_lock_try_lock()) {
989 /*
990 * Failed to acquire the lock!
991 * We're likely in a nested exception. Since we can't safely do anything
992 * else with the record, just drop it.
993 */
994 return false;
995 }
996
997 if (rsb_e->options.report_once_per_site) {
998 /* First, scan the submission queue for matching, queued records */
999 for (size_t i = 0; i < rsb_count; i++) {
1000 size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
1001 rsb_entry_s *rsb_e_i = record_submission_buffer + rsb_i;
1002 if (match_record_compare(&rsb_e->record, &rsb_e_i->record) == 0) {
1003 /* Match, no need to report again. */
1004 goto DONE_LOCKED;
1005 }
1006 }
1007
1008 /* Next, try for a record in the splay */
1009 find_tree_e.record = rsb_e->record;
1010 splay_found_entry = SPLAY_FIND(trap_telemetry_tree,
1011 &telemetry_splay_tree,
1012 &find_tree_e);
1013 if (splay_found_entry) {
1014 /* Match, no need to report again. */
1015 goto DONE_LOCKED;
1016 }
1017 }
1018
1019
1020 /*
1021 * If we haven't hit any disqualifying conditions, this means we have a new
1022 * entry which needs to be enqueued for reporting.
1023 */
1024 record_accepted = rsb_enqueue_locked(rsb_e);
1025 should_flush_submission_buffer = record_accepted && !rsb_is_draining;
1026
1027 if (record_accepted) {
1028 /* We've handled the record, so mark it for debouncing */
1029 trap_debounce_buffer_enter(&rsb_e->record);
1030 #if DEVELOPMENT || DEBUG
1031 os_atomic_inc(&trap_telemetry_reported_events, relaxed);
1032 #endif /* DEVELOPMENT || DEBUG */
1033 } else {
1034 /*
1035 * Failed to enqueue. Since we have no better options, drop the event.
1036 */
1037 #if DEVELOPMENT || DEBUG
1038 os_atomic_inc(&trap_telemetry_capacity_dropped_events, relaxed);
1039 #endif /* DEVELOPMENT || DEBUG */
1040 }
1041
1042 DONE_LOCKED:
1043 safe_telemetry_lock_unlock();
1044
1045 if (should_flush_submission_buffer &&
1046 startup_phase >= STARTUP_SUB_THREAD_CALL) {
1047 /*
1048 * We submitted a new entry while the drain thread was either exiting or
1049 * not running. Queue a new flush. Multiple calls here before the drain
1050 * starts running will not result in multiple calls being queued due to
1051 * THREAD_CALL_OPTIONS_ONCE.
1052 */
1053 thread_call_enter(drain_record_submission_buffer_callout);
1054 }
1055
1056 return record_accepted;
1057 }
1058
1059 /**
1060 * Should a given trap be ignored/not reported?
1061 */
1062 static bool
should_ignore_trap(trap_telemetry_type_t trap_type,uint64_t trap_code,trap_telemetry_options_s options)1063 should_ignore_trap(
1064 trap_telemetry_type_t trap_type,
1065 uint64_t trap_code,
1066 trap_telemetry_options_s options)
1067 {
1068 if (trap_telemetry_disable_all) {
1069 /* Telemetry is disabled, drop all events. */
1070 return true;
1071 }
1072
1073 if ((options.telemetry_ca_event == TRAP_TELEMETRY_CA_EVENT_NONE ||
1074 trap_telemetry_disable_ca) &&
1075 !should_dump_trap(trap_type, trap_code)) {
1076 /* Trap won't be reported anywhere, so it can be dropped. */
1077 return true;
1078 }
1079
1080 return false;
1081 }
1082
1083 bool
trap_telemetry_report_exception(trap_telemetry_type_t trap_type,uint64_t trap_code,trap_telemetry_options_s options,void * saved_state)1084 trap_telemetry_report_exception(
1085 trap_telemetry_type_t trap_type,
1086 uint64_t trap_code,
1087 trap_telemetry_options_s options,
1088 void *saved_state)
1089 {
1090 if (should_ignore_trap(trap_type, trap_code, options)) {
1091 /*
1092 * Don't bother reporting the trap. Since this is not an error, report
1093 * that we handled the trap as expected.
1094 */
1095 return true;
1096 }
1097
1098 #if __arm64__
1099 arm_saved_state_t *state = (arm_saved_state_t *)saved_state;
1100
1101 uintptr_t faulting_address = get_saved_state_pc(state);
1102 uintptr_t saved_fp = get_saved_state_fp(state);
1103 #else
1104 x86_saved_state64_t *state = (x86_saved_state64_t *)saved_state;
1105
1106 uintptr_t faulting_address = state->isf.rip;
1107 uintptr_t saved_fp = state->rbp;
1108 #endif
1109
1110 struct backtrace_control ctl = {
1111 .btc_frame_addr = (uintptr_t)saved_fp,
1112 };
1113
1114 rsb_entry_s submission_e = { 0 };
1115 submission_e.record.trap_type = trap_type;
1116 submission_e.record.trap_code = trap_code;
1117 submission_e.record.fault_pc = faulting_address;
1118 submission_e.options = options;
1119 submission_e.bt_frames_count = backtrace(
1120 submission_e.bt_frames, TRAP_TELEMETRY_BT_FRAMES, &ctl, NULL);
1121
1122 return rsb_enqueue_if_needed(&submission_e);
1123 }
1124
1125 __attribute__((always_inline))
1126 static bool
trap_telemetry_report_simulated_trap_impl(trap_telemetry_type_t trap_type,uint64_t trap_code,trap_telemetry_extra_data_u * extra_data,trap_telemetry_options_s options)1127 trap_telemetry_report_simulated_trap_impl(
1128 trap_telemetry_type_t trap_type,
1129 uint64_t trap_code,
1130 trap_telemetry_extra_data_u *extra_data,
1131 trap_telemetry_options_s options)
1132 {
1133 if (should_ignore_trap(trap_type, trap_code, options)) {
1134 /*
1135 * Don't bother reporting the trap. Since this is not an error, report
1136 * that we did handle the trap as expected.
1137 */
1138 return true;
1139 }
1140
1141 /*
1142 * We want to provide a backtrace as if a trap ocurred at the callsite of
1143 * the simulated trap. Doing this safely is somewhat awkward as
1144 * __builtin_frame_address with a non-zero argument can itself fault (if our
1145 * callers frame pointer is invalid) so instead we take a backtrace starting
1146 * in our own frame and chop it up as expected.
1147 */
1148
1149 const size_t frames_count = TRAP_TELEMETRY_BT_FRAMES + 1;
1150 uintptr_t frames[frames_count];
1151
1152 struct backtrace_control ctl = {
1153 .btc_frame_addr = (uintptr_t)__builtin_frame_address(0),
1154 };
1155
1156 size_t frames_valid_count = backtrace(frames, frames_count, &ctl, NULL);
1157 if (frames_valid_count) {
1158 /*
1159 * Take the first backtrace entry as the fault address and then place
1160 * all other entries into the backtrace. The first backtrace is our
1161 * caller (due to the noinline attribute), which gives us the fault
1162 * address as the call site (as desired).
1163 */
1164 return trap_telemetry_report_simulated_trap_with_backtrace(
1165 trap_type,
1166 trap_code,
1167 options,
1168 extra_data,
1169 /* fault_pc */ frames[0],
1170 /* frames */ frames + 1,
1171 /* frames_valid_count */ frames_valid_count - 1);
1172 } else {
1173 /* Failed to take a backtrace? Report just the return address then. */
1174 return trap_telemetry_report_simulated_trap_with_backtrace(
1175 trap_type,
1176 trap_code,
1177 options,
1178 extra_data,
1179 /* fault_pc */ (uintptr_t)__builtin_return_address(0),
1180 /* frames */ NULL,
1181 /* frames_valid_count */ 0);
1182 }
1183 }
1184
1185 __attribute__((noinline))
1186 bool
trap_telemetry_report_simulated_trap(trap_telemetry_type_t trap_type,uint64_t trap_code,trap_telemetry_options_s options)1187 trap_telemetry_report_simulated_trap(
1188 trap_telemetry_type_t trap_type,
1189 uint64_t trap_code,
1190 trap_telemetry_options_s options)
1191 {
1192 return trap_telemetry_report_simulated_trap_impl(trap_type, trap_code, NULL, options);
1193 }
1194
1195 __attribute__((noinline))
1196 bool
trap_telemetry_report_latency_violation(trap_telemetry_type_t trap_type,trap_telemetry_latency_s latency_data)1197 trap_telemetry_report_latency_violation(
1198 trap_telemetry_type_t trap_type,
1199 trap_telemetry_latency_s latency_data)
1200 {
1201 return trap_telemetry_report_simulated_trap_impl(trap_type, 0,
1202 (trap_telemetry_extra_data_u *)&latency_data,
1203 (trap_telemetry_options_s) {
1204 .telemetry_ca_event = TRAP_TELEMETRY_CA_EVENT_LATENCY,
1205 .report_once_per_site = false
1206 });
1207 }
1208
1209 bool
trap_telemetry_report_simulated_trap_with_backtrace(trap_telemetry_type_t trap_type,uint64_t trap_code,trap_telemetry_options_s options,trap_telemetry_extra_data_u * extra_data,uintptr_t fault_pc,uintptr_t * frames,size_t frames_valid_count)1210 trap_telemetry_report_simulated_trap_with_backtrace(
1211 trap_telemetry_type_t trap_type,
1212 uint64_t trap_code,
1213 trap_telemetry_options_s options,
1214 trap_telemetry_extra_data_u *extra_data,
1215 uintptr_t fault_pc,
1216 uintptr_t *frames,
1217 size_t frames_valid_count)
1218 {
1219 if (should_ignore_trap(trap_type, trap_code, options)) {
1220 /*
1221 * Don't bother reporting the trap. Since this is not an error, report
1222 * that we did handle the trap as expected.
1223 */
1224 return true;
1225 }
1226
1227 rsb_entry_s submission_e = { 0 };
1228 submission_e.record.trap_type = trap_type;
1229 submission_e.record.trap_code = trap_code;
1230 if (extra_data != NULL) {
1231 submission_e.extra_data = *extra_data;
1232 }
1233 submission_e.options = options;
1234
1235 // only copy up to TRAP_TELEMETRY_BT_FRAMES frames
1236 if (frames_valid_count >= TRAP_TELEMETRY_BT_FRAMES) {
1237 frames_valid_count = TRAP_TELEMETRY_BT_FRAMES;
1238 }
1239
1240 submission_e.bt_frames_count = frames_valid_count;
1241 submission_e.record.fault_pc = fault_pc;
1242
1243 memcpy(submission_e.bt_frames, frames, frames_valid_count * sizeof(*frames));
1244
1245 return rsb_enqueue_if_needed(&submission_e);
1246 }
1247