xref: /xnu-11215.81.4/osfmk/arm64/bti_telemetry.c (revision d4514f0bc1d3f944c22d92e68b646ac3fb40d452)
/*
 * Copyright (c) 2022 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * BTI Telemetry is a debug feature intended to support a safer, more gradual
 * rollout of ARMv8.5's Branch Target Identification (BTI) in and across the
 * kernel. Telemetry mode converts normally fatal BTI exceptions into
 * non-fatal, analytics-generating events.
 */

#include <libkern/coreanalytics/coreanalytics.h>
#include <kern/percpu.h>
#include <arm64/bti_telemetry.h>
#include <libkern/tree.h>
#include <kern/locks.h>
#include <kern/thread_call.h>
#include <kern/kalloc.h>
#include <kern/cpu_data.h>
#include <machine/machine_routines.h>
#include <vm/pmap.h>
#include <libkern/OSKextLibPrivate.h>
#include <libkern/kernel_mach_header.h>
#include <kern/assert.h>

#ifdef CONFIG_BTI_TELEMETRY
#define TAG "[bti_telemetry] "

/* ~* Module Configuration *~ */
/**
 * Enable reporting via CoreAnalytics in addition to local gathering.
 */
#define BTI_TELEMETRY_USE_CORE_ANALYTICS (1)

typedef struct bti_telemetry_record {
	SPLAY_ENTRY(bti_telemetry_record) link;

	/** Slid address at which the exception was thrown */
	uintptr_t faulting_address;

	/** The raw BTYPE for this exception */
	uint8_t branch_type;
} bti_telemetry_record_s;

/* ~* Core Analytics *~ */
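/*
 * Schema for the analytics event: the raw BTYPE, the faulting PC's offset
 * within its image's __TEXT_EXEC segment, and that image's UUID.
 */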
CA_EVENT(arm_bti_exceptions,
    CA_INT, branch_type,
    CA_INT, faulting_offset,
    CA_STATIC_STRING(CA_UUID_LEN), faulting_uuid);

/* ~* Splay tree *~ */
static int
bti_telemetry_record_compare(bti_telemetry_record_s *a1_r,
    bti_telemetry_record_s *a2_r)
{
	/* Compare on fault address */
	if (a1_r->faulting_address > a2_r->faulting_address) {
		return 1;
	} else if (a1_r->faulting_address < a2_r->faulting_address) {
		return -1;
	}

	/* Same address but different BTI exception type? */
	if (a1_r->branch_type > a2_r->branch_type) {
		return 1;
	} else if (a1_r->branch_type < a2_r->branch_type) {
		return -1;
	} else {
		return 0;
	}
}

SPLAY_HEAD(bti_telemetry_tree, bti_telemetry_record);
// These functions are generated by SPLAY_PROTOTYPE but are currently unused
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_NEXT(
	struct bti_telemetry_tree *head, struct bti_telemetry_record *elm);
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_SEARCH(
	struct bti_telemetry_tree *head, struct bti_telemetry_record *elm);
__unused static struct bti_telemetry_record *bti_telemetry_tree_SPLAY_MIN_MAX(
	struct bti_telemetry_tree *head, int val);
SPLAY_PROTOTYPE(bti_telemetry_tree,
    bti_telemetry_record,
    link,
    bti_telemetry_record_compare);
SPLAY_GENERATE(bti_telemetry_tree,
    bti_telemetry_record,
    link,
    bti_telemetry_record_compare);

/* ~* Globals *~ */
/* Lock which protects the record submission buffer and the telemetry splay tree */
static LCK_GRP_DECLARE(bti_telemetry_lock_grp, "bti_telemetry_lock");
static LCK_SPIN_DECLARE(bti_telemetry_lock, &bti_telemetry_lock_grp);

/*
 * Since BTI exceptions are, naturally, caught in an exception context, it is
 * not safe to allocate or do other complex behaviors like calling into
 * CoreAnalytics. To solve this, we use a short submission ring buffer which
 * collects records for processing on the submission thread.
 *
 * This ring buffer is locked by BTI_TELEMETRY_LOCK.
 */
#define RECORD_SUBMISSION_BUFFER_LENGTH (16)
static bti_telemetry_record_s record_submission_buffer[RECORD_SUBMISSION_BUFFER_LENGTH];
static size_t rsb_rd_idx;    /* Next slot to drain */
static size_t rsb_wr_idx;    /* Next slot to fill */
static size_t rsb_count;     /* Number of occupied slots */
static bool rsb_is_draining; /* Set while a drain pass is in progress */

/**
 * For local telemetry and deduplication, we store hit records in a splay tree.
 * We use a splay here for performance reasons since BTI exceptions exhibit a
 * degree of temporal locality.
 */
static struct bti_telemetry_tree telemetry_splay_tree;

/**
 * Flag indicating whether this CPU is currently trying to acquire the
 * telemetry lock or has already acquired the lock.
 * This is used as a deadlock avoidance mechanism.
 */
static uint8_t PERCPU_DATA(per_cpu_telemetry_lock_blocked);

/**
 * Thread call which is responsible for clearing the submission buffer by
 * submitting to CoreAnalytics and the local tree.
 */
static struct thread_call *drain_record_submission_buffer_callout;

/* ~* Implementation *~ */
/**
 * Enqueue SRC into the record submission buffer. Returns true if successful,
 * false otherwise. BTI_TELEMETRY_LOCK must be held during this operation.
 */
static bool
rsb_enqueue_locked(bti_telemetry_record_s *src)
{
	if (rsb_count == RECORD_SUBMISSION_BUFFER_LENGTH) {
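		/* Buffer is full; the record is dropped and the caller is told so. */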
		return false;
	}

	rsb_count += 1;
	bti_telemetry_record_s *dst = record_submission_buffer + rsb_wr_idx;
	memcpy(dst, src, sizeof(bti_telemetry_record_s));
	rsb_wr_idx = (rsb_wr_idx + 1) % RECORD_SUBMISSION_BUFFER_LENGTH;

	return true;
}

/**
 * Try to acquire a spin lock in an interrupt-deadlock safe way.
 *
 * This function differs from the standard lck_spin_try_lock function in that it
 * will block if the lock is expected to be acquired *eventually* but will not
 * block if it detects that the lock will never be acquired (such as when the
 * current CPU owns the lock, which can happen if a BTI exception is taken while
 * handling a telemetry operation under the lock).
 */
static inline bool OS_WARN_RESULT
safe_telemetry_lock_try_lock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	/*
	 * Disable preemption to ensure that our block signal always corresponds
	 * to the CPU we're actually running on.
	 *
	 * If we did not disable preemption, there is a case where we may mark that
	 * we are trying to acquire the lock on core A, get approved, get preempted,
	 * get rescheduled on core B, and then take the lock there. If we then take
	 * a BTI exception on core B while handling the original exception (e.g. we
	 * take an IRQ and a BTI exception is generated there), we may re-enter on
	 * core B, (incorrectly) see that we are not blocked, try to acquire the
	 * lock, and ultimately deadlock.
	 */
	disable_preemption();

	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	if (!os_atomic_cmpxchg(telemetry_lock_blocked, 0, 1, relaxed)) {
		/*
		 * This CPU has already acquired/is blocked on the telemetry lock.
		 * Attempting to acquire again on this CPU will deadlock. Refuse the
		 * operation.
		 */
		enable_preemption();
		return false;
	}

	/* We've been approved to acquire the lock on this core! */
	lck_spin_lock(&bti_telemetry_lock);
	return true;
}

/**
 * Acquire the telemetry lock, panicking if it cannot be acquired.
 */
static void
safe_telemetry_lock_lock(void)
{
	if (!safe_telemetry_lock_try_lock()) {
		panic("Unexpectedly could not acquire telemetry lock (nested acquire will deadlock)");
	}
}

/**
 * Unlock the telemetry lock after it was acquired with safe_telemetry_lock_try_lock
 */
static inline void
safe_telemetry_lock_unlock(void)
{
	uint8_t *telemetry_lock_blocked = NULL;

	lck_spin_unlock(&bti_telemetry_lock);
	/*
	 * Clear the block only AFTER having dropped the lock so that we can't
	 * hit a really narrow deadlock race where we get interrupted between
	 * clearing the block and dropping the lock.
	 */
	telemetry_lock_blocked = PERCPU_GET(per_cpu_telemetry_lock_blocked);
	os_atomic_store(telemetry_lock_blocked, (uint8_t)0, relaxed);

	/* Finally, reenable preemption as this thread is now safe to move */
	enable_preemption();
}

/**
 * Get the UUID and __TEXT_EXEC-based offset of ADDR into its respective
 * binary image. Copy each into UUID and OFFSET. Returns negative on error.
 *
 * Acquires a sleeping lock; do not call while interrupts are disabled.
 */
static int
get_uuid_and_text_offset_for_addr(uintptr_t addr, uuid_t *uuid, size_t *offset)
{
	kernel_mach_header_t *mh = NULL;
	kernel_segment_command_t *seg_text_exec = NULL;
	void *mh_uuid = NULL;
	unsigned long mh_uuid_len = 0;

	if (!(mh = OSKextKextForAddress((void *)addr))) {
		return -1;
	}

	if (!(seg_text_exec = getsegbynamefromheader(mh, "__TEXT_EXEC"))) {
		return -2;
	}

	if (!(mh_uuid = getuuidfromheader(mh, &mh_uuid_len))) {
		return -3;
	}

	if (mh_uuid_len != sizeof(*uuid)) {
		return -4;
	}

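	/*
	 * The __TEXT_EXEC-relative offset is independent of the KASLR slide, so
	 * together with the image UUID it can be symbolicated offline.
	 */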
	memcpy(uuid, mh_uuid, sizeof(*uuid));
	*offset = addr - seg_text_exec->vmaddr;

	return 0;
}

static void __unused
dump_telemetry_record(bti_telemetry_record_s *record,
    uuid_string_t uuid_str,
    size_t offset)
{
	printf(
		TAG "Unexpected BTI exception (pc=0x%08lx, BTYPE=%d)\n"
		TAG "\t<UUID: %s, offset: 0x%08lx>\n",
		record->faulting_address, record->branch_type,
		uuid_str, offset);
}

/**
 * Thread call which drains the record submission buffer.
 * There must be no more than one instance of this thread running at a time.
 */
static void
drain_record_submission_buffer_thread_call(__unused thread_call_param_t p0,
    __unused thread_call_param_t p1)
{
	size_t drain_count = 0;
	size_t drain_rd_idx = 0;
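	/* Pointers to heap-allocated copies of the records drained this pass;
	 * sized to the RSB so a single pass can never overflow it. */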
	bti_telemetry_record_s *record_allocations[RECORD_SUBMISSION_BUFFER_LENGTH];

	/*
	 * We never expect the submission thread to be scheduled while another
	 * handler is suspended above it (acquiring disables preemption) or while
	 * another submission thread is suspended above it (only one submission
	 * thread should ever be running). Thus, failing to acquire the lock
	 * indicates that something is seriously wrong.
	 */
	safe_telemetry_lock_lock();

	if (rsb_is_draining) {
		panic("Unexpectedly found multiple concurrent drains!");
	}
	rsb_is_draining = true;

	/*
	 * Iteratively drain the submission queue until no entries remain.
	 * Drops and reacquires the telemetry lock.
	 */
	while ((drain_count = rsb_count)) {
		/* LOCKED IN */
		drain_rd_idx = rsb_rd_idx;
		safe_telemetry_lock_unlock();

		/*
		 * It is safe to read these entries based on snapshots of DRAIN_COUNT
		 * and DRAIN_RD_IDX without holding the lock because all of the records'
		 * writes will have already become visible due to the lock's store
		 * release on the enqueue side. We may miss some records in this pass if
		 * they enqueue after the snapshot but we'll just pick them up in the
		 * next loop iteration. Additionally, since only one instance of this
		 * function will be running at a time, we don't need to worry about
		 * duplicate allocations/work.
		 */

		for (size_t i = 0; i < drain_count; i++) {
			/* Create persistent copies of the entries in the RSB */
			size_t rsb_i = (drain_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
			bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i;

			bti_telemetry_record_s *new_record =
			    kalloc_type(bti_telemetry_record_s, Z_WAITOK | Z_NOFAIL);

			memcpy(new_record, record_i, sizeof(bti_telemetry_record_s));
			record_allocations[i] = new_record;
		}

		safe_telemetry_lock_lock();
		/* Insert all draining entries into the splay */
		for (size_t i = 0; i < drain_count; i++) {
			bti_telemetry_record_s *duplicate = SPLAY_INSERT(bti_telemetry_tree,
			    &telemetry_splay_tree,
			    record_allocations[i]);
			if (duplicate) {
				/*
				 * Since we scan both the RSB and the splay tree before
				 * submitting a record, we never expect to have multiple
				 * instances of any record. If this occurs, it's a bug!
				 */
				panic("Unexpected duplicate splay entry!");
			}
		}

		/* Dequeue the entries from the RSB */
		rsb_rd_idx =
		    (rsb_rd_idx + drain_count) % RECORD_SUBMISSION_BUFFER_LENGTH;
		rsb_count -= drain_count;
		safe_telemetry_lock_unlock();

		/* Report entries */
		for (size_t i = 0; i < drain_count; i++) {
			int result = 0;
			uuid_t uuid;
			uuid_string_t uuid_str;
			size_t offset = 0;
			bti_telemetry_record_s *record_i = record_allocations[i];

			if ((result = get_uuid_and_text_offset_for_addr(
				    record_i->faulting_address,
				    &uuid, &offset)) < 0) {
				/*
				 * We couldn't get the required data for symbolication for some
				 * odd reason. Report a NULL UUID and the raw, unslid address so
				 * we can still track these invalid events.
				 */
				memset(&uuid, 0x00, sizeof(uuid));
				offset = VM_KERNEL_UNSLIDE(record_i->faulting_address);
			}
			uuid_unparse(uuid, uuid_str);

			/* Print events to the console for local debug */
			dump_telemetry_record(record_i, uuid_str, offset);

#if BTI_TELEMETRY_USE_CORE_ANALYTICS
			/* Report to CoreAnalytics */
			ca_event_t ca_event = CA_EVENT_ALLOCATE(arm_bti_exceptions);
			CA_EVENT_TYPE(arm_bti_exceptions) * event_data = ca_event->data;

			event_data->branch_type = record_i->branch_type;
			event_data->faulting_offset = offset;
			strlcpy(event_data->faulting_uuid, uuid_str, CA_UUID_LEN);

			CA_EVENT_SEND(ca_event);
#endif /* BTI_TELEMETRY_USE_CORE_ANALYTICS */
		}

		safe_telemetry_lock_lock();
		/* LOCKED OUT */
	}

	/* Done for now; if submitters enqueue more entries, they'll need to schedule another call. */
	rsb_is_draining = false;
	safe_telemetry_lock_unlock();
}

__startup_func
void
bti_telemetry_init(void)
{
	printf(TAG "bti_telemetry_init\n");
	SPLAY_INIT(&telemetry_splay_tree);

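	/*
	 * THREAD_CALL_OPTIONS_ONCE coalesces repeated thread_call_enter()
	 * requests into a single pending drain pass.
	 */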
	drain_record_submission_buffer_callout = thread_call_allocate_with_options(
		drain_record_submission_buffer_thread_call, NULL,
		THREAD_CALL_PRIORITY_KERNEL, THREAD_CALL_OPTIONS_ONCE);

	if (!drain_record_submission_buffer_callout) {
		panic("Failed to allocate drain callout!");
	}
}

/**
 * Submit RECORD to the submission queue. Returns true if the record was
 * ingested (either enqueued or recognized as a duplicate), false otherwise.
 */
static bool
submit_telemetry_record(bti_telemetry_record_s *record)
{
	bool did_ingest = true;
	bool should_flush_submission_buffer = false;
	bti_telemetry_record_s *splay_found_record = NULL;
	if (!safe_telemetry_lock_try_lock()) {
		/*
		 * Failed to acquire the lock!
		 * We're likely in a nested exception. Since we can't safely do anything
		 * else with the record, just drop it.
		 */
		return false;
	}

	/* First, scan the submission queue for matching, queued records */
	for (size_t i = 0; i < rsb_count; i++) {
		size_t rsb_i = (rsb_rd_idx + i) % RECORD_SUBMISSION_BUFFER_LENGTH;
		bti_telemetry_record_s *record_i = record_submission_buffer + rsb_i;
		if (bti_telemetry_record_compare(record, record_i) == 0) {
			/* Match, no need to report again. */
			goto DONE_LOCKED;
		}
	}

	/* Next, try for a record in the splay */
	splay_found_record = SPLAY_FIND(bti_telemetry_tree,
	    &telemetry_splay_tree,
	    record);
	if (splay_found_record) {
		/* Match, no need to report again. */
		goto DONE_LOCKED;
	}

	/*
	 * If we didn't hit in either place, this is a new event that needs to be
	 * enqueued for reporting.
	 */
	did_ingest = rsb_enqueue_locked(record);
	should_flush_submission_buffer = did_ingest && !rsb_is_draining;

DONE_LOCKED:
	safe_telemetry_lock_unlock();

	if (should_flush_submission_buffer) {
		/*
		 * We submitted a new entry while the drain thread was either exiting or
		 * not running. Queue a new flush. Multiple calls here before the drain
		 * starts running will not result in multiple calls being queued due to
		 * THREAD_CALL_OPTIONS_ONCE.
		 */
		thread_call_enter(drain_record_submission_buffer_callout);
	}

	return did_ingest;
}

/** Convert a BTI exception frame into a telemetry record */
static void
generate_telemetry_record(arm_saved_state_t *state,
    bti_telemetry_record_s *record)
{
	uintptr_t pc = 0;
	uint64_t esr = 0;

	pc = get_saved_state_pc(state);
	esr = get_saved_state_esr(state);

	/* Generate the exception record */
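	/* The faulting BTYPE is reported in the low bits of the ESR's ISS field. */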
	record->branch_type = (uint8_t)(esr & ISS_BTI_BTYPE_MASK);
	record->faulting_address = pc;
}

/*
 * Try to recover from a BTI exception. Returns true if we are able to recover,
 * false otherwise.
 */
static bool
recover_from_bti_exception(arm_saved_state_t *state)
{
	/*
	 * Since BTI raises on a mismatched PSTATE.BTYPE, we can simply clear BTYPE
	 * and directly return from the exception to continue executing as if
	 * the exception never happened.
	 */
	uint32_t psr = get_saved_state_cpsr(state);
	psr &= ~PSR_BTYPE_MASK;
	set_saved_state_cpsr(state, psr);

	return true;
}

bool
bti_telemetry_handle_exception(arm_saved_state_t *state)
{
	bti_telemetry_record_s record = { 0 };

	/* Generate the telemetry record and hand it to the submission thread */
	generate_telemetry_record(state, &record);
	(void)submit_telemetry_record(&record);

	/* Recover and prepare to keep executing */
	return recover_from_bti_exception(state);
}

#endif /* CONFIG_BTI_TELEMETRY */