xref: /xnu-8020.121.3/osfmk/kern/telemetry.c (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1 /*
2  * Copyright (c) 2012-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 #include <mach/host_priv.h>
29 #include <mach/host_special_ports.h>
30 #include <mach/mach_types.h>
31 #include <mach/telemetry_notification_server.h>
32 
33 #include <kern/assert.h>
34 #include <kern/clock.h>
35 #include <kern/debug.h>
36 #include <kern/host.h>
37 #include <kern/kalloc.h>
38 #include <kern/kern_types.h>
39 #include <kern/locks.h>
40 #include <kern/misc_protos.h>
41 #include <kern/sched.h>
42 #include <kern/sched_prim.h>
43 #include <kern/telemetry.h>
44 #include <kern/timer_call.h>
45 #include <kern/policy_internal.h>
46 #include <kern/kcdata.h>
47 
48 #include <pexpert/pexpert.h>
49 
50 #include <string.h>
51 #include <vm/vm_kern.h>
52 #include <vm/vm_shared_region.h>
53 
54 #include <kperf/callstack.h>
55 #include <kern/backtrace.h>
56 #include <kern/monotonic.h>
57 
58 #include <security/mac_mach_internal.h>
59 
60 #include <sys/errno.h>
61 #include <sys/kdebug.h>
62 #include <uuid/uuid.h>
63 #include <kdp/kdp_dyld.h>
64 
#define TELEMETRY_DEBUG 0

/* BSD-layer accessors; struct proc is opaque up here in osfmk. */
struct proc;
extern int      proc_pid(struct proc *);
extern char     *proc_name_address(void *p);
extern uint64_t proc_uniqueid(void *p);
extern uint64_t proc_was_throttled(void *p);
extern uint64_t proc_did_throttle(void *p);
extern int      proc_selfpid(void);
extern boolean_t task_did_exec(task_t task);
extern boolean_t task_is_exec_copy(task_t task);

/*
 * A wrapping sample buffer: records are appended at current_position; when
 * a record does not fit, the writer wraps back to offset 0 and end_point
 * remembers where the valid data ended (see telemetry_process_sample).
 */
struct micro_snapshot_buffer {
	vm_offset_t             buffer;           /* kernel VA of backing storage (0 if unallocated) */
	uint32_t                size;             /* backing storage size, in bytes */
	uint32_t                current_position; /* next write offset */
	uint32_t                end_point;        /* end of valid data prior to last wrap */
};

static bool telemetry_task_ready_for_sample(task_t task);

static void telemetry_instrumentation_begin(
	struct micro_snapshot_buffer *buffer, enum micro_snapshot_flags flags);

static void telemetry_instrumentation_end(struct micro_snapshot_buffer *buffer);

static void telemetry_take_sample(thread_t thread, enum micro_snapshot_flags flags);

#if CONFIG_MACF
static void telemetry_macf_take_sample(thread_t thread, enum micro_snapshot_flags flags);
#endif

/* Everything telemetry_process_sample needs to emit one record. */
struct telemetry_target {
	thread_t                         thread;
	uintptr_t                       *frames;             /* user backtrace frames */
	size_t                           frames_count;
	bool                             user64_regs;        /* thread has 64-bit register state */
	enum micro_snapshot_flags        microsnapshot_flags;
	struct micro_snapshot_buffer    *buffer;             /* destination sample buffer */
	lck_mtx_t                       *buffer_mtx;         /* mutex protecting *buffer */
};

static int telemetry_process_sample(
	const struct telemetry_target *target,
	bool release_buffer_lock,
	uint32_t *out_current_record_start);

static int telemetry_buffer_gather(
	user_addr_t buffer,
	uint32_t *length,
	bool mark,
	struct micro_snapshot_buffer *current_buffer);

#define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */
#define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024)
#define TELEMETRY_MAX_BUFFER_SIZE (64*1024)

#define TELEMETRY_DEFAULT_NOTIFY_LEEWAY (4*1024) // Userland gets 4k of leeway to collect data after notification
#define TELEMETRY_MAX_UUID_COUNT (128) // Max of 128 non-shared-cache UUIDs to log for symbolication

uint32_t                telemetry_sample_rate = 0;
volatile boolean_t      telemetry_needs_record = FALSE;
volatile boolean_t      telemetry_needs_timer_arming_record = FALSE;

/*
 * If TRUE, record micro-stackshot samples for all tasks.
 * If FALSE, only sample tasks which are marked for telemetry.
 */
bool     telemetry_sample_all_tasks = false;
bool     telemetry_sample_pmis = false;
uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry

uint32_t telemetry_timestamp = 0;

/*
 * The telemetry_buffer is responsible
 * for timer samples and interrupt samples that are driven by
 * compute_averages().  It will notify its client (if one
 * exists) when it has enough data to be worth flushing.
 */
struct micro_snapshot_buffer telemetry_buffer = {
	.buffer = 0,
	.size = 0,
	.current_position = 0,
	.end_point = 0
};

#if CONFIG_MACF
#define TELEMETRY_MACF_DEFAULT_BUFFER_SIZE (16*1024)
/*
 * The MAC framework uses its own telemetry buffer for the purposes of auditing
 * security-related work being done by userland threads.
 */
struct micro_snapshot_buffer telemetry_macf_buffer = {
	.buffer = 0,
	.size = 0,
	.current_position = 0,
	.end_point = 0
};
#endif

int                                     telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
int                                     telemetry_buffer_notify_at = 0;

/* One lock group; separate mutexes for the main, PMI, and MACF paths. */
LCK_GRP_DECLARE(telemetry_lck_grp, "telemetry group");
LCK_MTX_DECLARE(telemetry_mtx, &telemetry_lck_grp);
LCK_MTX_DECLARE(telemetry_pmi_mtx, &telemetry_lck_grp);
LCK_MTX_DECLARE(telemetry_macf_mtx, &telemetry_lck_grp);

#define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while (0)
#define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
#define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while (0)

#define TELEMETRY_PMI_LOCK() do { lck_mtx_lock(&telemetry_pmi_mtx); } while (0)
#define TELEMETRY_PMI_UNLOCK() do { lck_mtx_unlock(&telemetry_pmi_mtx); } while (0)

#define TELEMETRY_MACF_LOCK() do { lck_mtx_lock(&telemetry_macf_mtx); } while (0)
#define TELEMETRY_MACF_UNLOCK() do { lck_mtx_unlock(&telemetry_macf_mtx); } while (0)
183 
/*
 * One-time telemetry setup: allocate the global sample buffer and apply the
 * boot-arg tunables (buffer size, notification leeway, sample rate, and
 * sample-all-tasks).  If the buffer allocation fails, telemetry is left
 * silently disabled (telemetry_buffer.buffer stays 0).
 */
void
telemetry_init(void)
{
	kern_return_t ret;
	uint32_t          telemetry_notification_leeway;

	/* Buffer size is boot-arg tunable, clamped to the compile-time maximum. */
	if (!PE_parse_boot_argn("telemetry_buffer_size",
	    &telemetry_buffer.size, sizeof(telemetry_buffer.size))) {
		telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE;
	}

	if (telemetry_buffer.size > TELEMETRY_MAX_BUFFER_SIZE) {
		telemetry_buffer.size = TELEMETRY_MAX_BUFFER_SIZE;
	}

	ret = kmem_alloc(kernel_map, &telemetry_buffer.buffer, telemetry_buffer.size,
	    KMA_DATA | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG);
	if (ret != KERN_SUCCESS) {
		kprintf("Telemetry: Allocation failed: %d\n", ret);
		return;
	}

	if (!PE_parse_boot_argn("telemetry_notification_leeway",
	    &telemetry_notification_leeway, sizeof(telemetry_notification_leeway))) {
		/*
		 * By default, notify the user to collect the buffer when there is this much space left in the buffer.
		 */
		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
	}
	/* A leeway at least as large as the buffer would mean "notify immediately"; reject it. */
	if (telemetry_notification_leeway >= telemetry_buffer.size) {
		printf("telemetry: nonsensical telemetry_notification_leeway boot-arg %d changed to %d\n",
		    telemetry_notification_leeway, TELEMETRY_DEFAULT_NOTIFY_LEEWAY);
		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
	}
	telemetry_buffer_notify_at = telemetry_buffer.size - telemetry_notification_leeway;

	if (!PE_parse_boot_argn("telemetry_sample_rate",
	    &telemetry_sample_rate, sizeof(telemetry_sample_rate))) {
		telemetry_sample_rate = TELEMETRY_DEFAULT_SAMPLE_RATE;
	}

	/*
	 * To enable telemetry for all tasks, include "telemetry_sample_all_tasks=1" in boot-args.
	 */
	if (!PE_parse_boot_argn("telemetry_sample_all_tasks",
	    &telemetry_sample_all_tasks, sizeof(telemetry_sample_all_tasks))) {
#if !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG)
		/* Embedded release kernels default to opt-in-only sampling. */
		telemetry_sample_all_tasks = false;
#else
		telemetry_sample_all_tasks = true;
#endif /* !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG) */
	}

	kprintf("Telemetry: Sampling %stasks once per %u second%s\n",
	    (telemetry_sample_all_tasks) ? "all " : "",
	    telemetry_sample_rate, telemetry_sample_rate == 1 ? "" : "s");
}
241 
242 /*
243  * Enable or disable global microstackshots (ie telemetry_sample_all_tasks).
244  *
245  * enable_disable == 1: turn it on
246  * enable_disable == 0: turn it off
247  */
248 void
telemetry_global_ctl(int enable_disable)249 telemetry_global_ctl(int enable_disable)
250 {
251 	if (enable_disable == 1) {
252 		telemetry_sample_all_tasks = true;
253 	} else {
254 		telemetry_sample_all_tasks = false;
255 	}
256 }
257 
/*
 * Opt the given task into or out of the telemetry stream.
 *
 * Supported reasons (callers may use any or all of):
 *     TF_CPUMON_WARNING
 *     TF_WAKEMON_WARNING
 *
 * enable_disable == 1: turn it on
 * enable_disable == 0: turn it off
 */
void
telemetry_task_ctl(task_t task, uint32_t reasons, int enable_disable)
{
	/* Convenience wrapper: take the task lock around the locked variant. */
	task_lock(task);
	telemetry_task_ctl_locked(task, reasons, enable_disable);
	task_unlock(task);
}
275 
/*
 * Core of telemetry_task_ctl: set or clear the given TF_TELEMETRY reason
 * bits on the task and keep the global telemetry_active_tasks count in sync
 * with OFF<->ON transitions.  Caller must hold the task lock.
 */
void
telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
{
	uint32_t origflags;

	/* Only bits within the TF_TELEMETRY mask may be passed as reasons. */
	assert((reasons != 0) && ((reasons | TF_TELEMETRY) == TF_TELEMETRY));

	task_lock_assert_owned(task);

	origflags = task->t_flags;

	if (enable_disable == 1) {
		task->t_flags |= reasons;
		/* Count the task only on the OFF -> ON transition. */
		if ((origflags & TF_TELEMETRY) == 0) {
			OSIncrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
			printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
		}
	} else {
		task->t_flags &= ~reasons;
		if (((origflags & TF_TELEMETRY) != 0) && ((task->t_flags & TF_TELEMETRY) == 0)) {
			/*
			 * If this task went from having at least one telemetry bit to having none,
			 * the net change was to disable telemetry for the task.
			 */
			OSDecrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
			printf("%s: telemetry ON -> OFF (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
		}
	}
}
309 
310 /*
311  * Determine if the current thread is eligible for telemetry:
312  *
313  * telemetry_sample_all_tasks: All threads are eligible. This takes precedence.
314  * telemetry_active_tasks: Count of tasks opted in.
315  * task->t_flags & TF_TELEMETRY: This task is opted in.
316  */
317 static bool
telemetry_is_active(thread_t thread)318 telemetry_is_active(thread_t thread)
319 {
320 	task_t task = get_threadtask(thread);
321 
322 	if (task == kernel_task) {
323 		/* Kernel threads never return to an AST boundary, and are ineligible */
324 		return false;
325 	}
326 
327 	if (telemetry_sample_all_tasks || telemetry_sample_pmis) {
328 		return true;
329 	}
330 
331 	if ((telemetry_active_tasks > 0) && ((task->t_flags & TF_TELEMETRY) != 0)) {
332 		return true;
333 	}
334 
335 	return false;
336 }
337 
338 /*
339  * Userland is arming a timer. If we are eligible for such a record,
340  * sample now. No need to do this one at the AST because we're already at
341  * a safe place in this system call.
342  */
343 int
telemetry_timer_event(__unused uint64_t deadline,__unused uint64_t interval,__unused uint64_t leeway)344 telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval, __unused uint64_t leeway)
345 {
346 	if (telemetry_needs_timer_arming_record == TRUE) {
347 		telemetry_needs_timer_arming_record = FALSE;
348 		telemetry_take_sample(current_thread(), (enum micro_snapshot_flags)(kTimerArmingRecord | kUserMode));
349 	}
350 
351 	return 0;
352 }
353 
#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
/*
 * PMI callback from the monotonic subsystem: flag the interrupted thread for
 * a PMI-type microstackshot at its next AST boundary.
 */
static void
telemetry_pmi_handler(bool user_mode, __unused void *ctx)
{
	telemetry_mark_curthread(user_mode, TRUE);
}
#endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
361 
362 int
telemetry_pmi_setup(enum telemetry_pmi pmi_ctr,uint64_t period)363 telemetry_pmi_setup(enum telemetry_pmi pmi_ctr, uint64_t period)
364 {
365 #if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
366 	static bool sample_all_tasks_aside = false;
367 	static uint32_t active_tasks_aside = false;
368 	int error = 0;
369 	const char *name = "?";
370 
371 	unsigned int ctr = 0;
372 
373 	TELEMETRY_PMI_LOCK();
374 
375 	switch (pmi_ctr) {
376 	case TELEMETRY_PMI_NONE:
377 		if (!telemetry_sample_pmis) {
378 			error = 1;
379 			goto out;
380 		}
381 
382 		telemetry_sample_pmis = false;
383 		telemetry_sample_all_tasks = sample_all_tasks_aside;
384 		telemetry_active_tasks = active_tasks_aside;
385 		error = mt_microstackshot_stop();
386 		if (!error) {
387 			printf("telemetry: disabling ustackshot on PMI\n");
388 		}
389 		goto out;
390 
391 	case TELEMETRY_PMI_INSTRS:
392 		ctr = MT_CORE_INSTRS;
393 		name = "instructions";
394 		break;
395 
396 	case TELEMETRY_PMI_CYCLES:
397 		ctr = MT_CORE_CYCLES;
398 		name = "cycles";
399 		break;
400 
401 	default:
402 		error = 1;
403 		goto out;
404 	}
405 
406 	telemetry_sample_pmis = true;
407 	sample_all_tasks_aside = telemetry_sample_all_tasks;
408 	active_tasks_aside = telemetry_active_tasks;
409 	telemetry_sample_all_tasks = false;
410 	telemetry_active_tasks = 0;
411 
412 	error = mt_microstackshot_start(ctr, period, telemetry_pmi_handler, NULL);
413 	if (!error) {
414 		printf("telemetry: ustackshot every %llu %s\n", period, name);
415 	}
416 
417 out:
418 	TELEMETRY_PMI_UNLOCK();
419 	return error;
420 #else /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
421 #pragma unused(pmi_ctr, period)
422 	return 1;
423 #endif /* !defined(MT_CORE_INSTRS) || !defined(MT_CORE_CYCLES) */
424 }
425 
426 /*
427  * Mark the current thread for an interrupt-based
428  * telemetry record, to be sampled at the next AST boundary.
429  */
430 void
telemetry_mark_curthread(boolean_t interrupted_userspace,boolean_t pmi)431 telemetry_mark_curthread(boolean_t interrupted_userspace, boolean_t pmi)
432 {
433 	uint32_t ast_bits = 0;
434 	thread_t thread = current_thread();
435 
436 	/*
437 	 * If telemetry isn't active for this thread, return and try
438 	 * again next time.
439 	 */
440 	if (telemetry_is_active(thread) == false) {
441 		return;
442 	}
443 
444 	ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
445 	if (pmi) {
446 		ast_bits |= AST_TELEMETRY_PMI;
447 	}
448 
449 	telemetry_needs_record = FALSE;
450 	thread_ast_set(thread, ast_bits);
451 	ast_propagate(thread);
452 }
453 
454 void
compute_telemetry(void * arg __unused)455 compute_telemetry(void *arg __unused)
456 {
457 	if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
458 		if ((++telemetry_timestamp) % telemetry_sample_rate == 0) {
459 			telemetry_needs_record = TRUE;
460 			telemetry_needs_timer_arming_record = TRUE;
461 		}
462 	}
463 }
464 
/*
 * If userland has registered a port for telemetry notifications, send one now.
 */
static void
telemetry_notify_user(void)
{
	mach_port_t user_port = MACH_PORT_NULL;

	kern_return_t kr = host_get_telemetry_port(host_priv_self(), &user_port);
	if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
		/* No valid port registered; nothing to notify. */
		return;
	}

	telemetry_notification(user_port, 0);
	/* Drop the send right host_get_telemetry_port gave us. */
	ipc_port_release_send(user_port);
}
481 
482 void
telemetry_ast(thread_t thread,ast_t reasons)483 telemetry_ast(thread_t thread, ast_t reasons)
484 {
485 	assert((reasons & AST_TELEMETRY_ALL) != 0);
486 
487 	uint8_t record_type = 0;
488 	if (reasons & AST_TELEMETRY_IO) {
489 		record_type |= kIORecord;
490 	}
491 	if (reasons & (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL)) {
492 		record_type |= (reasons & AST_TELEMETRY_PMI) ? kPMIRecord :
493 		    kInterruptRecord;
494 	}
495 
496 	if ((reasons & AST_TELEMETRY_MACF) != 0) {
497 		record_type |= kMACFRecord;
498 	}
499 
500 	enum micro_snapshot_flags user_telemetry = (reasons & AST_TELEMETRY_USER) ? kUserMode : 0;
501 	enum micro_snapshot_flags microsnapshot_flags = record_type | user_telemetry;
502 
503 	if ((reasons & AST_TELEMETRY_MACF) != 0) {
504 		telemetry_macf_take_sample(thread, microsnapshot_flags);
505 	}
506 
507 	if ((reasons & (AST_TELEMETRY_IO | AST_TELEMETRY_KERNEL | AST_TELEMETRY_PMI
508 	    | AST_TELEMETRY_USER)) != 0) {
509 		telemetry_take_sample(thread, microsnapshot_flags);
510 	}
511 }
512 
513 bool
telemetry_task_ready_for_sample(task_t task)514 telemetry_task_ready_for_sample(task_t task)
515 {
516 	return task != TASK_NULL &&
517 	       task != kernel_task &&
518 	       !task_did_exec(task) &&
519 	       !task_is_exec_copy(task);
520 }
521 
/*
 * Emit the MICROSTACKSHOT_RECORD start tracepoint for a sample attempt.
 * The final KDBG argument distinguishes the non-default (MACF) buffer.
 * NOTE(review): the END event uses the opposite polarity
 * (&telemetry_buffer == buffer) -- confirm this asymmetry is intended.
 */
void
telemetry_instrumentation_begin(
	__unused struct micro_snapshot_buffer *buffer,
	__unused enum micro_snapshot_flags flags)
{
	/* telemetry_XXX accessed outside of lock for instrumentation only */
	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START,
	    flags, telemetry_bytes_since_last_mark, 0,
	    (&telemetry_buffer != buffer));
}
532 
/*
 * Emit the matching MICROSTACKSHOT_RECORD end tracepoint, recording the
 * buffer's post-sample write position and end point.
 */
void
telemetry_instrumentation_end(__unused struct micro_snapshot_buffer *buffer)
{
	/* telemetry_XXX accessed outside of lock for instrumentation only */
	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END,
	    (&telemetry_buffer == buffer), telemetry_bytes_since_last_mark,
	    buffer->current_position, buffer->end_point);
}
541 
542 void
telemetry_take_sample(thread_t thread,enum micro_snapshot_flags flags)543 telemetry_take_sample(thread_t thread, enum micro_snapshot_flags flags)
544 {
545 	task_t                      task;
546 	uintptr_t                   frames[128];
547 	uint32_t                    btcount;
548 	struct backtrace_user_info  btinfo = BTUINFO_INIT;
549 
550 	if (thread == THREAD_NULL) {
551 		return;
552 	}
553 
554 	/* Ensure task is ready for taking a sample. */
555 	task = get_threadtask(thread);
556 	if (!telemetry_task_ready_for_sample(task)) {
557 		return;
558 	}
559 
560 	telemetry_instrumentation_begin(&telemetry_buffer, flags);
561 
562 	/* Collect backtrace from user thread. */
563 	btcount = backtrace_user(
564 		frames,                                 /* bt */
565 		sizeof(frames) / sizeof(frames[0]),     /* btlen */
566 		NULL,                                   /* ctl */
567 		&btinfo                                 /* info_out */
568 		);
569 
570 	if (btinfo.btui_error != 0) {
571 		return;
572 	}
573 
574 	/* Process the backtrace. */
575 	struct telemetry_target target = {
576 		.thread = thread,
577 		.frames = frames,
578 		.frames_count = btcount,
579 		.user64_regs = (btinfo.btui_info & BTI_64_BIT) != 0,
580 		.microsnapshot_flags = flags,
581 		.buffer = &telemetry_buffer,
582 		.buffer_mtx = &telemetry_mtx
583 	};
584 	telemetry_process_sample(&target, true, NULL);
585 
586 	telemetry_instrumentation_end(&telemetry_buffer);
587 }
588 
#if CONFIG_MACF
/*
 * Collect a microstackshot of the given user thread into the MACF telemetry
 * buffer and deliver it synchronously to MAC clients via
 * mac_thread_telemetry().  On any failure, the error code is delivered
 * instead (with a NULL/0 payload).
 */
void
telemetry_macf_take_sample(thread_t thread, enum micro_snapshot_flags flags)
{
	task_t                        task;

	vm_size_t                     btcapacity     = 128;
	uintptr_t                     frames_stack[btcapacity]; /* fast path: stack storage */
	uint32_t                      btcount        = 0;
	uintptr_t                    *frames         = frames_stack;
	bool                          alloced_frames = false;   /* true once frames is heap-backed */

	struct backtrace_user_info    btinfo         = BTUINFO_INIT;
	struct backtrace_control      btctl          = BTCTL_INIT;

	uint32_t                      retry_count    = 0;
	const uint32_t                max_retries    = 10;

	bool                          initialized    = false;
	struct micro_snapshot_buffer *telbuf         = &telemetry_macf_buffer;
	uint32_t                      record_start   = 0;
	bool                          did_process    = false;   /* process_sample ran; its lock is still held */
	int                           rv             = 0;

	if (thread == THREAD_NULL) {
		return;
	}

	telemetry_instrumentation_begin(telbuf, flags);

	/* Ensure task is ready for taking a sample. */
	task = get_threadtask(thread);
	if (!telemetry_task_ready_for_sample(task)) {
		rv = EBUSY;
		goto out;
	}

	/* Ensure MACF telemetry buffer was initialized. */
	TELEMETRY_MACF_LOCK();
	initialized = (telbuf->size > 0);
	TELEMETRY_MACF_UNLOCK();

	if (!initialized) {
		rv = ENOMEM;
		goto out;
	}

	/*
	 * Collect backtrace from user thread.  If the walk is truncated for
	 * lack of space, grow the frame buffer by 128 entries and resume from
	 * the frame address the walker reported, up to max_retries times.
	 */
	while (retry_count < max_retries) {
		btcount += backtrace_user(frames + btcount, btcapacity - btcount, &btctl, &btinfo);

		if ((btinfo.btui_info & BTI_TRUNCATED) != 0 && btinfo.btui_next_frame_addr != 0) {
			/*
			 * Fast path uses stack memory to avoid an allocation. We must
			 * pivot to heap memory in the case where we cannot write the
			 * complete backtrace to this buffer.
			 */
			if (frames == frames_stack) {
				btcapacity += 128;
				frames = kalloc_data(btcapacity * sizeof(*frames), Z_WAITOK);

				if (frames == NULL) {
					break;
				}

				alloced_frames = true;

				assert(btcapacity > sizeof(frames_stack) / sizeof(frames_stack[0]));
				memcpy(frames, frames_stack, sizeof(frames_stack));
			} else {
				assert(alloced_frames);
				frames = krealloc_data(frames,
				    btcapacity * sizeof(*frames),
				    (btcapacity + 128) * sizeof(*frames),
				    Z_WAITOK);

				if (frames == NULL) {
					break;
				}

				btcapacity += 128;
			}

			/* Resume the walk where the previous pass stopped. */
			btctl.btc_frame_addr = btinfo.btui_next_frame_addr;
			++retry_count;
		} else {
			break;
		}
	}

	if (frames == NULL) {
		/* An allocation above failed. */
		rv = ENOMEM;
		goto out;
	} else if (btinfo.btui_error != 0) {
		rv = btinfo.btui_error;
		goto out;
	}

	/* Process the backtrace. */
	struct telemetry_target target = {
		.thread = thread,
		.frames = frames,
		.frames_count = btcount,
		.user64_regs = (btinfo.btui_info & BTI_64_BIT) != 0,
		.microsnapshot_flags = flags,
		.buffer = telbuf,
		.buffer_mtx = &telemetry_macf_mtx
	};
	rv = telemetry_process_sample(&target, false, &record_start);
	did_process = true;

out:
	/* Immediately deliver the collected sample to MAC clients. */
	if (rv == 0) {
		assert(telbuf->current_position >= record_start);
		mac_thread_telemetry(thread,
		    0,
		    (void *)(telbuf->buffer + record_start),
		    telbuf->current_position - record_start);
	} else {
		mac_thread_telemetry(thread, rv, NULL, 0);
	}

	/*
	 * The lock was taken by telemetry_process_sample, and we asked it not to
	 * unlock upon completion, so we must release the lock here.
	 */
	if (did_process) {
		TELEMETRY_MACF_UNLOCK();
	}

	if (alloced_frames && frames != NULL) {
		kfree_data(frames, btcapacity * sizeof(*frames));
	}

	telemetry_instrumentation_end(telbuf);
}
#endif /* CONFIG_MACF */
727 
728 int
telemetry_process_sample(const struct telemetry_target * target,bool release_buffer_lock,uint32_t * out_current_record_start)729 telemetry_process_sample(const struct telemetry_target *target,
730     bool release_buffer_lock,
731     uint32_t *out_current_record_start)
732 {
733 	thread_t thread = target->thread;
734 	uintptr_t *frames = target->frames;
735 	size_t btcount = target->frames_count;
736 	bool user64_regs = target->user64_regs;
737 	enum micro_snapshot_flags microsnapshot_flags = target->microsnapshot_flags;
738 	struct micro_snapshot_buffer *current_buffer = target->buffer;
739 	lck_mtx_t *buffer_mtx = target->buffer_mtx;
740 
741 	task_t task;
742 	void *p;
743 	uint32_t bti;
744 	struct micro_snapshot *msnap;
745 	struct task_snapshot *tsnap;
746 	struct thread_snapshot *thsnap;
747 	clock_sec_t secs;
748 	clock_usec_t usecs;
749 	vm_size_t framesize;
750 	uint32_t current_record_start;
751 	uint32_t tmp = 0;
752 	bool notify = false;
753 	int     rv = 0;
754 
755 	if (thread == THREAD_NULL) {
756 		return EINVAL;
757 	}
758 
759 	task = get_threadtask(thread);
760 	p = get_bsdtask_info(task);
761 	bool user64_va = task_has_64Bit_addr(task);
762 
763 	/*
764 	 * Retrieve the array of UUID's for binaries used by this task.
765 	 * We reach down into DYLD's data structures to find the array.
766 	 *
767 	 * XXX - make this common with kdp?
768 	 */
769 	uint32_t uuid_info_count = 0;
770 	mach_vm_address_t uuid_info_addr = 0;
771 	uint32_t uuid_info_size = 0;
772 	if (user64_va) {
773 		uuid_info_size = sizeof(struct user64_dyld_uuid_info);
774 		struct user64_dyld_all_image_infos task_image_infos;
775 		if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
776 			uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
777 			uuid_info_addr = task_image_infos.uuidArray;
778 		}
779 	} else {
780 		uuid_info_size = sizeof(struct user32_dyld_uuid_info);
781 		struct user32_dyld_all_image_infos task_image_infos;
782 		if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
783 			uuid_info_count = task_image_infos.uuidArrayCount;
784 			uuid_info_addr = task_image_infos.uuidArray;
785 		}
786 	}
787 
788 	/*
789 	 * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
790 	 * this data structure), we zero the uuid_info_count so that we won't even try to save load info
791 	 * for this task.
792 	 */
793 	if (!uuid_info_addr) {
794 		uuid_info_count = 0;
795 	}
796 
797 	/*
798 	 * Don't copy in an unbounded amount of memory. The main binary and interesting
799 	 * non-shared-cache libraries should be in the first few images.
800 	 */
801 	if (uuid_info_count > TELEMETRY_MAX_UUID_COUNT) {
802 		uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
803 	}
804 
805 	uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
806 	char     *uuid_info_array = NULL;
807 
808 	if (uuid_info_count > 0) {
809 		uuid_info_array = kalloc_data(uuid_info_array_size, Z_WAITOK);
810 		if (uuid_info_array == NULL) {
811 			return ENOMEM;
812 		}
813 
814 		/*
815 		 * Copy in the UUID info array.
816 		 * It may be nonresident, in which case just fix up nloadinfos to 0 in the task snapshot.
817 		 */
818 		if (copyin(uuid_info_addr, uuid_info_array, uuid_info_array_size) != 0) {
819 			kfree_data(uuid_info_array, uuid_info_array_size);
820 			uuid_info_array = NULL;
821 			uuid_info_array_size = 0;
822 		}
823 	}
824 
825 	/*
826 	 * Look for a dispatch queue serial number, and copy it in from userland if present.
827 	 */
828 	uint64_t dqserialnum = 0;
829 	int              dqserialnum_valid = 0;
830 
831 	uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
832 	if (dqkeyaddr != 0) {
833 		uint64_t dqaddr = 0;
834 		uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task);
835 		if ((copyin(dqkeyaddr, (char *)&dqaddr, (user64_va ? 8 : 4)) == 0) &&
836 		    (dqaddr != 0) && (dq_serialno_offset != 0)) {
837 			uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
838 			if (copyin(dqserialnumaddr, (char *)&dqserialnum, (user64_va ? 8 : 4)) == 0) {
839 				dqserialnum_valid = 1;
840 			}
841 		}
842 	}
843 
844 	clock_get_calendar_microtime(&secs, &usecs);
845 
846 	lck_mtx_lock(buffer_mtx);
847 
848 	/*
849 	 * If our buffer is not backed by anything,
850 	 * then we cannot take the sample.  Meant to allow us to deallocate the window
851 	 * buffer if it is disabled.
852 	 */
853 	if (!current_buffer->buffer) {
854 		rv = EINVAL;
855 		goto cancel_sample;
856 	}
857 
858 	/*
859 	 * We do the bulk of the operation under the telemetry lock, on assumption that
860 	 * any page faults during execution will not cause another AST_TELEMETRY_ALL
861 	 * to deadlock; they will just block until we finish. This makes it easier
862 	 * to copy into the buffer directly. As soon as we unlock, userspace can copy
863 	 * out of our buffer.
864 	 */
865 
866 copytobuffer:
867 
868 	current_record_start = current_buffer->current_position;
869 
870 	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct micro_snapshot)) {
871 		/*
872 		 * We can't fit a record in the space available, so wrap around to the beginning.
873 		 * Save the current position as the known end point of valid data.
874 		 */
875 		current_buffer->end_point = current_record_start;
876 		current_buffer->current_position = 0;
877 		if (current_record_start == 0) {
878 			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
879 			rv = ERANGE;
880 			goto cancel_sample;
881 		}
882 		goto copytobuffer;
883 	}
884 
885 	msnap = (struct micro_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
886 	msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
887 	msnap->ms_flags = (uint8_t)microsnapshot_flags;
888 	msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
889 	msnap->ms_cpu = cpu_number();
890 	msnap->ms_time = secs;
891 	msnap->ms_time_microsecs = usecs;
892 
893 	current_buffer->current_position += sizeof(struct micro_snapshot);
894 
895 	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct task_snapshot)) {
896 		current_buffer->end_point = current_record_start;
897 		current_buffer->current_position = 0;
898 		if (current_record_start == 0) {
899 			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
900 			rv = ERANGE;
901 			goto cancel_sample;
902 		}
903 		goto copytobuffer;
904 	}
905 
906 	tsnap = (struct task_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
907 	bzero(tsnap, sizeof(*tsnap));
908 	tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
909 	tsnap->pid = proc_pid(p);
910 	tsnap->uniqueid = proc_uniqueid(p);
911 	tsnap->user_time_in_terminated_threads = task->total_user_time;
912 	tsnap->system_time_in_terminated_threads = task->total_system_time;
913 	tsnap->suspend_count = task->suspend_count;
914 	tsnap->task_size = (typeof(tsnap->task_size))(get_task_phys_footprint(task) / PAGE_SIZE);
915 	tsnap->faults = counter_load(&task->faults);
916 	tsnap->pageins = counter_load(&task->pageins);
917 	tsnap->cow_faults = counter_load(&task->cow_faults);
918 	/*
919 	 * The throttling counters are maintained as 64-bit counters in the proc
920 	 * structure. However, we reserve 32-bits (each) for them in the task_snapshot
921 	 * struct to save space and since we do not expect them to overflow 32-bits. If we
922 	 * find these values overflowing in the future, the fix would be to simply
923 	 * upgrade these counters to 64-bit in the task_snapshot struct
924 	 */
925 	tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
926 	tsnap->did_throttle = (uint32_t) proc_did_throttle(p);
927 
928 	if (task->t_flags & TF_TELEMETRY) {
929 		tsnap->ss_flags |= kTaskRsrcFlagged;
930 	}
931 
932 	if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) {
933 		tsnap->ss_flags |= kTaskDarwinBG;
934 	}
935 
936 	proc_get_darwinbgstate(task, &tmp);
937 
938 	if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION) {
939 		tsnap->ss_flags |= kTaskIsForeground;
940 	}
941 
942 	if (tmp & PROC_FLAG_ADAPTIVE_IMPORTANT) {
943 		tsnap->ss_flags |= kTaskIsBoosted;
944 	}
945 
946 	if (tmp & PROC_FLAG_SUPPRESSED) {
947 		tsnap->ss_flags |= kTaskIsSuppressed;
948 	}
949 
950 
951 	tsnap->latency_qos = task_grab_latency_qos(task);
952 
953 	strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
954 	if (user64_va) {
955 		tsnap->ss_flags |= kUser64_p;
956 	}
957 
958 
959 	if (task->task_shared_region_slide != -1) {
960 		tsnap->shared_cache_slide = task->task_shared_region_slide;
961 		bcopy(task->task_shared_region_uuid, tsnap->shared_cache_identifier,
962 		    sizeof(task->task_shared_region_uuid));
963 	}
964 
965 	current_buffer->current_position += sizeof(struct task_snapshot);
966 
967 	/*
968 	 * Directly after the task snapshot, place the array of UUID's corresponding to the binaries
969 	 * used by this task.
970 	 */
971 	if ((current_buffer->size - current_buffer->current_position) < uuid_info_array_size) {
972 		current_buffer->end_point = current_record_start;
973 		current_buffer->current_position = 0;
974 		if (current_record_start == 0) {
975 			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
976 			rv = ERANGE;
977 			goto cancel_sample;
978 		}
979 		goto copytobuffer;
980 	}
981 
982 	/*
983 	 * Copy the UUID info array into our sample.
984 	 */
985 	if (uuid_info_array_size > 0) {
986 		bcopy(uuid_info_array, (char *)(current_buffer->buffer + current_buffer->current_position), uuid_info_array_size);
987 		tsnap->nloadinfos = uuid_info_count;
988 	}
989 
990 	current_buffer->current_position += uuid_info_array_size;
991 
992 	/*
993 	 * After the task snapshot & list of binary UUIDs, we place a thread snapshot.
994 	 */
995 
996 	if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) {
997 		/* wrap and overwrite */
998 		current_buffer->end_point = current_record_start;
999 		current_buffer->current_position = 0;
1000 		if (current_record_start == 0) {
1001 			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1002 			rv = ERANGE;
1003 			goto cancel_sample;
1004 		}
1005 		goto copytobuffer;
1006 	}
1007 
1008 	thsnap = (struct thread_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
1009 	bzero(thsnap, sizeof(*thsnap));
1010 
1011 	thsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
1012 	thsnap->thread_id = thread_tid(thread);
1013 	thsnap->state = thread->state;
1014 	thsnap->priority = thread->base_pri;
1015 	thsnap->sched_pri = thread->sched_pri;
1016 	thsnap->sched_flags = thread->sched_flags;
1017 	thsnap->ss_flags |= kStacksPCOnly;
1018 	thsnap->ts_qos = thread->effective_policy.thep_qos;
1019 	thsnap->ts_rqos = thread->requested_policy.thrp_qos;
1020 	thsnap->ts_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
1021 	    thread->requested_policy.thrp_qos_workq_override);
1022 
1023 	if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) {
1024 		thsnap->ss_flags |= kThreadDarwinBG;
1025 	}
1026 
1027 	thsnap->user_time = timer_grab(&thread->user_timer);
1028 
1029 	uint64_t tval = timer_grab(&thread->system_timer);
1030 
1031 	if (thread->precise_user_kernel_time) {
1032 		thsnap->system_time = tval;
1033 	} else {
1034 		thsnap->user_time += tval;
1035 		thsnap->system_time = 0;
1036 	}
1037 
1038 	current_buffer->current_position += sizeof(struct thread_snapshot);
1039 
1040 	/*
1041 	 * If this thread has a dispatch queue serial number, include it here.
1042 	 */
1043 	if (dqserialnum_valid) {
1044 		if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) {
1045 			/* wrap and overwrite */
1046 			current_buffer->end_point = current_record_start;
1047 			current_buffer->current_position = 0;
1048 			if (current_record_start == 0) {
1049 				/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1050 				rv = ERANGE;
1051 				goto cancel_sample;
1052 			}
1053 			goto copytobuffer;
1054 		}
1055 
1056 		thsnap->ss_flags |= kHasDispatchSerial;
1057 		bcopy(&dqserialnum, (char *)current_buffer->buffer + current_buffer->current_position, sizeof(dqserialnum));
1058 		current_buffer->current_position += sizeof(dqserialnum);
1059 	}
1060 
1061 	if (user64_regs) {
1062 		framesize = 8;
1063 		thsnap->ss_flags |= kUser64_p;
1064 	} else {
1065 		framesize = 4;
1066 	}
1067 
1068 	/*
1069 	 * If we can't fit this entire stacktrace then cancel this record, wrap to the beginning,
1070 	 * and start again there so that we always store a full record.
1071 	 */
1072 	if ((current_buffer->size - current_buffer->current_position) / framesize < btcount) {
1073 		current_buffer->end_point = current_record_start;
1074 		current_buffer->current_position = 0;
1075 		if (current_record_start == 0) {
1076 			/* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1077 			rv = ERANGE;
1078 			goto cancel_sample;
1079 		}
1080 		goto copytobuffer;
1081 	}
1082 
1083 	for (bti = 0; bti < btcount; bti++, current_buffer->current_position += framesize) {
1084 		if (framesize == 8) {
1085 			*(uint64_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = frames[bti];
1086 		} else {
1087 			*(uint32_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = (uint32_t)frames[bti];
1088 		}
1089 	}
1090 
1091 	if (current_buffer->end_point < current_buffer->current_position) {
1092 		/*
1093 		 * Each time the cursor wraps around to the beginning, we leave a
1094 		 * differing amount of unused space at the end of the buffer. Make
1095 		 * sure the cursor pushes the end point in case we're making use of
1096 		 * more of the buffer than we did the last time we wrapped.
1097 		 */
1098 		current_buffer->end_point = current_buffer->current_position;
1099 	}
1100 
1101 	thsnap->nuser_frames = btcount;
1102 
1103 	/*
1104 	 * Now THIS is a hack.
1105 	 */
1106 	if (current_buffer == &telemetry_buffer) {
1107 		telemetry_bytes_since_last_mark += (current_buffer->current_position - current_record_start);
1108 		if (telemetry_bytes_since_last_mark > telemetry_buffer_notify_at) {
1109 			notify = true;
1110 		}
1111 	}
1112 
1113 	if (out_current_record_start != NULL) {
1114 		*out_current_record_start = current_record_start;
1115 	}
1116 
1117 cancel_sample:
1118 	if (release_buffer_lock) {
1119 		lck_mtx_unlock(buffer_mtx);
1120 	}
1121 
1122 	if (notify) {
1123 		telemetry_notify_user();
1124 	}
1125 
1126 	if (uuid_info_array != NULL) {
1127 		kfree_data(uuid_info_array, uuid_info_array_size);
1128 	}
1129 
1130 	return rv;
1131 }
1132 
#if TELEMETRY_DEBUG
/*
 * Debug-only: scan a chunk of the telemetry buffer about to be copied out
 * and log the timestamp of every micro-snapshot record found in it.
 *
 * buf - kernel VA of the telemetry buffer
 * pos - byte offset of the chunk within the buffer
 * sz  - length of the chunk in bytes
 */
static void
log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
{
	struct micro_snapshot *p;
	uint32_t offset;

	printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);

	buf += pos;

	/*
	 * Find and log each timestamp in this chunk of buffer.
	 *
	 * Records have no alignment guarantees, so probe at every byte offset.
	 * Stop while a full micro_snapshot still fits within the chunk: the
	 * previous bound (offset < sz) let the 4-byte magic read run up to 3
	 * bytes past the chunk, and a match near the end would read ms_time
	 * beyond it as well.
	 */
	for (offset = 0; offset + sizeof(*p) <= sz; offset++) {
		p = (struct micro_snapshot *)(buf + offset);
		if (p->snapshot_magic == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
			printf("telemetry timestamp: %lld\n", p->ms_time);
		}
	}
}
#endif
1155 
1156 int
telemetry_gather(user_addr_t buffer,uint32_t * length,bool mark)1157 telemetry_gather(user_addr_t buffer, uint32_t *length, bool mark)
1158 {
1159 	return telemetry_buffer_gather(buffer, length, mark, &telemetry_buffer);
1160 }
1161 
/*
 * Copy a microstackshot ring buffer out to userspace, ordered from the
 * oldest record to the most recent.
 *
 * buffer         - user-space destination address.
 * length         - in: capacity of the destination in bytes; out: bytes
 *                  actually copied. Must be at least current_buffer->size,
 *                  else KERN_NO_SPACE is returned.
 * mark           - if true and any bytes were copied, reset
 *                  telemetry_bytes_since_last_mark so the user-notification
 *                  accounting starts over.
 * current_buffer - the ring buffer to drain (the global telemetry buffer or
 *                  another micro_snapshot_buffer such as the MACF one).
 *
 * Returns 0 on success, KERN_NO_SPACE if the destination is too small, or a
 * copyout() error code (in which case *length is set to 0).
 */
int
telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, bool mark, struct micro_snapshot_buffer * current_buffer)
{
	int result = 0;
	uint32_t oldest_record_offset;

	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START,
	    mark, telemetry_bytes_since_last_mark, 0,
	    (&telemetry_buffer != current_buffer));

	TELEMETRY_LOCK();

	/* Buffer was never allocated; nothing to copy. */
	if (current_buffer->buffer == 0) {
		*length = 0;
		goto out;
	}

	/* Caller must provide room for a full buffer's worth of data. */
	if (*length < current_buffer->size) {
		result = KERN_NO_SPACE;
		goto out;
	}

	/*
	 * Copy the ring buffer out to userland in order sorted by time: least recent to most recent.
	 * First, we need to search forward from the cursor to find the oldest record in our buffer.
	 */
	oldest_record_offset = current_buffer->current_position;
	do {
		/*
		 * Probe would run past the buffer or past end_point (the known
		 * end of valid data left behind when the cursor last wrapped).
		 */
		if (((oldest_record_offset + sizeof(uint32_t)) > current_buffer->size) ||
		    ((oldest_record_offset + sizeof(uint32_t)) > current_buffer->end_point)) {
			if (*(uint32_t *)(uintptr_t)(current_buffer->buffer) == 0) {
				/*
				 * There is no magic number at the start of the buffer, which means
				 * it's empty; nothing to see here yet.
				 */
				*length = 0;
				goto out;
			}
			/*
			 * We've looked through the end of the active buffer without finding a valid
			 * record; that means all valid records are in a single chunk, beginning at
			 * the very start of the buffer.
			 */

			oldest_record_offset = 0;
			assert(*(uint32_t *)(uintptr_t)(current_buffer->buffer) == STACKSHOT_MICRO_SNAPSHOT_MAGIC);
			break;
		}

		/* Found a record header: this is the oldest record still in the buffer. */
		if (*(uint32_t *)(uintptr_t)(current_buffer->buffer + oldest_record_offset) == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
			break;
		}

		/*
		 * There are no alignment guarantees for micro-stackshot records, so we must search at each
		 * byte offset.
		 */
		oldest_record_offset++;
	} while (oldest_record_offset != current_buffer->current_position);

	/*
	 * If needed, copyout in two chunks: from the oldest record to the end of the buffer, and then
	 * from the beginning of the buffer up to the current position.
	 */
	if (oldest_record_offset != 0) {
#if TELEMETRY_DEBUG
		log_telemetry_output(current_buffer->buffer, oldest_record_offset,
		    current_buffer->end_point - oldest_record_offset);
#endif
		if ((result = copyout((void *)(current_buffer->buffer + oldest_record_offset), buffer,
		    current_buffer->end_point - oldest_record_offset)) != 0) {
			*length = 0;
			goto out;
		}
		*length = current_buffer->end_point - oldest_record_offset;
	} else {
		*length = 0;
	}

#if TELEMETRY_DEBUG
	log_telemetry_output(current_buffer->buffer, 0, current_buffer->current_position);
#endif
	/* Second (or only) chunk: start of the buffer up to the cursor. */
	if ((result = copyout((void *)current_buffer->buffer, buffer + *length,
	    current_buffer->current_position)) != 0) {
		*length = 0;
		goto out;
	}
	*length += (uint32_t)current_buffer->current_position;

out:

	/* Reset the notify-threshold accounting only if data actually left the kernel. */
	if (mark && (*length > 0)) {
		telemetry_bytes_since_last_mark = 0;
	}

	TELEMETRY_UNLOCK();

	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END,
	    current_buffer->current_position, *length,
	    current_buffer->end_point, (&telemetry_buffer != current_buffer));

	return result;
}
1265 
1266 #if CONFIG_MACF
1267 static int
telemetry_macf_init_locked(size_t buffer_size)1268 telemetry_macf_init_locked(size_t buffer_size)
1269 {
1270 	kern_return_t   kr;
1271 
1272 	if (buffer_size > TELEMETRY_MAX_BUFFER_SIZE) {
1273 		buffer_size = TELEMETRY_MAX_BUFFER_SIZE;
1274 	}
1275 
1276 	telemetry_macf_buffer.size = buffer_size;
1277 
1278 	kr = kmem_alloc(kernel_map, &telemetry_macf_buffer.buffer,
1279 	    telemetry_macf_buffer.size, KMA_DATA | KMA_ZERO | KMA_PERMANENT,
1280 	    VM_KERN_MEMORY_SECURITY);
1281 
1282 	if (kr != KERN_SUCCESS) {
1283 		kprintf("Telemetry (MACF): Allocation failed: %d\n", kr);
1284 		return ENOMEM;
1285 	}
1286 
1287 	return 0;
1288 }
1289 
1290 int
telemetry_macf_mark_curthread(void)1291 telemetry_macf_mark_curthread(void)
1292 {
1293 	thread_t thread = current_thread();
1294 	task_t   task   = get_threadtask(thread);
1295 	int      rv     = 0;
1296 
1297 	if (task == kernel_task) {
1298 		/* Kernel threads never return to an AST boundary, and are ineligible */
1299 		return EINVAL;
1300 	}
1301 
1302 	/* Initialize the MACF telemetry buffer if needed. */
1303 	TELEMETRY_MACF_LOCK();
1304 	if (__improbable(telemetry_macf_buffer.size == 0)) {
1305 		rv = telemetry_macf_init_locked(TELEMETRY_MACF_DEFAULT_BUFFER_SIZE);
1306 
1307 		if (rv != 0) {
1308 			return rv;
1309 		}
1310 	}
1311 	TELEMETRY_MACF_UNLOCK();
1312 
1313 	act_set_macf_telemetry_ast(thread);
1314 	return 0;
1315 }
1316 #endif /* CONFIG_MACF */
1317 
1318 /************************/
1319 /* BOOT PROFILE SUPPORT */
1320 /************************/
1321 /*
1322  * Boot Profiling
1323  *
1324  * The boot-profiling support is a mechanism to sample activity happening on the
1325  * system during boot. This mechanism sets up a periodic timer and on every timer fire,
1326  * captures a full backtrace into the boot profiling buffer. This buffer can be pulled
1327  * out and analyzed from user-space. It is turned on using the following boot-args:
1328  * "bootprofile_buffer_size" specifies the size of the boot profile buffer
1329  * "bootprofile_interval_ms" specifies the interval for the profiling timer
1330  *
1331  * Process Specific Boot Profiling
1332  *
1333  * The boot-arg "bootprofile_proc_name" can be used to specify a certain
 * process that needs to be profiled during boot. Setting this boot-arg changes
1335  * the way stackshots are captured. At every timer fire, the code looks at the
1336  * currently running process and takes a stackshot only if the requested process
1337  * is on-core (which makes it unsuitable for MP systems).
1338  *
1339  * Trigger Events
1340  *
1341  * The boot-arg "bootprofile_type=boot" starts the timer during early boot. Using
1342  * "wake" starts the timer at AP wake from suspend-to-RAM.
1343  */
1344 
#define BOOTPROFILE_MAX_BUFFER_SIZE (64*1024*1024) /* see also COPYSIZELIMIT_PANIC */

vm_offset_t         bootprofile_buffer = 0;                  /* kernel VA of the profile buffer; 0 until allocated */
uint32_t            bootprofile_buffer_size = 0;             /* total buffer size in bytes, clamped to the max above */
uint32_t            bootprofile_buffer_current_position = 0; /* next free offset; bytes below this hold valid samples */
uint32_t            bootprofile_interval_ms = 0;             /* sampling period from the bootprofile_interval_ms boot-arg */
uint64_t            bootprofile_stackshot_flags = 0;         /* extra stackshot flags from boot-args, OR'd into each sample */
uint64_t            bootprofile_interval_abs = 0;            /* sampling period in mach absolute time; 0 stops re-arming */
uint64_t            bootprofile_next_deadline = 0;           /* absolute deadline of the next timer fire */
uint32_t            bootprofile_all_procs = 0;               /* nonzero: sample whatever is on-core, not one target proc */
char                bootprofile_proc_name[17];               /* target process name when profiling a specific process */
uint64_t            bootprofile_delta_since_timestamp = 0;   /* base timestamp for delta stackshots; 0 = none taken yet */
LCK_GRP_DECLARE(bootprofile_lck_grp, "bootprofile_group");
LCK_MTX_DECLARE(bootprofile_mtx, &bootprofile_lck_grp);


/* When the sampling timer is first started, per the bootprofile_type boot-arg. */
enum {
	kBootProfileDisabled = 0,
	kBootProfileStartTimerAtBoot,
	kBootProfileStartTimerAtWake
} bootprofile_type = kBootProfileDisabled;


static timer_call_data_t        bootprofile_timer_call_entry;

#define BOOTPROFILE_LOCK() do { lck_mtx_lock(&bootprofile_mtx); } while(0)
#define BOOTPROFILE_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&bootprofile_mtx)
#define BOOTPROFILE_UNLOCK() do { lck_mtx_unlock(&bootprofile_mtx); } while(0)

/* Periodic timer callback that captures one stackshot into the buffer. */
static void bootprofile_timer_call(
	timer_call_param_t      param0,
	timer_call_param_t      param1);
1377 
1378 void
bootprofile_init(void)1379 bootprofile_init(void)
1380 {
1381 	kern_return_t ret;
1382 	char type[32];
1383 
1384 	if (!PE_parse_boot_argn("bootprofile_buffer_size",
1385 	    &bootprofile_buffer_size, sizeof(bootprofile_buffer_size))) {
1386 		bootprofile_buffer_size = 0;
1387 	}
1388 
1389 	if (bootprofile_buffer_size > BOOTPROFILE_MAX_BUFFER_SIZE) {
1390 		bootprofile_buffer_size = BOOTPROFILE_MAX_BUFFER_SIZE;
1391 	}
1392 
1393 	if (!PE_parse_boot_argn("bootprofile_interval_ms",
1394 	    &bootprofile_interval_ms, sizeof(bootprofile_interval_ms))) {
1395 		bootprofile_interval_ms = 0;
1396 	}
1397 
1398 	if (!PE_parse_boot_argn("bootprofile_stackshot_flags",
1399 	    &bootprofile_stackshot_flags, sizeof(bootprofile_stackshot_flags))) {
1400 		bootprofile_stackshot_flags = 0;
1401 	}
1402 
1403 	if (!PE_parse_boot_argn("bootprofile_proc_name",
1404 	    &bootprofile_proc_name, sizeof(bootprofile_proc_name))) {
1405 		bootprofile_all_procs = 1;
1406 		bootprofile_proc_name[0] = '\0';
1407 	}
1408 
1409 	if (PE_parse_boot_argn("bootprofile_type", type, sizeof(type))) {
1410 		if (0 == strcmp(type, "boot")) {
1411 			bootprofile_type = kBootProfileStartTimerAtBoot;
1412 		} else if (0 == strcmp(type, "wake")) {
1413 			bootprofile_type = kBootProfileStartTimerAtWake;
1414 		} else {
1415 			bootprofile_type = kBootProfileDisabled;
1416 		}
1417 	} else {
1418 		bootprofile_type = kBootProfileDisabled;
1419 	}
1420 
1421 	clock_interval_to_absolutetime_interval(bootprofile_interval_ms, NSEC_PER_MSEC, &bootprofile_interval_abs);
1422 
1423 	/* Both boot args must be set to enable */
1424 	if ((bootprofile_type == kBootProfileDisabled) || (bootprofile_buffer_size == 0) || (bootprofile_interval_abs == 0)) {
1425 		return;
1426 	}
1427 
1428 	ret = kmem_alloc(kernel_map, &bootprofile_buffer, bootprofile_buffer_size,
1429 	    KMA_DATA | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG);
1430 	if (ret != KERN_SUCCESS) {
1431 		kprintf("Boot profile: Allocation failed: %d\n", ret);
1432 		return;
1433 	}
1434 
1435 	kprintf("Boot profile: Sampling %s once per %u ms at %s\n",
1436 	    bootprofile_all_procs ? "all procs" : bootprofile_proc_name, bootprofile_interval_ms,
1437 	    bootprofile_type == kBootProfileStartTimerAtBoot ? "boot" : (bootprofile_type == kBootProfileStartTimerAtWake ? "wake" : "unknown"));
1438 
1439 	timer_call_setup(&bootprofile_timer_call_entry,
1440 	    bootprofile_timer_call,
1441 	    NULL);
1442 
1443 	if (bootprofile_type == kBootProfileStartTimerAtBoot) {
1444 		bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1445 		timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1446 		    NULL,
1447 		    bootprofile_next_deadline,
1448 		    0,
1449 		    TIMER_CALL_SYS_NORMAL,
1450 		    false);
1451 	}
1452 }
1453 
1454 void
bootprofile_wake_from_sleep(void)1455 bootprofile_wake_from_sleep(void)
1456 {
1457 	if (bootprofile_type == kBootProfileStartTimerAtWake) {
1458 		bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1459 		timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1460 		    NULL,
1461 		    bootprofile_next_deadline,
1462 		    0,
1463 		    TIMER_CALL_SYS_NORMAL,
1464 		    false);
1465 	}
1466 }
1467 
1468 
/*
 * Periodic timer callback: capture one stackshot into the remaining space
 * of the boot profile buffer, then re-arm the timer.
 *
 * Uses a try-spin-lock so a tick is simply skipped (and the timer re-armed)
 * if the buffer is being gathered or sampled concurrently. Profiling stops
 * permanently when a sample returns no data, the buffer fills, or userspace
 * gathers the buffer (which zeroes bootprofile_interval_abs).
 */
static void
bootprofile_timer_call(
	timer_call_param_t      param0 __unused,
	timer_call_param_t      param1 __unused)
{
	unsigned retbytes = 0;
	int pid_to_profile = -1;

	/* Lock contended: skip this sample, just reschedule. */
	if (!BOOTPROFILE_TRY_SPIN_LOCK()) {
		goto reprogram;
	}

	/* Check if process-specific boot profiling is turned on */
	if (!bootprofile_all_procs) {
		/*
		 * Since boot profiling initializes really early in boot, it is
		 * possible that at this point, the task/proc is not initialized.
		 * Nothing to do in that case.
		 */

		if ((current_task() != NULL) && (current_task()->bsd_info != NULL) &&
		    (0 == strncmp(bootprofile_proc_name, proc_name_address(current_task()->bsd_info), 17))) {
			pid_to_profile = proc_selfpid();
		} else {
			/*
			 * Process-specific boot profiling requested but the on-core process is
			 * something else. Nothing to do here.
			 */
			BOOTPROFILE_UNLOCK();
			goto reprogram;
		}
	}

	/* initiate a stackshot with whatever portion of the buffer is left */
	if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
		uint64_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO
		    | STACKSHOT_GET_GLOBAL_MEM_STATS;
#if defined(XNU_TARGET_OS_OSX)
		flags |= STACKSHOT_SAVE_KEXT_LOADINFO;
#endif


		/* OR on flags specified in boot-args */
		flags |= bootprofile_stackshot_flags;
		if ((flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) && (bootprofile_delta_since_timestamp == 0)) {
			/* Can't take deltas until the first one */
			flags &= ~STACKSHOT_COLLECT_DELTA_SNAPSHOT;
		}

		/* Record when this sample began, so the next delta is taken relative to it. */
		uint64_t timestamp = 0;
		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) {
			timestamp = mach_absolute_time();
		}

		kern_return_t r = stack_snapshot_from_kernel(
			pid_to_profile, (void *)(bootprofile_buffer + bootprofile_buffer_current_position),
			bootprofile_buffer_size - bootprofile_buffer_current_position,
			flags, bootprofile_delta_since_timestamp, 0, &retbytes);

		/*
		 * We call with STACKSHOT_TRYLOCK because the stackshot lock is coarser
		 * than the bootprofile lock.  If someone else has the lock we'll just
		 * try again later.
		 */

		if (r == KERN_LOCK_OWNED) {
			BOOTPROFILE_UNLOCK();
			goto reprogram;
		}

		/* Only advance the delta base on a successful delta-capable sample. */
		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT &&
		    r == KERN_SUCCESS) {
			bootprofile_delta_since_timestamp = timestamp;
		}

		bootprofile_buffer_current_position += retbytes;
	}

	BOOTPROFILE_UNLOCK();

	/* If we didn't get any data or have run out of buffer space, stop profiling */
	if ((retbytes == 0) || (bootprofile_buffer_current_position == bootprofile_buffer_size)) {
		return;
	}


reprogram:
	/* If the user gathered the buffer, no need to keep profiling */
	if (bootprofile_interval_abs == 0) {
		return;
	}

	clock_deadline_for_periodic_event(bootprofile_interval_abs,
	    mach_absolute_time(),
	    &bootprofile_next_deadline);
	timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
	    NULL,
	    bootprofile_next_deadline,
	    0,
	    TIMER_CALL_SYS_NORMAL,
	    false);
}
1571 
1572 void
bootprofile_get(void ** buffer,uint32_t * length)1573 bootprofile_get(void **buffer, uint32_t *length)
1574 {
1575 	BOOTPROFILE_LOCK();
1576 	*buffer = (void*) bootprofile_buffer;
1577 	*length = bootprofile_buffer_current_position;
1578 	BOOTPROFILE_UNLOCK();
1579 }
1580 
1581 int
bootprofile_gather(user_addr_t buffer,uint32_t * length)1582 bootprofile_gather(user_addr_t buffer, uint32_t *length)
1583 {
1584 	int result = 0;
1585 
1586 	BOOTPROFILE_LOCK();
1587 
1588 	if (bootprofile_buffer == 0) {
1589 		*length = 0;
1590 		goto out;
1591 	}
1592 
1593 	if (*length < bootprofile_buffer_current_position) {
1594 		result = KERN_NO_SPACE;
1595 		goto out;
1596 	}
1597 
1598 	if ((result = copyout((void *)bootprofile_buffer, buffer,
1599 	    bootprofile_buffer_current_position)) != 0) {
1600 		*length = 0;
1601 		goto out;
1602 	}
1603 	*length = bootprofile_buffer_current_position;
1604 
1605 	/* cancel future timers */
1606 	bootprofile_interval_abs = 0;
1607 
1608 out:
1609 
1610 	BOOTPROFILE_UNLOCK();
1611 
1612 	return result;
1613 }
1614