1 /*
2 * Copyright (c) 2012-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 #include <mach/host_priv.h>
29 #include <mach/host_special_ports.h>
30 #include <mach/mach_types.h>
31 #include <mach/telemetry_notification_server.h>
32
33 #include <kern/assert.h>
34 #include <kern/clock.h>
35 #include <kern/debug.h>
36 #include <kern/host.h>
37 #include <kern/kalloc.h>
38 #include <kern/kern_types.h>
39 #include <kern/locks.h>
40 #include <kern/misc_protos.h>
41 #include <kern/sched.h>
42 #include <kern/sched_prim.h>
43 #include <kern/telemetry.h>
44 #include <kern/timer_call.h>
45 #include <kern/policy_internal.h>
46 #include <kern/kcdata.h>
47
48 #include <pexpert/pexpert.h>
49
50 #include <string.h>
51 #include <vm/vm_kern.h>
52 #include <vm/vm_shared_region.h>
53
54 #include <kperf/callstack.h>
55 #include <kern/backtrace.h>
56 #include <kern/monotonic.h>
57
58 #include <security/mac_mach_internal.h>
59
60 #include <sys/errno.h>
61 #include <sys/kdebug.h>
62 #include <uuid/uuid.h>
63 #include <kdp/kdp_dyld.h>
64
65 #define TELEMETRY_DEBUG 0
66
67 struct proc;
68 extern int proc_pid(struct proc *);
69 extern char *proc_name_address(void *p);
70 extern uint64_t proc_uniqueid(void *p);
71 extern uint64_t proc_was_throttled(void *p);
72 extern uint64_t proc_did_throttle(void *p);
73 extern int proc_selfpid(void);
74 extern boolean_t task_did_exec(task_t task);
75 extern boolean_t task_is_exec_copy(task_t task);
76
/*
 * Backing store for microstackshot records. Records are appended at
 * current_position; when a record does not fit, the writer wraps to offset 0
 * and remembers where valid data stopped in end_point.
 */
struct micro_snapshot_buffer {
	vm_offset_t buffer;		/* kernel VA of backing allocation; 0 if not allocated */
	uint32_t size;			/* total capacity, in bytes */
	uint32_t current_position;	/* byte offset where the next record is written */
	uint32_t end_point;		/* end of valid data once the writer has wrapped */
};
83
84 static bool telemetry_task_ready_for_sample(task_t task);
85
86 static void telemetry_instrumentation_begin(
87 struct micro_snapshot_buffer *buffer, enum micro_snapshot_flags flags);
88
89 static void telemetry_instrumentation_end(struct micro_snapshot_buffer *buffer);
90
91 static void telemetry_take_sample(thread_t thread, enum micro_snapshot_flags flags);
92
93 #if CONFIG_MACF
94 static void telemetry_macf_take_sample(thread_t thread, enum micro_snapshot_flags flags);
95 #endif
96
97 struct telemetry_target {
98 thread_t thread;
99 uintptr_t *frames;
100 size_t frames_count;
101 bool user64_regs;
102 enum micro_snapshot_flags microsnapshot_flags;
103 struct micro_snapshot_buffer *buffer;
104 lck_mtx_t *buffer_mtx;
105 };
106
107 static int telemetry_process_sample(
108 const struct telemetry_target *target,
109 bool release_buffer_lock,
110 uint32_t *out_current_record_start);
111
112 static int telemetry_buffer_gather(
113 user_addr_t buffer,
114 uint32_t *length,
115 bool mark,
116 struct micro_snapshot_buffer *current_buffer);
117
118 #define TELEMETRY_DEFAULT_SAMPLE_RATE (1) /* 1 sample every 1 second */
119 #define TELEMETRY_DEFAULT_BUFFER_SIZE (16*1024)
120 #define TELEMETRY_MAX_BUFFER_SIZE (64*1024)
121
122 #define TELEMETRY_DEFAULT_NOTIFY_LEEWAY (4*1024) // Userland gets 4k of leeway to collect data after notification
123 #define TELEMETRY_MAX_UUID_COUNT (128) // Max of 128 non-shared-cache UUIDs to log for symbolication
124
125 uint32_t telemetry_sample_rate = 0;
126 volatile boolean_t telemetry_needs_record = FALSE;
127 volatile boolean_t telemetry_needs_timer_arming_record = FALSE;
128
129 /*
130 * If TRUE, record micro-stackshot samples for all tasks.
131 * If FALSE, only sample tasks which are marked for telemetry.
132 */
133 bool telemetry_sample_all_tasks = false;
134 bool telemetry_sample_pmis = false;
135 uint32_t telemetry_active_tasks = 0; // Number of tasks opted into telemetry
136
137 uint32_t telemetry_timestamp = 0;
138
139 /*
140 * The telemetry_buffer is responsible
141 * for timer samples and interrupt samples that are driven by
142 * compute_averages(). It will notify its client (if one
143 * exists) when it has enough data to be worth flushing.
144 */
145 struct micro_snapshot_buffer telemetry_buffer = {
146 .buffer = 0,
147 .size = 0,
148 .current_position = 0,
149 .end_point = 0
150 };
151
152 #if CONFIG_MACF
153 #define TELEMETRY_MACF_DEFAULT_BUFFER_SIZE (16*1024)
154 /*
155 * The MAC framework uses its own telemetry buffer for the purposes of auditing
156 * security-related work being done by userland threads.
157 */
158 struct micro_snapshot_buffer telemetry_macf_buffer = {
159 .buffer = 0,
160 .size = 0,
161 .current_position = 0,
162 .end_point = 0
163 };
164 #endif
165
166 int telemetry_bytes_since_last_mark = -1; // How much data since buf was last marked?
167 int telemetry_buffer_notify_at = 0;
168
169 LCK_GRP_DECLARE(telemetry_lck_grp, "telemetry group");
170 LCK_MTX_DECLARE(telemetry_mtx, &telemetry_lck_grp);
171 LCK_MTX_DECLARE(telemetry_pmi_mtx, &telemetry_lck_grp);
172 LCK_MTX_DECLARE(telemetry_macf_mtx, &telemetry_lck_grp);
173
174 #define TELEMETRY_LOCK() do { lck_mtx_lock(&telemetry_mtx); } while (0)
175 #define TELEMETRY_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&telemetry_mtx)
176 #define TELEMETRY_UNLOCK() do { lck_mtx_unlock(&telemetry_mtx); } while (0)
177
178 #define TELEMETRY_PMI_LOCK() do { lck_mtx_lock(&telemetry_pmi_mtx); } while (0)
179 #define TELEMETRY_PMI_UNLOCK() do { lck_mtx_unlock(&telemetry_pmi_mtx); } while (0)
180
181 #define TELEMETRY_MACF_LOCK() do { lck_mtx_lock(&telemetry_macf_mtx); } while (0)
182 #define TELEMETRY_MACF_UNLOCK() do { lck_mtx_unlock(&telemetry_macf_mtx); } while (0)
183
/*
 * One-time initialization of the global telemetry buffer and sampling
 * parameters. Tunables are taken from boot-args, with defaults and clamping
 * applied. If the buffer allocation fails, telemetry is left disabled
 * (telemetry_buffer.buffer stays 0 and later samples bail out).
 */
void
telemetry_init(void)
{
	kern_return_t ret;
	uint32_t telemetry_notification_leeway;

	/* Buffer size: boot-arg tunable, clamped to TELEMETRY_MAX_BUFFER_SIZE. */
	if (!PE_parse_boot_argn("telemetry_buffer_size",
	    &telemetry_buffer.size, sizeof(telemetry_buffer.size))) {
		telemetry_buffer.size = TELEMETRY_DEFAULT_BUFFER_SIZE;
	}

	if (telemetry_buffer.size > TELEMETRY_MAX_BUFFER_SIZE) {
		telemetry_buffer.size = TELEMETRY_MAX_BUFFER_SIZE;
	}

	/* Permanent, zeroed allocation; never freed for the lifetime of the system. */
	ret = kmem_alloc(kernel_map, &telemetry_buffer.buffer, telemetry_buffer.size,
	    KMA_DATA | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG);
	if (ret != KERN_SUCCESS) {
		kprintf("Telemetry: Allocation failed: %d\n", ret);
		return;
	}

	if (!PE_parse_boot_argn("telemetry_notification_leeway",
	    &telemetry_notification_leeway, sizeof(telemetry_notification_leeway))) {
		/*
		 * By default, notify the user to collect the buffer when there is this much space left in the buffer.
		 */
		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
	}
	/* A leeway >= buffer size would mean notifying before any data exists. */
	if (telemetry_notification_leeway >= telemetry_buffer.size) {
		printf("telemetry: nonsensical telemetry_notification_leeway boot-arg %d changed to %d\n",
		    telemetry_notification_leeway, TELEMETRY_DEFAULT_NOTIFY_LEEWAY);
		telemetry_notification_leeway = TELEMETRY_DEFAULT_NOTIFY_LEEWAY;
	}
	telemetry_buffer_notify_at = telemetry_buffer.size - telemetry_notification_leeway;

	/* Sample rate: one sample per this many invocations of compute_telemetry(). */
	if (!PE_parse_boot_argn("telemetry_sample_rate",
	    &telemetry_sample_rate, sizeof(telemetry_sample_rate))) {
		telemetry_sample_rate = TELEMETRY_DEFAULT_SAMPLE_RATE;
	}

	/*
	 * To enable telemetry for all tasks, include "telemetry_sample_all_tasks=1" in boot-args.
	 */
	if (!PE_parse_boot_argn("telemetry_sample_all_tasks",
	    &telemetry_sample_all_tasks, sizeof(telemetry_sample_all_tasks))) {
#if !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG)
		telemetry_sample_all_tasks = false;
#else
		telemetry_sample_all_tasks = true;
#endif /* !defined(XNU_TARGET_OS_OSX) && !(DEVELOPMENT || DEBUG) */
	}

	kprintf("Telemetry: Sampling %stasks once per %u second%s\n",
	    (telemetry_sample_all_tasks) ? "all " : "",
	    telemetry_sample_rate, telemetry_sample_rate == 1 ? "" : "s");
}
241
242 /*
243 * Enable or disable global microstackshots (ie telemetry_sample_all_tasks).
244 *
245 * enable_disable == 1: turn it on
246 * enable_disable == 0: turn it off
247 */
248 void
telemetry_global_ctl(int enable_disable)249 telemetry_global_ctl(int enable_disable)
250 {
251 if (enable_disable == 1) {
252 telemetry_sample_all_tasks = true;
253 } else {
254 telemetry_sample_all_tasks = false;
255 }
256 }
257
258 /*
259 * Opt the given task into or out of the telemetry stream.
260 *
261 * Supported reasons (callers may use any or all of):
262 * TF_CPUMON_WARNING
263 * TF_WAKEMON_WARNING
264 *
265 * enable_disable == 1: turn it on
266 * enable_disable == 0: turn it off
267 */
/*
 * Convenience wrapper around telemetry_task_ctl_locked() that takes and
 * releases the task lock around the flag update.
 */
void
telemetry_task_ctl(task_t task, uint32_t reasons, int enable_disable)
{
	task_lock(task);
	telemetry_task_ctl_locked(task, reasons, enable_disable);
	task_unlock(task);
}
275
/*
 * Set or clear the given telemetry reason bits on a task, maintaining the
 * global count of opted-in tasks (telemetry_active_tasks). The count only
 * changes on a net transition: no-telemetry-bits <-> some-telemetry-bits.
 *
 * Caller must hold the task lock; reasons must be a subset of TF_TELEMETRY.
 */
void
telemetry_task_ctl_locked(task_t task, uint32_t reasons, int enable_disable)
{
	uint32_t origflags;

	assert((reasons != 0) && ((reasons | TF_TELEMETRY) == TF_TELEMETRY));

	task_lock_assert_owned(task);

	/* Snapshot flags before mutation so we can detect the net transition. */
	origflags = task->t_flags;

	if (enable_disable == 1) {
		task->t_flags |= reasons;
		if ((origflags & TF_TELEMETRY) == 0) {
			/* First telemetry bit set on this task: it is now opted in. */
			OSIncrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
			printf("%s: telemetry OFF -> ON (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
		}
	} else {
		task->t_flags &= ~reasons;
		if (((origflags & TF_TELEMETRY) != 0) && ((task->t_flags & TF_TELEMETRY) == 0)) {
			/*
			 * If this task went from having at least one telemetry bit to having none,
			 * the net change was to disable telemetry for the task.
			 */
			OSDecrementAtomic(&telemetry_active_tasks);
#if TELEMETRY_DEBUG
			printf("%s: telemetry ON -> OFF (%d active)\n", proc_name_address(task->bsd_info), telemetry_active_tasks);
#endif
		}
	}
}
309
310 /*
311 * Determine if the current thread is eligible for telemetry:
312 *
313 * telemetry_sample_all_tasks: All threads are eligible. This takes precedence.
314 * telemetry_active_tasks: Count of tasks opted in.
315 * task->t_flags & TF_TELEMETRY: This task is opted in.
316 */
317 static bool
telemetry_is_active(thread_t thread)318 telemetry_is_active(thread_t thread)
319 {
320 task_t task = get_threadtask(thread);
321
322 if (task == kernel_task) {
323 /* Kernel threads never return to an AST boundary, and are ineligible */
324 return false;
325 }
326
327 if (telemetry_sample_all_tasks || telemetry_sample_pmis) {
328 return true;
329 }
330
331 if ((telemetry_active_tasks > 0) && ((task->t_flags & TF_TELEMETRY) != 0)) {
332 return true;
333 }
334
335 return false;
336 }
337
338 /*
339 * Userland is arming a timer. If we are eligible for such a record,
340 * sample now. No need to do this one at the AST because we're already at
341 * a safe place in this system call.
342 */
343 int
telemetry_timer_event(__unused uint64_t deadline,__unused uint64_t interval,__unused uint64_t leeway)344 telemetry_timer_event(__unused uint64_t deadline, __unused uint64_t interval, __unused uint64_t leeway)
345 {
346 if (telemetry_needs_timer_arming_record == TRUE) {
347 telemetry_needs_timer_arming_record = FALSE;
348 telemetry_take_sample(current_thread(), (enum micro_snapshot_flags)(kTimerArmingRecord | kUserMode));
349 }
350
351 return 0;
352 }
353
354 #if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
/*
 * PMI callback registered with mt_microstackshot_start(): flag the
 * interrupted thread for a PMI-flavored microstackshot at its next AST.
 */
static void
telemetry_pmi_handler(bool user_mode, __unused void *ctx)
{
	telemetry_mark_curthread(user_mode, TRUE);
}
360 #endif /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
361
/*
 * Configure (or tear down, with TELEMETRY_PMI_NONE) PMI-driven
 * microstackshots on the given counter with the given period.
 *
 * While PMI sampling is active, the normal sampling configuration
 * (telemetry_sample_all_tasks / telemetry_active_tasks) is stashed in
 * function-static "aside" variables and zeroed, then restored on teardown.
 *
 * Returns 0 on success, non-zero on failure or when monotonic counters are
 * not available on this configuration.
 */
int
telemetry_pmi_setup(enum telemetry_pmi pmi_ctr, uint64_t period)
{
#if defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES)
	/* Saved normal-sampling state while PMI sampling is in effect. */
	static bool sample_all_tasks_aside = false;
	static uint32_t active_tasks_aside = false;
	int error = 0;
	const char *name = "?";

	unsigned int ctr = 0;

	TELEMETRY_PMI_LOCK();

	switch (pmi_ctr) {
	case TELEMETRY_PMI_NONE:
		/* Teardown requested, but PMI sampling was never enabled. */
		if (!telemetry_sample_pmis) {
			error = 1;
			goto out;
		}

		/* Restore the pre-PMI sampling configuration. */
		telemetry_sample_pmis = false;
		telemetry_sample_all_tasks = sample_all_tasks_aside;
		telemetry_active_tasks = active_tasks_aside;
		error = mt_microstackshot_stop();
		if (!error) {
			printf("telemetry: disabling ustackshot on PMI\n");
		}
		goto out;

	case TELEMETRY_PMI_INSTRS:
		ctr = MT_CORE_INSTRS;
		name = "instructions";
		break;

	case TELEMETRY_PMI_CYCLES:
		ctr = MT_CORE_CYCLES;
		name = "cycles";
		break;

	default:
		error = 1;
		goto out;
	}

	/* Enable: stash normal sampling state and switch to PMI-only. */
	telemetry_sample_pmis = true;
	sample_all_tasks_aside = telemetry_sample_all_tasks;
	active_tasks_aside = telemetry_active_tasks;
	telemetry_sample_all_tasks = false;
	telemetry_active_tasks = 0;

	error = mt_microstackshot_start(ctr, period, telemetry_pmi_handler, NULL);
	if (!error) {
		printf("telemetry: ustackshot every %llu %s\n", period, name);
	}

out:
	TELEMETRY_PMI_UNLOCK();
	return error;
#else /* defined(MT_CORE_INSTRS) && defined(MT_CORE_CYCLES) */
#pragma unused(pmi_ctr, period)
	return 1;
#endif /* !defined(MT_CORE_INSTRS) || !defined(MT_CORE_CYCLES) */
}
425
426 /*
427 * Mark the current thread for an interrupt-based
428 * telemetry record, to be sampled at the next AST boundary.
429 */
430 void
telemetry_mark_curthread(boolean_t interrupted_userspace,boolean_t pmi)431 telemetry_mark_curthread(boolean_t interrupted_userspace, boolean_t pmi)
432 {
433 uint32_t ast_bits = 0;
434 thread_t thread = current_thread();
435
436 /*
437 * If telemetry isn't active for this thread, return and try
438 * again next time.
439 */
440 if (telemetry_is_active(thread) == false) {
441 return;
442 }
443
444 ast_bits |= (interrupted_userspace ? AST_TELEMETRY_USER : AST_TELEMETRY_KERNEL);
445 if (pmi) {
446 ast_bits |= AST_TELEMETRY_PMI;
447 }
448
449 telemetry_needs_record = FALSE;
450 thread_ast_set(thread, ast_bits);
451 ast_propagate(thread);
452 }
453
454 void
compute_telemetry(void * arg __unused)455 compute_telemetry(void *arg __unused)
456 {
457 if (telemetry_sample_all_tasks || (telemetry_active_tasks > 0)) {
458 if ((++telemetry_timestamp) % telemetry_sample_rate == 0) {
459 telemetry_needs_record = TRUE;
460 telemetry_needs_timer_arming_record = TRUE;
461 }
462 }
463 }
464
465 /*
466 * If userland has registered a port for telemetry notifications, send one now.
467 */
468 static void
telemetry_notify_user(void)469 telemetry_notify_user(void)
470 {
471 mach_port_t user_port = MACH_PORT_NULL;
472
473 kern_return_t kr = host_get_telemetry_port(host_priv_self(), &user_port);
474 if ((kr != KERN_SUCCESS) || !IPC_PORT_VALID(user_port)) {
475 return;
476 }
477
478 telemetry_notification(user_port, 0);
479 ipc_port_release_send(user_port);
480 }
481
482 void
telemetry_ast(thread_t thread,ast_t reasons)483 telemetry_ast(thread_t thread, ast_t reasons)
484 {
485 assert((reasons & AST_TELEMETRY_ALL) != 0);
486
487 uint8_t record_type = 0;
488 if (reasons & AST_TELEMETRY_IO) {
489 record_type |= kIORecord;
490 }
491 if (reasons & (AST_TELEMETRY_USER | AST_TELEMETRY_KERNEL)) {
492 record_type |= (reasons & AST_TELEMETRY_PMI) ? kPMIRecord :
493 kInterruptRecord;
494 }
495
496 if ((reasons & AST_TELEMETRY_MACF) != 0) {
497 record_type |= kMACFRecord;
498 }
499
500 enum micro_snapshot_flags user_telemetry = (reasons & AST_TELEMETRY_USER) ? kUserMode : 0;
501 enum micro_snapshot_flags microsnapshot_flags = record_type | user_telemetry;
502
503 if ((reasons & AST_TELEMETRY_MACF) != 0) {
504 telemetry_macf_take_sample(thread, microsnapshot_flags);
505 }
506
507 if ((reasons & (AST_TELEMETRY_IO | AST_TELEMETRY_KERNEL | AST_TELEMETRY_PMI
508 | AST_TELEMETRY_USER)) != 0) {
509 telemetry_take_sample(thread, microsnapshot_flags);
510 }
511 }
512
513 bool
telemetry_task_ready_for_sample(task_t task)514 telemetry_task_ready_for_sample(task_t task)
515 {
516 return task != TASK_NULL &&
517 task != kernel_task &&
518 !task_did_exec(task) &&
519 !task_is_exec_copy(task);
520 }
521
/*
 * Emit the DBG_FUNC_START kdebug event marking the beginning of a
 * microstackshot record; must be paired with telemetry_instrumentation_end()
 * on every exit path. The last argument distinguishes the MACF buffer from
 * the global one.
 */
void
telemetry_instrumentation_begin(
	__unused struct micro_snapshot_buffer *buffer,
	__unused enum micro_snapshot_flags flags)
{
	/* telemetry_XXX accessed outside of lock for instrumentation only */
	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_START,
	    flags, telemetry_bytes_since_last_mark, 0,
	    (&telemetry_buffer != buffer));
}
532
/*
 * Emit the DBG_FUNC_END kdebug event closing the interval opened by
 * telemetry_instrumentation_begin().
 */
void
telemetry_instrumentation_end(__unused struct micro_snapshot_buffer *buffer)
{
	/* telemetry_XXX accessed outside of lock for instrumentation only */
	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_RECORD) | DBG_FUNC_END,
	    (&telemetry_buffer == buffer), telemetry_bytes_since_last_mark,
	    buffer->current_position, buffer->end_point);
}
541
542 void
telemetry_take_sample(thread_t thread,enum micro_snapshot_flags flags)543 telemetry_take_sample(thread_t thread, enum micro_snapshot_flags flags)
544 {
545 task_t task;
546 uintptr_t frames[128];
547 uint32_t btcount;
548 struct backtrace_user_info btinfo = BTUINFO_INIT;
549
550 if (thread == THREAD_NULL) {
551 return;
552 }
553
554 /* Ensure task is ready for taking a sample. */
555 task = get_threadtask(thread);
556 if (!telemetry_task_ready_for_sample(task)) {
557 return;
558 }
559
560 telemetry_instrumentation_begin(&telemetry_buffer, flags);
561
562 /* Collect backtrace from user thread. */
563 btcount = backtrace_user(
564 frames, /* bt */
565 sizeof(frames) / sizeof(frames[0]), /* btlen */
566 NULL, /* ctl */
567 &btinfo /* info_out */
568 );
569
570 if (btinfo.btui_error != 0) {
571 return;
572 }
573
574 /* Process the backtrace. */
575 struct telemetry_target target = {
576 .thread = thread,
577 .frames = frames,
578 .frames_count = btcount,
579 .user64_regs = (btinfo.btui_info & BTI_64_BIT) != 0,
580 .microsnapshot_flags = flags,
581 .buffer = &telemetry_buffer,
582 .buffer_mtx = &telemetry_mtx
583 };
584 telemetry_process_sample(&target, true, NULL);
585
586 telemetry_instrumentation_end(&telemetry_buffer);
587 }
588
589 #if CONFIG_MACF
/*
 * Take a microstackshot for the MAC framework into telemetry_macf_buffer and
 * deliver it synchronously to MAC clients via mac_thread_telemetry().
 *
 * Unlike telemetry_take_sample(), the backtrace may be retried (up to
 * max_retries) with a growing heap buffer when the stack-local frame array
 * is too small for a truncated-but-resumable backtrace.
 *
 * Locking: telemetry_process_sample() is asked NOT to drop the buffer lock
 * (release_buffer_lock == false) so the record bytes can be handed to
 * mac_thread_telemetry() while still stable; the lock is released here once
 * did_process is set.
 */
void
telemetry_macf_take_sample(thread_t thread, enum micro_snapshot_flags flags)
{
	task_t task;

	vm_size_t btcapacity = 128;
	uintptr_t frames_stack[btcapacity];
	uint32_t btcount = 0;
	uintptr_t *frames = frames_stack;       /* may pivot to a heap allocation below */
	bool alloced_frames = false;            /* true once frames points at heap memory */

	struct backtrace_user_info btinfo = BTUINFO_INIT;
	struct backtrace_control btctl = BTCTL_INIT;

	uint32_t retry_count = 0;
	const uint32_t max_retries = 10;

	bool initialized = false;
	struct micro_snapshot_buffer *telbuf = &telemetry_macf_buffer;
	uint32_t record_start = 0;              /* offset of this record within telbuf */
	bool did_process = false;               /* true => we still hold the MACF lock */
	int rv = 0;

	if (thread == THREAD_NULL) {
		return;
	}

	telemetry_instrumentation_begin(telbuf, flags);

	/* Ensure task is ready for taking a sample. */
	task = get_threadtask(thread);
	if (!telemetry_task_ready_for_sample(task)) {
		rv = EBUSY;
		goto out;
	}

	/* Ensure MACF telemetry buffer was initialized. */
	TELEMETRY_MACF_LOCK();
	initialized = (telbuf->size > 0);
	TELEMETRY_MACF_UNLOCK();

	if (!initialized) {
		rv = ENOMEM;
		goto out;
	}

	/* Collect backtrace from user thread. */
	while (retry_count < max_retries) {
		btcount += backtrace_user(frames + btcount, btcapacity - btcount, &btctl, &btinfo);

		if ((btinfo.btui_info & BTI_TRUNCATED) != 0 && btinfo.btui_next_frame_addr != 0) {
			/*
			 * Fast path uses stack memory to avoid an allocation. We must
			 * pivot to heap memory in the case where we cannot write the
			 * complete backtrace to this buffer.
			 */
			if (frames == frames_stack) {
				btcapacity += 128;
				frames = kalloc_data(btcapacity * sizeof(*frames), Z_WAITOK);

				if (frames == NULL) {
					break;
				}

				alloced_frames = true;

				assert(btcapacity > sizeof(frames_stack) / sizeof(frames_stack[0]));
				memcpy(frames, frames_stack, sizeof(frames_stack));
			} else {
				assert(alloced_frames);
				frames = krealloc_data(frames,
				    btcapacity * sizeof(*frames),
				    (btcapacity + 128) * sizeof(*frames),
				    Z_WAITOK);

				if (frames == NULL) {
					break;
				}

				btcapacity += 128;
			}

			/* Resume the walk where the previous pass was cut off. */
			btctl.btc_frame_addr = btinfo.btui_next_frame_addr;
			++retry_count;
		} else {
			break;
		}
	}

	if (frames == NULL) {
		rv = ENOMEM;
		goto out;
	} else if (btinfo.btui_error != 0) {
		rv = btinfo.btui_error;
		goto out;
	}

	/* Process the backtrace; keep the MACF buffer lock held (see header). */
	struct telemetry_target target = {
		.thread = thread,
		.frames = frames,
		.frames_count = btcount,
		.user64_regs = (btinfo.btui_info & BTI_64_BIT) != 0,
		.microsnapshot_flags = flags,
		.buffer = telbuf,
		.buffer_mtx = &telemetry_macf_mtx
	};
	rv = telemetry_process_sample(&target, false, &record_start);
	did_process = true;

out:
	/* Immediately deliver the collected sample to MAC clients. */
	if (rv == 0) {
		assert(telbuf->current_position >= record_start);
		mac_thread_telemetry(thread,
		    0,
		    (void *)(telbuf->buffer + record_start),
		    telbuf->current_position - record_start);
	} else {
		mac_thread_telemetry(thread, rv, NULL, 0);
	}

	/*
	 * The lock was taken by telemetry_process_sample, and we asked it not to
	 * unlock upon completion, so we must release the lock here.
	 */
	if (did_process) {
		TELEMETRY_MACF_UNLOCK();
	}

	if (alloced_frames && frames != NULL) {
		kfree_data(frames, btcapacity * sizeof(*frames));
	}

	telemetry_instrumentation_end(telbuf);
}
726 #endif /* CONFIG_MACF */
727
728 int
telemetry_process_sample(const struct telemetry_target * target,bool release_buffer_lock,uint32_t * out_current_record_start)729 telemetry_process_sample(const struct telemetry_target *target,
730 bool release_buffer_lock,
731 uint32_t *out_current_record_start)
732 {
733 thread_t thread = target->thread;
734 uintptr_t *frames = target->frames;
735 size_t btcount = target->frames_count;
736 bool user64_regs = target->user64_regs;
737 enum micro_snapshot_flags microsnapshot_flags = target->microsnapshot_flags;
738 struct micro_snapshot_buffer *current_buffer = target->buffer;
739 lck_mtx_t *buffer_mtx = target->buffer_mtx;
740
741 task_t task;
742 void *p;
743 uint32_t bti;
744 struct micro_snapshot *msnap;
745 struct task_snapshot *tsnap;
746 struct thread_snapshot *thsnap;
747 clock_sec_t secs;
748 clock_usec_t usecs;
749 vm_size_t framesize;
750 uint32_t current_record_start;
751 uint32_t tmp = 0;
752 bool notify = false;
753 int rv = 0;
754
755 if (thread == THREAD_NULL) {
756 return EINVAL;
757 }
758
759 task = get_threadtask(thread);
760 p = get_bsdtask_info(task);
761 bool user64_va = task_has_64Bit_addr(task);
762
763 /*
764 * Retrieve the array of UUID's for binaries used by this task.
765 * We reach down into DYLD's data structures to find the array.
766 *
767 * XXX - make this common with kdp?
768 */
769 uint32_t uuid_info_count = 0;
770 mach_vm_address_t uuid_info_addr = 0;
771 uint32_t uuid_info_size = 0;
772 if (user64_va) {
773 uuid_info_size = sizeof(struct user64_dyld_uuid_info);
774 struct user64_dyld_all_image_infos task_image_infos;
775 if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
776 uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
777 uuid_info_addr = task_image_infos.uuidArray;
778 }
779 } else {
780 uuid_info_size = sizeof(struct user32_dyld_uuid_info);
781 struct user32_dyld_all_image_infos task_image_infos;
782 if (copyin(task->all_image_info_addr, (char *)&task_image_infos, sizeof(task_image_infos)) == 0) {
783 uuid_info_count = task_image_infos.uuidArrayCount;
784 uuid_info_addr = task_image_infos.uuidArray;
785 }
786 }
787
788 /*
789 * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
790 * this data structure), we zero the uuid_info_count so that we won't even try to save load info
791 * for this task.
792 */
793 if (!uuid_info_addr) {
794 uuid_info_count = 0;
795 }
796
797 /*
798 * Don't copy in an unbounded amount of memory. The main binary and interesting
799 * non-shared-cache libraries should be in the first few images.
800 */
801 if (uuid_info_count > TELEMETRY_MAX_UUID_COUNT) {
802 uuid_info_count = TELEMETRY_MAX_UUID_COUNT;
803 }
804
805 uint32_t uuid_info_array_size = uuid_info_count * uuid_info_size;
806 char *uuid_info_array = NULL;
807
808 if (uuid_info_count > 0) {
809 uuid_info_array = kalloc_data(uuid_info_array_size, Z_WAITOK);
810 if (uuid_info_array == NULL) {
811 return ENOMEM;
812 }
813
814 /*
815 * Copy in the UUID info array.
816 * It may be nonresident, in which case just fix up nloadinfos to 0 in the task snapshot.
817 */
818 if (copyin(uuid_info_addr, uuid_info_array, uuid_info_array_size) != 0) {
819 kfree_data(uuid_info_array, uuid_info_array_size);
820 uuid_info_array = NULL;
821 uuid_info_array_size = 0;
822 }
823 }
824
825 /*
826 * Look for a dispatch queue serial number, and copy it in from userland if present.
827 */
828 uint64_t dqserialnum = 0;
829 int dqserialnum_valid = 0;
830
831 uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
832 if (dqkeyaddr != 0) {
833 uint64_t dqaddr = 0;
834 uint64_t dq_serialno_offset = get_task_dispatchqueue_serialno_offset(task);
835 if ((copyin(dqkeyaddr, (char *)&dqaddr, (user64_va ? 8 : 4)) == 0) &&
836 (dqaddr != 0) && (dq_serialno_offset != 0)) {
837 uint64_t dqserialnumaddr = dqaddr + dq_serialno_offset;
838 if (copyin(dqserialnumaddr, (char *)&dqserialnum, (user64_va ? 8 : 4)) == 0) {
839 dqserialnum_valid = 1;
840 }
841 }
842 }
843
844 clock_get_calendar_microtime(&secs, &usecs);
845
846 lck_mtx_lock(buffer_mtx);
847
848 /*
849 * If our buffer is not backed by anything,
850 * then we cannot take the sample. Meant to allow us to deallocate the window
851 * buffer if it is disabled.
852 */
853 if (!current_buffer->buffer) {
854 rv = EINVAL;
855 goto cancel_sample;
856 }
857
858 /*
859 * We do the bulk of the operation under the telemetry lock, on assumption that
860 * any page faults during execution will not cause another AST_TELEMETRY_ALL
861 * to deadlock; they will just block until we finish. This makes it easier
862 * to copy into the buffer directly. As soon as we unlock, userspace can copy
863 * out of our buffer.
864 */
865
866 copytobuffer:
867
868 current_record_start = current_buffer->current_position;
869
870 if ((current_buffer->size - current_buffer->current_position) < sizeof(struct micro_snapshot)) {
871 /*
872 * We can't fit a record in the space available, so wrap around to the beginning.
873 * Save the current position as the known end point of valid data.
874 */
875 current_buffer->end_point = current_record_start;
876 current_buffer->current_position = 0;
877 if (current_record_start == 0) {
878 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
879 rv = ERANGE;
880 goto cancel_sample;
881 }
882 goto copytobuffer;
883 }
884
885 msnap = (struct micro_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
886 msnap->snapshot_magic = STACKSHOT_MICRO_SNAPSHOT_MAGIC;
887 msnap->ms_flags = (uint8_t)microsnapshot_flags;
888 msnap->ms_opaque_flags = 0; /* namespace managed by userspace */
889 msnap->ms_cpu = cpu_number();
890 msnap->ms_time = secs;
891 msnap->ms_time_microsecs = usecs;
892
893 current_buffer->current_position += sizeof(struct micro_snapshot);
894
895 if ((current_buffer->size - current_buffer->current_position) < sizeof(struct task_snapshot)) {
896 current_buffer->end_point = current_record_start;
897 current_buffer->current_position = 0;
898 if (current_record_start == 0) {
899 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
900 rv = ERANGE;
901 goto cancel_sample;
902 }
903 goto copytobuffer;
904 }
905
906 tsnap = (struct task_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
907 bzero(tsnap, sizeof(*tsnap));
908 tsnap->snapshot_magic = STACKSHOT_TASK_SNAPSHOT_MAGIC;
909 tsnap->pid = proc_pid(p);
910 tsnap->uniqueid = proc_uniqueid(p);
911 tsnap->user_time_in_terminated_threads = task->total_user_time;
912 tsnap->system_time_in_terminated_threads = task->total_system_time;
913 tsnap->suspend_count = task->suspend_count;
914 tsnap->task_size = (typeof(tsnap->task_size))(get_task_phys_footprint(task) / PAGE_SIZE);
915 tsnap->faults = counter_load(&task->faults);
916 tsnap->pageins = counter_load(&task->pageins);
917 tsnap->cow_faults = counter_load(&task->cow_faults);
918 /*
919 * The throttling counters are maintained as 64-bit counters in the proc
920 * structure. However, we reserve 32-bits (each) for them in the task_snapshot
921 * struct to save space and since we do not expect them to overflow 32-bits. If we
922 * find these values overflowing in the future, the fix would be to simply
923 * upgrade these counters to 64-bit in the task_snapshot struct
924 */
925 tsnap->was_throttled = (uint32_t) proc_was_throttled(p);
926 tsnap->did_throttle = (uint32_t) proc_did_throttle(p);
927
928 if (task->t_flags & TF_TELEMETRY) {
929 tsnap->ss_flags |= kTaskRsrcFlagged;
930 }
931
932 if (proc_get_effective_task_policy(task, TASK_POLICY_DARWIN_BG)) {
933 tsnap->ss_flags |= kTaskDarwinBG;
934 }
935
936 proc_get_darwinbgstate(task, &tmp);
937
938 if (proc_get_effective_task_policy(task, TASK_POLICY_ROLE) == TASK_FOREGROUND_APPLICATION) {
939 tsnap->ss_flags |= kTaskIsForeground;
940 }
941
942 if (tmp & PROC_FLAG_ADAPTIVE_IMPORTANT) {
943 tsnap->ss_flags |= kTaskIsBoosted;
944 }
945
946 if (tmp & PROC_FLAG_SUPPRESSED) {
947 tsnap->ss_flags |= kTaskIsSuppressed;
948 }
949
950
951 tsnap->latency_qos = task_grab_latency_qos(task);
952
953 strlcpy(tsnap->p_comm, proc_name_address(p), sizeof(tsnap->p_comm));
954 if (user64_va) {
955 tsnap->ss_flags |= kUser64_p;
956 }
957
958
959 if (task->task_shared_region_slide != -1) {
960 tsnap->shared_cache_slide = task->task_shared_region_slide;
961 bcopy(task->task_shared_region_uuid, tsnap->shared_cache_identifier,
962 sizeof(task->task_shared_region_uuid));
963 }
964
965 current_buffer->current_position += sizeof(struct task_snapshot);
966
967 /*
968 * Directly after the task snapshot, place the array of UUID's corresponding to the binaries
969 * used by this task.
970 */
971 if ((current_buffer->size - current_buffer->current_position) < uuid_info_array_size) {
972 current_buffer->end_point = current_record_start;
973 current_buffer->current_position = 0;
974 if (current_record_start == 0) {
975 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
976 rv = ERANGE;
977 goto cancel_sample;
978 }
979 goto copytobuffer;
980 }
981
982 /*
983 * Copy the UUID info array into our sample.
984 */
985 if (uuid_info_array_size > 0) {
986 bcopy(uuid_info_array, (char *)(current_buffer->buffer + current_buffer->current_position), uuid_info_array_size);
987 tsnap->nloadinfos = uuid_info_count;
988 }
989
990 current_buffer->current_position += uuid_info_array_size;
991
992 /*
993 * After the task snapshot & list of binary UUIDs, we place a thread snapshot.
994 */
995
996 if ((current_buffer->size - current_buffer->current_position) < sizeof(struct thread_snapshot)) {
997 /* wrap and overwrite */
998 current_buffer->end_point = current_record_start;
999 current_buffer->current_position = 0;
1000 if (current_record_start == 0) {
1001 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1002 rv = ERANGE;
1003 goto cancel_sample;
1004 }
1005 goto copytobuffer;
1006 }
1007
1008 thsnap = (struct thread_snapshot *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position);
1009 bzero(thsnap, sizeof(*thsnap));
1010
1011 thsnap->snapshot_magic = STACKSHOT_THREAD_SNAPSHOT_MAGIC;
1012 thsnap->thread_id = thread_tid(thread);
1013 thsnap->state = thread->state;
1014 thsnap->priority = thread->base_pri;
1015 thsnap->sched_pri = thread->sched_pri;
1016 thsnap->sched_flags = thread->sched_flags;
1017 thsnap->ss_flags |= kStacksPCOnly;
1018 thsnap->ts_qos = thread->effective_policy.thep_qos;
1019 thsnap->ts_rqos = thread->requested_policy.thrp_qos;
1020 thsnap->ts_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
1021 thread->requested_policy.thrp_qos_workq_override);
1022
1023 if (proc_get_effective_thread_policy(thread, TASK_POLICY_DARWIN_BG)) {
1024 thsnap->ss_flags |= kThreadDarwinBG;
1025 }
1026
1027 thsnap->user_time = timer_grab(&thread->user_timer);
1028
1029 uint64_t tval = timer_grab(&thread->system_timer);
1030
1031 if (thread->precise_user_kernel_time) {
1032 thsnap->system_time = tval;
1033 } else {
1034 thsnap->user_time += tval;
1035 thsnap->system_time = 0;
1036 }
1037
1038 current_buffer->current_position += sizeof(struct thread_snapshot);
1039
1040 /*
1041 * If this thread has a dispatch queue serial number, include it here.
1042 */
1043 if (dqserialnum_valid) {
1044 if ((current_buffer->size - current_buffer->current_position) < sizeof(dqserialnum)) {
1045 /* wrap and overwrite */
1046 current_buffer->end_point = current_record_start;
1047 current_buffer->current_position = 0;
1048 if (current_record_start == 0) {
1049 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1050 rv = ERANGE;
1051 goto cancel_sample;
1052 }
1053 goto copytobuffer;
1054 }
1055
1056 thsnap->ss_flags |= kHasDispatchSerial;
1057 bcopy(&dqserialnum, (char *)current_buffer->buffer + current_buffer->current_position, sizeof(dqserialnum));
1058 current_buffer->current_position += sizeof(dqserialnum);
1059 }
1060
1061 if (user64_regs) {
1062 framesize = 8;
1063 thsnap->ss_flags |= kUser64_p;
1064 } else {
1065 framesize = 4;
1066 }
1067
1068 /*
1069 * If we can't fit this entire stacktrace then cancel this record, wrap to the beginning,
1070 * and start again there so that we always store a full record.
1071 */
1072 if ((current_buffer->size - current_buffer->current_position) / framesize < btcount) {
1073 current_buffer->end_point = current_record_start;
1074 current_buffer->current_position = 0;
1075 if (current_record_start == 0) {
1076 /* This sample is too large to fit in the buffer even when we started at 0, so skip it */
1077 rv = ERANGE;
1078 goto cancel_sample;
1079 }
1080 goto copytobuffer;
1081 }
1082
1083 for (bti = 0; bti < btcount; bti++, current_buffer->current_position += framesize) {
1084 if (framesize == 8) {
1085 *(uint64_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = frames[bti];
1086 } else {
1087 *(uint32_t *)(uintptr_t)(current_buffer->buffer + current_buffer->current_position) = (uint32_t)frames[bti];
1088 }
1089 }
1090
1091 if (current_buffer->end_point < current_buffer->current_position) {
1092 /*
1093 * Each time the cursor wraps around to the beginning, we leave a
1094 * differing amount of unused space at the end of the buffer. Make
1095 * sure the cursor pushes the end point in case we're making use of
1096 * more of the buffer than we did the last time we wrapped.
1097 */
1098 current_buffer->end_point = current_buffer->current_position;
1099 }
1100
1101 thsnap->nuser_frames = btcount;
1102
1103 /*
1104 * Now THIS is a hack.
1105 */
1106 if (current_buffer == &telemetry_buffer) {
1107 telemetry_bytes_since_last_mark += (current_buffer->current_position - current_record_start);
1108 if (telemetry_bytes_since_last_mark > telemetry_buffer_notify_at) {
1109 notify = true;
1110 }
1111 }
1112
1113 if (out_current_record_start != NULL) {
1114 *out_current_record_start = current_record_start;
1115 }
1116
1117 cancel_sample:
1118 if (release_buffer_lock) {
1119 lck_mtx_unlock(buffer_mtx);
1120 }
1121
1122 if (notify) {
1123 telemetry_notify_user();
1124 }
1125
1126 if (uuid_info_array != NULL) {
1127 kfree_data(uuid_info_array, uuid_info_array_size);
1128 }
1129
1130 return rv;
1131 }
1132
#if TELEMETRY_DEBUG
/*
 * Debug-only helper: walk a chunk of the telemetry buffer and log the
 * timestamp of every micro-snapshot record found in it.
 *
 * buf: base address of the telemetry buffer
 * pos: byte offset of the chunk within the buffer
 * sz:  length of the chunk, in bytes
 */
static void
log_telemetry_output(vm_offset_t buf, uint32_t pos, uint32_t sz)
{
	vm_offset_t chunk = buf + pos;

	printf("Copying out %d bytes of telemetry at offset %d\n", sz, pos);

	/*
	 * Records carry no alignment guarantee, so probe for the snapshot
	 * magic at every byte offset and log each timestamp we find.
	 */
	for (uint32_t offset = 0; offset < sz; offset++) {
		struct micro_snapshot *snap = (struct micro_snapshot *)(chunk + offset);
		if (snap->snapshot_magic == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
			printf("telemetry timestamp: %lld\n", snap->ms_time);
		}
	}
}
#endif
1155
1156 int
telemetry_gather(user_addr_t buffer,uint32_t * length,bool mark)1157 telemetry_gather(user_addr_t buffer, uint32_t *length, bool mark)
1158 {
1159 return telemetry_buffer_gather(buffer, length, mark, &telemetry_buffer);
1160 }
1161
/*
 * Copy a micro-stackshot ring buffer out to userland, ordered from the
 * oldest record to the most recent.
 *
 * buffer:         userland destination address
 * length:         in: capacity of the destination buffer;
 *                 out: number of bytes actually copied
 * mark:           if true and any data was returned, reset the
 *                 bytes-since-last-mark count used to decide when to
 *                 notify userspace
 * current_buffer: the ring buffer to drain (the global telemetry buffer
 *                 or a client-specific one)
 *
 * Returns 0 on success, a copyout() error, or KERN_NO_SPACE if the caller's
 * buffer is smaller than the ring buffer.
 * NOTE(review): KERN_NO_SPACE is a kern_return_t value flowing through an
 * otherwise errno-style int return — callers appear to only test for
 * nonzero, but worth confirming.
 */
int
telemetry_buffer_gather(user_addr_t buffer, uint32_t *length, bool mark, struct micro_snapshot_buffer * current_buffer)
{
	int result = 0;
	uint32_t oldest_record_offset;

	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_START,
	    mark, telemetry_bytes_since_last_mark, 0,
	    (&telemetry_buffer != current_buffer));

	TELEMETRY_LOCK();

	/* No buffer allocated yet: nothing to copy out. */
	if (current_buffer->buffer == 0) {
		*length = 0;
		goto out;
	}

	/* The caller must supply space for the entire ring buffer. */
	if (*length < current_buffer->size) {
		result = KERN_NO_SPACE;
		goto out;
	}

	/*
	 * Copy the ring buffer out to userland in order sorted by time: least recent to most recent.
	 * First, we need to search forward from the cursor to find the oldest record in our buffer.
	 */
	oldest_record_offset = current_buffer->current_position;
	do {
		/* Scan ran past the valid region without finding a record. */
		if (((oldest_record_offset + sizeof(uint32_t)) > current_buffer->size) ||
		    ((oldest_record_offset + sizeof(uint32_t)) > current_buffer->end_point)) {
			if (*(uint32_t *)(uintptr_t)(current_buffer->buffer) == 0) {
				/*
				 * There is no magic number at the start of the buffer, which means
				 * it's empty; nothing to see here yet.
				 */
				*length = 0;
				goto out;
			}
			/*
			 * We've looked through the end of the active buffer without finding a valid
			 * record; that means all valid records are in a single chunk, beginning at
			 * the very start of the buffer.
			 */

			oldest_record_offset = 0;
			assert(*(uint32_t *)(uintptr_t)(current_buffer->buffer) == STACKSHOT_MICRO_SNAPSHOT_MAGIC);
			break;
		}

		if (*(uint32_t *)(uintptr_t)(current_buffer->buffer + oldest_record_offset) == STACKSHOT_MICRO_SNAPSHOT_MAGIC) {
			break;
		}

		/*
		 * There are no alignment guarantees for micro-stackshot records, so we must search at each
		 * byte offset.
		 */
		oldest_record_offset++;
	} while (oldest_record_offset != current_buffer->current_position);

	/*
	 * If needed, copyout in two chunks: from the oldest record to the end of the buffer, and then
	 * from the beginning of the buffer up to the current position.
	 */
	if (oldest_record_offset != 0) {
#if TELEMETRY_DEBUG
		log_telemetry_output(current_buffer->buffer, oldest_record_offset,
		    current_buffer->end_point - oldest_record_offset);
#endif
		if ((result = copyout((void *)(current_buffer->buffer + oldest_record_offset), buffer,
		    current_buffer->end_point - oldest_record_offset)) != 0) {
			*length = 0;
			goto out;
		}
		*length = current_buffer->end_point - oldest_record_offset;
	} else {
		*length = 0;
	}

#if TELEMETRY_DEBUG
	log_telemetry_output(current_buffer->buffer, 0, current_buffer->current_position);
#endif
	if ((result = copyout((void *)current_buffer->buffer, buffer + *length,
	    current_buffer->current_position)) != 0) {
		*length = 0;
		goto out;
	}
	*length += (uint32_t)current_buffer->current_position;

out:

	/* Reset the notify accounting only if we actually handed data back. */
	if (mark && (*length > 0)) {
		telemetry_bytes_since_last_mark = 0;
	}

	TELEMETRY_UNLOCK();

	KDBG(MACHDBG_CODE(DBG_MACH_STACKSHOT, MICROSTACKSHOT_GATHER) | DBG_FUNC_END,
	    current_buffer->current_position, *length,
	    current_buffer->end_point, (&telemetry_buffer != current_buffer));

	return result;
}
1265
1266 #if CONFIG_MACF
1267 static int
telemetry_macf_init_locked(size_t buffer_size)1268 telemetry_macf_init_locked(size_t buffer_size)
1269 {
1270 kern_return_t kr;
1271
1272 if (buffer_size > TELEMETRY_MAX_BUFFER_SIZE) {
1273 buffer_size = TELEMETRY_MAX_BUFFER_SIZE;
1274 }
1275
1276 telemetry_macf_buffer.size = buffer_size;
1277
1278 kr = kmem_alloc(kernel_map, &telemetry_macf_buffer.buffer,
1279 telemetry_macf_buffer.size, KMA_DATA | KMA_ZERO | KMA_PERMANENT,
1280 VM_KERN_MEMORY_SECURITY);
1281
1282 if (kr != KERN_SUCCESS) {
1283 kprintf("Telemetry (MACF): Allocation failed: %d\n", kr);
1284 return ENOMEM;
1285 }
1286
1287 return 0;
1288 }
1289
1290 int
telemetry_macf_mark_curthread(void)1291 telemetry_macf_mark_curthread(void)
1292 {
1293 thread_t thread = current_thread();
1294 task_t task = get_threadtask(thread);
1295 int rv = 0;
1296
1297 if (task == kernel_task) {
1298 /* Kernel threads never return to an AST boundary, and are ineligible */
1299 return EINVAL;
1300 }
1301
1302 /* Initialize the MACF telemetry buffer if needed. */
1303 TELEMETRY_MACF_LOCK();
1304 if (__improbable(telemetry_macf_buffer.size == 0)) {
1305 rv = telemetry_macf_init_locked(TELEMETRY_MACF_DEFAULT_BUFFER_SIZE);
1306
1307 if (rv != 0) {
1308 return rv;
1309 }
1310 }
1311 TELEMETRY_MACF_UNLOCK();
1312
1313 act_set_macf_telemetry_ast(thread);
1314 return 0;
1315 }
1316 #endif /* CONFIG_MACF */
1317
1318 /************************/
1319 /* BOOT PROFILE SUPPORT */
1320 /************************/
1321 /*
1322 * Boot Profiling
1323 *
1324 * The boot-profiling support is a mechanism to sample activity happening on the
1325 * system during boot. This mechanism sets up a periodic timer and on every timer fire,
1326 * captures a full backtrace into the boot profiling buffer. This buffer can be pulled
1327 * out and analyzed from user-space. It is turned on using the following boot-args:
1328 * "bootprofile_buffer_size" specifies the size of the boot profile buffer
1329 * "bootprofile_interval_ms" specifies the interval for the profiling timer
1330 *
1331 * Process Specific Boot Profiling
1332 *
1333 * The boot-arg "bootprofile_proc_name" can be used to specify a certain
 * process that needs to be profiled during boot. Setting this boot-arg changes
1335 * the way stackshots are captured. At every timer fire, the code looks at the
1336 * currently running process and takes a stackshot only if the requested process
1337 * is on-core (which makes it unsuitable for MP systems).
1338 *
1339 * Trigger Events
1340 *
1341 * The boot-arg "bootprofile_type=boot" starts the timer during early boot. Using
1342 * "wake" starts the timer at AP wake from suspend-to-RAM.
1343 */
1344
#define BOOTPROFILE_MAX_BUFFER_SIZE (64*1024*1024) /* see also COPYSIZELIMIT_PANIC */

/* Sample storage: one flat buffer, filled front-to-back by the timer call. */
vm_offset_t bootprofile_buffer = 0;
uint32_t bootprofile_buffer_size = 0;
uint32_t bootprofile_buffer_current_position = 0;
/* Configuration parsed from the bootprofile_* boot-args. */
uint32_t bootprofile_interval_ms = 0;
uint64_t bootprofile_stackshot_flags = 0;
/* Timer state; interval_abs == 0 also serves as the "profiling stopped" flag. */
uint64_t bootprofile_interval_abs = 0;
uint64_t bootprofile_next_deadline = 0;
/* Process filter: sample all processes, or only bootprofile_proc_name. */
uint32_t bootprofile_all_procs = 0;
char bootprofile_proc_name[17];
/* Timestamp of the last successful snapshot, base for delta stackshots. */
uint64_t bootprofile_delta_since_timestamp = 0;
LCK_GRP_DECLARE(bootprofile_lck_grp, "bootprofile_group");
LCK_MTX_DECLARE(bootprofile_mtx, &bootprofile_lck_grp);
1359
1360
/* How boot profiling was requested via the "bootprofile_type" boot-arg. */
enum {
	kBootProfileDisabled = 0,       /* profiling off */
	kBootProfileStartTimerAtBoot,   /* arm the sample timer during early boot */
	kBootProfileStartTimerAtWake    /* arm the sample timer at wake from sleep */
} bootprofile_type = kBootProfileDisabled;


static timer_call_data_t bootprofile_timer_call_entry;

#define BOOTPROFILE_LOCK() do { lck_mtx_lock(&bootprofile_mtx); } while(0)
/* Non-blocking acquire for timer-call context (see bootprofile_timer_call). */
#define BOOTPROFILE_TRY_SPIN_LOCK() lck_mtx_try_lock_spin(&bootprofile_mtx)
#define BOOTPROFILE_UNLOCK() do { lck_mtx_unlock(&bootprofile_mtx); } while(0)

static void bootprofile_timer_call(
	timer_call_param_t param0,
	timer_call_param_t param1);
1377
/*
 * Parse the bootprofile_* boot-args, allocate the boot profile buffer and,
 * for the "boot" trigger type, arm the periodic sampling timer.  Profiling
 * is enabled only when a trigger type, a buffer size and an interval are
 * all supplied.
 */
void
bootprofile_init(void)
{
	kern_return_t ret;
	char type[32];

	if (!PE_parse_boot_argn("bootprofile_buffer_size",
	    &bootprofile_buffer_size, sizeof(bootprofile_buffer_size))) {
		bootprofile_buffer_size = 0;
	}

	/* Clamp the buffer to the documented maximum. */
	if (bootprofile_buffer_size > BOOTPROFILE_MAX_BUFFER_SIZE) {
		bootprofile_buffer_size = BOOTPROFILE_MAX_BUFFER_SIZE;
	}

	if (!PE_parse_boot_argn("bootprofile_interval_ms",
	    &bootprofile_interval_ms, sizeof(bootprofile_interval_ms))) {
		bootprofile_interval_ms = 0;
	}

	if (!PE_parse_boot_argn("bootprofile_stackshot_flags",
	    &bootprofile_stackshot_flags, sizeof(bootprofile_stackshot_flags))) {
		bootprofile_stackshot_flags = 0;
	}

	/* Without a specific process name, profile whatever is on-core. */
	if (!PE_parse_boot_argn("bootprofile_proc_name",
	    &bootprofile_proc_name, sizeof(bootprofile_proc_name))) {
		bootprofile_all_procs = 1;
		bootprofile_proc_name[0] = '\0';
	}

	if (PE_parse_boot_argn("bootprofile_type", type, sizeof(type))) {
		if (0 == strcmp(type, "boot")) {
			bootprofile_type = kBootProfileStartTimerAtBoot;
		} else if (0 == strcmp(type, "wake")) {
			bootprofile_type = kBootProfileStartTimerAtWake;
		} else {
			bootprofile_type = kBootProfileDisabled;
		}
	} else {
		bootprofile_type = kBootProfileDisabled;
	}

	clock_interval_to_absolutetime_interval(bootprofile_interval_ms, NSEC_PER_MSEC, &bootprofile_interval_abs);

	/* Both boot args must be set to enable */
	if ((bootprofile_type == kBootProfileDisabled) || (bootprofile_buffer_size == 0) || (bootprofile_interval_abs == 0)) {
		return;
	}

	ret = kmem_alloc(kernel_map, &bootprofile_buffer, bootprofile_buffer_size,
	    KMA_DATA | KMA_ZERO | KMA_PERMANENT, VM_KERN_MEMORY_DIAG);
	if (ret != KERN_SUCCESS) {
		kprintf("Boot profile: Allocation failed: %d\n", ret);
		return;
	}

	kprintf("Boot profile: Sampling %s once per %u ms at %s\n",
	    bootprofile_all_procs ? "all procs" : bootprofile_proc_name, bootprofile_interval_ms,
	    bootprofile_type == kBootProfileStartTimerAtBoot ? "boot" : (bootprofile_type == kBootProfileStartTimerAtWake ? "wake" : "unknown"));

	timer_call_setup(&bootprofile_timer_call_entry,
	    bootprofile_timer_call,
	    NULL);

	/* "boot" trigger starts sampling now; "wake" waits for wake-from-sleep. */
	if (bootprofile_type == kBootProfileStartTimerAtBoot) {
		bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
		timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
		    NULL,
		    bootprofile_next_deadline,
		    0,
		    TIMER_CALL_SYS_NORMAL,
		    false);
	}
}
1453
1454 void
bootprofile_wake_from_sleep(void)1455 bootprofile_wake_from_sleep(void)
1456 {
1457 if (bootprofile_type == kBootProfileStartTimerAtWake) {
1458 bootprofile_next_deadline = mach_absolute_time() + bootprofile_interval_abs;
1459 timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
1460 NULL,
1461 bootprofile_next_deadline,
1462 0,
1463 TIMER_CALL_SYS_NORMAL,
1464 false);
1465 }
1466 }
1467
1468
/*
 * Periodic boot-profile timer callback: take a (possibly process-filtered)
 * stackshot into the next free chunk of the boot profile buffer, then
 * re-arm the timer unless the buffer is exhausted or profiling has been
 * stopped (interval cleared by bootprofile_gather()).
 */
static void
bootprofile_timer_call(
	timer_call_param_t param0 __unused,
	timer_call_param_t param1 __unused)
{
	unsigned retbytes = 0;
	int pid_to_profile = -1;

	/*
	 * Timer context can't block on the mutex; if someone else holds it
	 * (e.g. the gather path), skip this sample and try again next tick.
	 */
	if (!BOOTPROFILE_TRY_SPIN_LOCK()) {
		goto reprogram;
	}

	/* Check if process-specific boot profiling is turned on */
	if (!bootprofile_all_procs) {
		/*
		 * Since boot profiling initializes really early in boot, it is
		 * possible that at this point, the task/proc is not initialized.
		 * Nothing to do in that case.
		 */

		if ((current_task() != NULL) && (current_task()->bsd_info != NULL) &&
		    (0 == strncmp(bootprofile_proc_name, proc_name_address(current_task()->bsd_info), 17))) {
			pid_to_profile = proc_selfpid();
		} else {
			/*
			 * Process-specific boot profiling requested but the on-core process is
			 * something else. Nothing to do here.
			 */
			BOOTPROFILE_UNLOCK();
			goto reprogram;
		}
	}

	/* initiate a stackshot with whatever portion of the buffer is left */
	if (bootprofile_buffer_current_position < bootprofile_buffer_size) {
		uint64_t flags = STACKSHOT_KCDATA_FORMAT | STACKSHOT_TRYLOCK | STACKSHOT_SAVE_LOADINFO
		    | STACKSHOT_GET_GLOBAL_MEM_STATS;
#if defined(XNU_TARGET_OS_OSX)
		flags |= STACKSHOT_SAVE_KEXT_LOADINFO;
#endif


		/* OR on flags specified in boot-args */
		flags |= bootprofile_stackshot_flags;
		if ((flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) && (bootprofile_delta_since_timestamp == 0)) {
			/* Can't take deltas until the first one */
			flags &= ~STACKSHOT_COLLECT_DELTA_SNAPSHOT;
		}

		/* Remember when this snapshot ran so the next delta is relative to it. */
		uint64_t timestamp = 0;
		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) {
			timestamp = mach_absolute_time();
		}

		kern_return_t r = stack_snapshot_from_kernel(
			pid_to_profile, (void *)(bootprofile_buffer + bootprofile_buffer_current_position),
			bootprofile_buffer_size - bootprofile_buffer_current_position,
			flags, bootprofile_delta_since_timestamp, 0, &retbytes);

		/*
		 * We call with STACKSHOT_TRYLOCK because the stackshot lock is coarser
		 * than the bootprofile lock. If someone else has the lock we'll just
		 * try again later.
		 */

		if (r == KERN_LOCK_OWNED) {
			BOOTPROFILE_UNLOCK();
			goto reprogram;
		}

		/* Only advance the delta base after a snapshot actually landed. */
		if (bootprofile_stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT &&
		    r == KERN_SUCCESS) {
			bootprofile_delta_since_timestamp = timestamp;
		}

		bootprofile_buffer_current_position += retbytes;
	}

	BOOTPROFILE_UNLOCK();

	/* If we didn't get any data or have run out of buffer space, stop profiling */
	if ((retbytes == 0) || (bootprofile_buffer_current_position == bootprofile_buffer_size)) {
		return;
	}


reprogram:
	/* If the user gathered the buffer, no need to keep profiling */
	if (bootprofile_interval_abs == 0) {
		return;
	}

	clock_deadline_for_periodic_event(bootprofile_interval_abs,
	    mach_absolute_time(),
	    &bootprofile_next_deadline);
	timer_call_enter_with_leeway(&bootprofile_timer_call_entry,
	    NULL,
	    bootprofile_next_deadline,
	    0,
	    TIMER_CALL_SYS_NORMAL,
	    false);
}
1571
1572 void
bootprofile_get(void ** buffer,uint32_t * length)1573 bootprofile_get(void **buffer, uint32_t *length)
1574 {
1575 BOOTPROFILE_LOCK();
1576 *buffer = (void*) bootprofile_buffer;
1577 *length = bootprofile_buffer_current_position;
1578 BOOTPROFILE_UNLOCK();
1579 }
1580
1581 int
bootprofile_gather(user_addr_t buffer,uint32_t * length)1582 bootprofile_gather(user_addr_t buffer, uint32_t *length)
1583 {
1584 int result = 0;
1585
1586 BOOTPROFILE_LOCK();
1587
1588 if (bootprofile_buffer == 0) {
1589 *length = 0;
1590 goto out;
1591 }
1592
1593 if (*length < bootprofile_buffer_current_position) {
1594 result = KERN_NO_SPACE;
1595 goto out;
1596 }
1597
1598 if ((result = copyout((void *)bootprofile_buffer, buffer,
1599 bootprofile_buffer_current_position)) != 0) {
1600 *length = 0;
1601 goto out;
1602 }
1603 *length = bootprofile_buffer_current_position;
1604
1605 /* cancel future timers */
1606 bootprofile_interval_abs = 0;
1607
1608 out:
1609
1610 BOOTPROFILE_UNLOCK();
1611
1612 return result;
1613 }
1614