xref: /xnu-12377.1.9/osfmk/kern/kern_stackshot.c (revision f6217f891ac0bb64f3d375211650a4c1ff8ca1ea)
1 /*
2  * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <mach/mach_types.h>
31 #include <mach/vm_param.h>
32 #include <mach/mach_vm.h>
33 #include <mach/clock_types.h>
34 #include <sys/code_signing.h>
35 #include <sys/errno.h>
36 #include <sys/stackshot.h>
37 #if defined(__arm64__)
38 #include <arm/cpu_internal.h>
39 #endif /* __arm64__ */
40 #ifdef IMPORTANCE_INHERITANCE
41 #include <ipc/ipc_importance.h>
42 #endif
43 #include <sys/appleapiopts.h>
44 #include <kern/debug.h>
45 #include <kern/block_hint.h>
46 #include <uuid/uuid.h>
47 
48 #include <kdp/kdp_dyld.h>
49 #include <kdp/kdp_en_debugger.h>
50 #include <kdp/processor_core.h>
51 #include <kdp/kdp_common.h>
52 
53 #include <libsa/types.h>
54 #include <libkern/version.h>
55 #include <libkern/section_keywords.h>
56 
57 #include <string.h> /* bcopy */
58 
59 #include <kern/kern_stackshot.h>
60 #include <kern/kcdata_private.h>
61 #include <kern/backtrace.h>
62 #include <kern/coalition.h>
63 #include <kern/epoch_sync.h>
64 #include <kern/exclaves_stackshot.h>
65 #include <kern/exclaves_inspection.h>
66 #include <kern/processor.h>
67 #include <kern/host_statistics.h>
68 #include <kern/counter.h>
69 #include <kern/thread.h>
70 #include <kern/thread_group.h>
71 #include <kern/task.h>
72 #include <kern/telemetry.h>
73 #include <kern/clock.h>
74 #include <kern/policy_internal.h>
75 #include <kern/socd_client.h>
76 #include <kern/startup.h>
77 #include <vm/pmap.h>
78 #include <vm/vm_map_xnu.h>
79 #include <vm/vm_kern_xnu.h>
80 #include <vm/vm_pageout.h>
81 #include <vm/vm_fault.h>
82 #include <vm/vm_shared_region_xnu.h>
83 #include <vm/vm_compressor_xnu.h>
84 #include <libkern/OSKextLibPrivate.h>
85 #include <os/log.h>
86 
87 
88 
89 #ifdef CONFIG_EXCLAVES
90 #include <kern/exclaves.tightbeam.h>
91 #endif /* CONFIG_EXCLAVES */
92 
93 #include <kern/exclaves_test_stackshot.h>
94 
95 #include <libkern/coreanalytics/coreanalytics.h>
96 
97 #if defined(__x86_64__)
98 #include <i386/mp.h>
99 #include <i386/cpu_threads.h>
100 #endif
101 
102 #include <pexpert/pexpert.h>
103 
104 #if CONFIG_PERVASIVE_CPI
105 #include <kern/monotonic.h>
106 #endif /* CONFIG_PERVASIVE_CPI */
107 
108 #include <san/kasan.h>
109 
110 #if DEBUG || DEVELOPMENT
111 #define STACKSHOT_COLLECTS_DIAGNOSTICS 1
112 #define STACKSHOT_COLLECTS_LATENCY_INFO 1
113 #else
114 #define STACKSHOT_COLLECTS_DIAGNOSTICS 0
115 #define STACKSHOT_COLLECTS_LATENCY_INFO 0
116 #endif /* DEBUG || DEVELOPMENT */
117 
118 #define STACKSHOT_COLLECTS_RDAR_126582377_DATA 0
119 
120 #if defined(__AMP__)
121 #define STACKSHOT_NUM_WORKQUEUES 2
122 #else /* __AMP__ */
123 #define STACKSHOT_NUM_WORKQUEUES 1
124 #endif
125 
126 #if defined(__arm64__)
127 #define STACKSHOT_NUM_BUFFERS MAX_CPU_CLUSTERS
128 #else /* __arm64__ */
129 #define STACKSHOT_NUM_BUFFERS 1
130 #endif /* __arm64__ */
131 
132 /* The number of threads which will land a task in the hardest workqueue. */
133 #define STACKSHOT_HARDEST_THREADCOUNT 10
134 
135 TUNABLE_DEV_WRITEABLE(unsigned int, stackshot_single_thread, "stackshot_single_thread", 0);
136 
137 extern unsigned int not_in_kdp;
138 
139 /* indicate to the compiler that some accesses are unaligned */
140 typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
141 
142 int kdp_snapshot                            = 0;
143 
144 #pragma mark ---Stackshot Struct Definitions---
145 
146 typedef struct linked_kcdata_descriptor {
147 	struct kcdata_descriptor          kcdata;
148 	struct linked_kcdata_descriptor  *next;
149 } * linked_kcdata_descriptor_t;
150 
151 struct stackshot_workitem {
152 	task_t                        sswi_task;
153 	linked_kcdata_descriptor_t    sswi_data; /* The kcdata for this task. */
154 	int                           sswi_idx;  /* The index of this job, used for ordering kcdata across multiple queues. */
155 };
156 
157 struct stackshot_workqueue {
158 	uint32_t _Atomic              sswq_num_items; /* Only modified by main CPU */
159 	uint32_t _Atomic              sswq_cur_item; /* Modified by all CPUs */
160 	size_t                        sswq_capacity; /* Constant after preflight */
161 	bool _Atomic                  sswq_populated; /* Only modified by main CPU */
162 	struct stackshot_workitem    *__counted_by(sswq_capacity) sswq_items;
163 };
164 
165 struct freelist_entry {
166 	struct freelist_entry        *fl_next; /* Next entry in the freelist */
167 	size_t                        fl_size; /* Size of the entry (must be >= sizeof(struct freelist_entry)) */
168 };
169 
170 struct stackshot_buffer {
171 	void                         *ssb_ptr; /* Base of buffer */
172 	size_t                        ssb_size;
173 	size_t _Atomic                ssb_used;
174 	struct freelist_entry        *ssb_freelist; /* First freelist entry */
175 	int _Atomic                   ssb_freelist_lock;
176 	size_t _Atomic                ssb_overhead; /* Total amount ever freed (even if re-allocated from freelist) */
177 };
178 
179 struct kdp_snapshot_args {
180 	int                           pid;
181 	void                         *buffer;
182 	struct kcdata_descriptor     *descriptor;
183 	uint32_t                      buffer_size;
184 	uint64_t                      flags;
185 	uint64_t                      since_timestamp;
186 	uint32_t                      pagetable_mask;
187 };
188 
189 /*
190  * Keep a simple cache of the most recent validation done at a page granularity
191  * to avoid the expensive software KVA-to-phys translation in the VM.
192  */
193 
194 struct _stackshot_validation_state {
195 	vm_offset_t last_valid_page_kva;
196 	size_t last_valid_size;
197 };
198 
199 /* CPU-local generation counts for PLH */
200 struct _stackshot_plh_gen_state {
201 	uint8_t                *pgs_gen;       /* last 'gen #' seen in */
202 	int16_t                 pgs_curgen_min; /* min idx seen for this gen */
203 	int16_t                 pgs_curgen_max; /* max idx seen for this gen */
204 	uint8_t                 pgs_curgen;     /* current gen */
205 };
206 
207 /*
208  * For port labels, we have a small hash table we use to track the
209  * struct ipc_service_port_label pointers we see along the way.
210  * This structure encapsulates the global state.
211  *
212  * The hash table is insert-only, similar to "intern"ing strings.  It's
213  * only used an manipulated in during the stackshot collection.  We use
214  * seperate chaining, with the hash elements and chains being int16_ts
215  * indexes into the parallel arrays, with -1 ending the chain.  Array indices are
216  * allocated using a bump allocator.
217  *
218  * The parallel arrays contain:
219  *      - plh_array[idx]	the pointer entered
220  *      - plh_chains[idx]	the hash chain
221  *      - plh_gen[idx]		the last 'generation #' seen
222  *
223  * Generation IDs are used to track entries looked up in the current
224  * task; 0 is never used, and the plh_gen array is cleared to 0 on
225  * rollover.
226  *
227  * The portlabel_ids we report externally are just the index in the array,
228  * plus 1 to avoid 0 as a value.  0 is NONE, -1 is UNKNOWN (e.g. there is
229  * one, but we ran out of space)
230  */
231 struct port_label_hash {
232 	int _Atomic             plh_lock;       /* lock for concurrent modifications to this plh */
233 	uint16_t                plh_size;       /* size of allocations; 0 disables tracking */
234 	uint16_t                plh_count;      /* count of used entries in plh_array */
235 	struct ipc_service_port_label **plh_array; /* _size allocated, _count used */
236 	int16_t                *plh_chains;    /* _size allocated */
237 	int16_t                *plh_hash;      /* (1 << STACKSHOT_PLH_SHIFT) entry hash table: hash(ptr) -> array index */
238 #if DEVELOPMENT || DEBUG
239 	/* statistics */
240 	uint32_t _Atomic        plh_lookups;    /* # lookups or inserts */
241 	uint32_t _Atomic        plh_found;
242 	uint32_t _Atomic        plh_found_depth;
243 	uint32_t _Atomic        plh_insert;
244 	uint32_t _Atomic        plh_insert_depth;
245 	uint32_t _Atomic        plh_bad;
246 	uint32_t _Atomic        plh_bad_depth;
247 	uint32_t _Atomic        plh_lookup_send;
248 	uint32_t _Atomic        plh_lookup_receive;
249 #define PLH_STAT_OP(...)    (void)(__VA_ARGS__)
250 #else /* DEVELOPMENT || DEBUG */
251 #define PLH_STAT_OP(...)    (void)(0)
252 #endif /* DEVELOPMENT || DEBUG */
253 };
254 
255 #define plh_lock(plh) while(!os_atomic_cmpxchg(&(plh)->plh_lock, 0, 1, acquire)) { loop_wait(); }
256 #define plh_unlock(plh) os_atomic_store(&(plh)->plh_lock, 0, release);
257 
258 #define STACKSHOT_PLH_SHIFT    7
259 #define STACKSHOT_PLH_SIZE_MAX ((kdp_ipc_have_splabel)? 1024 : 0)
260 size_t stackshot_port_label_size = (2 * (1u << STACKSHOT_PLH_SHIFT));
261 #define STASKSHOT_PLH_SIZE(x) MIN((x), STACKSHOT_PLH_SIZE_MAX)
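/*
 * A minimal userspace sketch of the intern-table scheme described above:
 * separate chaining over parallel arrays, bump-allocated indices, and -1
 * terminating each chain.  The demo_* names and the hash function are
 * illustrative assumptions only and are not part of the stackshot code.
 */
#if 0 /* Illustrative sketch; not compiled. */
#include <stdint.h>
#include <string.h>

#define DEMO_PLH_SHIFT   7
#define DEMO_PLH_BUCKETS (1u << DEMO_PLH_SHIFT)
#define DEMO_PLH_SIZE    32

struct demo_plh {
	const void *array[DEMO_PLH_SIZE];   /* pointers interned so far */
	int16_t     chains[DEMO_PLH_SIZE];  /* next index in the same bucket, -1 ends the chain */
	int16_t     hash[DEMO_PLH_BUCKETS]; /* bucket heads, -1 when empty */
	uint16_t    count;                  /* bump allocator for array indices */
};

static void
demo_plh_init(struct demo_plh *plh)
{
	memset(plh->hash, 0xff, sizeof(plh->hash)); /* every bucket starts empty (-1) */
	plh->count = 0;
}

/* Returns the externally visible id (index + 1), interning ptr on first sight;
 * -1 means the table is full, mirroring the UNKNOWN case described above. */
static int16_t
demo_plh_lookup_or_insert(struct demo_plh *plh, const void *ptr)
{
	uint16_t bucket = (uint16_t)(((uintptr_t)ptr >> 4) & (DEMO_PLH_BUCKETS - 1));

	for (int16_t idx = plh->hash[bucket]; idx >= 0; idx = plh->chains[idx]) {
		if (plh->array[idx] == ptr) {
			return (int16_t)(idx + 1);      /* already interned */
		}
	}
	if (plh->count >= DEMO_PLH_SIZE) {
		return -1;                              /* out of slots */
	}
	int16_t idx = (int16_t)plh->count++;            /* bump-allocate a new slot */
	plh->array[idx] = ptr;
	plh->chains[idx] = plh->hash[bucket];           /* push onto the bucket's chain */
	plh->hash[bucket] = idx;
	return (int16_t)(idx + 1);
}
#endif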
262 
263 struct stackshot_cpu_context {
264 	bool                               scc_can_work; /* Whether the CPU can do more stackshot work */
265 	bool                               scc_did_work; /* Whether the CPU actually did any stackshot work */
266 	linked_kcdata_descriptor_t         scc_kcdata_head; /* See `linked_kcdata_alloc_callback` */
267 	linked_kcdata_descriptor_t         scc_kcdata_tail; /* See `linked_kcdata_alloc_callback` */
268 	uintptr_t                         *scc_stack_buffer; /* A buffer for stacktraces. */
269 	struct stackshot_fault_stats       scc_fault_stats;
270 	struct _stackshot_validation_state scc_validation_state;
271 	struct _stackshot_plh_gen_state    scc_plh_gen;
272 };
273 
274 /*
275  * When directly modifying the stackshot state, always use the macros below to
276  * work with this enum - the higher-order bits are used to store an error code
277  * in the case of SS_ERRORED.
278  *
279  *        +------------------------------------+-------------------+
280  *        |                                    |                   |
281  *        v                                    |                   |
282  * +-------------+     +----------+     +------------+     +------------+
283  * | SS_INACTIVE |---->| SS_SETUP |---->| SS_RUNNING |---->| SS_ERRORED |
284  * +-------------+     +----------+     +------------+     +------------+
285  *                         |  |                |                ^  |
286  *                         |  +----------------|----------------+  |
287  * +-------------+         |                   |                   |
288  * | SS_PANICKED |<--------+-------------------+                   |
289  * +-------------+                                                 |
290  *        ^                                                        |
291  *        |                                                        |
292  *        +--------------------------------------------------------+
293  */
294 __enum_closed_decl(stackshot_state_t, uint, {
295 	SS_INACTIVE = 0x0, /* -> SS_SETUP */
296 	SS_SETUP    = 0x1, /* -> SS_RUNNING, SS_ERRORED, SS_PANICKED */
297 	SS_RUNNING  = 0x2, /* -> SS_ERRORED, SS_PANICKED, SS_INACTIVE */
298 	SS_ERRORED  = 0x3, /* -> SS_INACTIVE, SS_PANICKED */
299 	SS_PANICKED = 0x4, /* -> N/A */
300 	_SS_COUNT
301 });
302 
303 static_assert(_SS_COUNT <= 0x5);
304 /* Get the stackshot state ID from a stackshot_state_t. */
305 #define SS_STATE(state) ((state) & 0x7u)
306 /* Get the error code from a stackshot_state_t. */
307 #define SS_ERRCODE(state) ((state) >> 3)
308 /* Make a stackshot error state with a given code. */
309 #define SS_MKERR(code) (((code) << 3) | SS_ERRORED)
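/*
 * A small illustration of the encoding above (not used by the implementation):
 * SS_MKERR() packs the error code above bit 3 and the state ID in the low bits,
 * so the two accessors recover both halves.
 */
#if 0 /* Illustrative only; not compiled. */
static_assert(SS_STATE(SS_MKERR(KERN_NO_SPACE)) == SS_ERRORED);
static_assert(SS_ERRCODE(SS_MKERR(KERN_NO_SPACE)) == KERN_NO_SPACE);
#endif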
310 
311 struct stackshot_context {
312 	/* Constants & Arguments */
313 	struct kdp_snapshot_args      sc_args;
314 	int                           sc_calling_cpuid;
315 	int                           sc_main_cpuid;
316 	bool                          sc_enable_faulting;
317 	uint64_t                      sc_microsecs; /* Timestamp */
318 	bool                          sc_panic_stackshot;
319 	size_t                        sc_min_kcdata_size;
320 	bool                          sc_is_singlethreaded;
321 
322 	/* State & Errors */
323 	stackshot_state_t _Atomic     sc_state; /* Only modified by calling CPU, main CPU, or panicking CPU. See comment above type definition for details. */
324 	kern_return_t                 sc_retval; /* The return value of the main thread */
325 	uint32_t _Atomic              sc_cpus_working;
326 
327 	/* KCData */
328 	linked_kcdata_descriptor_t    sc_pretask_kcdata;
329 	linked_kcdata_descriptor_t    sc_posttask_kcdata;
330 	kcdata_descriptor_t           sc_finalized_kcdata;
331 
332 	/* Buffers & Queues */
333 	struct stackshot_buffer       __counted_by(sc_num_buffers) sc_buffers[STACKSHOT_NUM_BUFFERS];
334 	size_t                        sc_num_buffers;
335 	struct stackshot_workqueue    __counted_by(STACKSHOT_NUM_WORKQUEUES) sc_workqueues[STACKSHOT_NUM_WORKQUEUES];
336 	struct port_label_hash        sc_plh;
337 
338 	/* Statistics */
339 	struct stackshot_duration_v2  sc_duration;
340 	uint32_t                      sc_bytes_traced;
341 	uint32_t                      sc_bytes_uncompressed;
342 #if STACKSHOT_COLLECTS_LATENCY_INFO
343 	struct stackshot_latency_collection_v2 sc_latency;
344 #endif
345 };
346 
347 #define STACKSHOT_DEBUG_TRACEBUF_SIZE 16
348 
349 struct stackshot_trace_entry {
350 	int               sste_line_no;
351 	uint64_t          sste_timestamp;
352 	mach_vm_address_t sste_data;
353 };
354 
355 struct stackshot_trace_buffer {
356 	uint64_t                     sstb_last_trace_timestamp;
357 	size_t                       sstb_tail_idx;
358 	size_t                       sstb_size;
359 	struct stackshot_trace_entry __counted_by(STACKSHOT_DEBUG_TRACEBUF_SIZE) sstb_entries[STACKSHOT_DEBUG_TRACEBUF_SIZE];
360 };
361 
362 #pragma mark ---Stackshot State and Data---
363 
364 /*
365  * Two stackshot states, one for panic and one for normal.
366  * That way, we can take a stackshot during a panic without clobbering state.
367  */
368 #define STACKSHOT_CTX_IDX_NORMAL 0
369 #define STACKSHOT_CTX_IDX_PANIC  1
370 size_t cur_stackshot_ctx_idx   = STACKSHOT_CTX_IDX_NORMAL;
371 struct stackshot_context stackshot_contexts[2] = {{0}, {0}};
372 #define stackshot_ctx (stackshot_contexts[cur_stackshot_ctx_idx])
373 #define stackshot_args (stackshot_ctx.sc_args)
374 #define stackshot_flags (stackshot_args.flags)
375 
376 static struct {
377 	uint64_t last_abs_start;      /* start time of last stackshot */
378 	uint64_t last_abs_end;        /* end time of last stackshot */
379 	uint64_t stackshots_taken;    /* total stackshots taken since boot */
380 	uint64_t stackshots_duration; /* total abs time spent in stackshot_trap() since boot */
381 } stackshot_stats = { 0 };
382 
383 #if STACKSHOT_COLLECTS_LATENCY_INFO
384 static struct stackshot_latency_cpu PERCPU_DATA(stackshot_cpu_latency_percpu);
385 #define stackshot_cpu_latency (*PERCPU_GET(stackshot_cpu_latency_percpu))
386 #endif
387 
388 static struct stackshot_cpu_context PERCPU_DATA(stackshot_cpu_ctx_percpu);
389 #define stackshot_cpu_ctx (*PERCPU_GET(stackshot_cpu_ctx_percpu))
390 
391 static struct kcdata_descriptor PERCPU_DATA(stackshot_kcdata_percpu);
392 #define stackshot_kcdata_p (PERCPU_GET(stackshot_kcdata_percpu))
393 
394 #if STACKSHOT_COLLECTS_LATENCY_INFO
395 static bool collect_latency_info = true;
396 #endif
397 
398 static uint64_t stackshot_max_fault_time;
399 
400 #if STACKSHOT_COLLECTS_DIAGNOSTICS
401 static struct stackshot_trace_buffer PERCPU_DATA(stackshot_trace_buffer);
402 #endif
403 
404 #pragma mark ---Stackshot Global State---
405 
406 uint32_t stackshot_estimate_adj = 25; /* experiment factor: 0-100, adjust our estimate up by this amount */
407 
408 static uint32_t stackshot_initial_estimate;
409 static uint32_t stackshot_initial_estimate_adj;
410 static uint64_t stackshot_duration_prior_abs;   /* prior attempts, abs */
411 static unaligned_u64 * stackshot_duration_outer;
412 static uint64_t stackshot_tries;
413 
414 void * kernel_stackshot_buf   = NULL; /* Pointer to buffer for stackshots triggered from the kernel and retrieved later */
415 int kernel_stackshot_buf_size = 0;
416 
417 void * stackshot_snapbuf = NULL; /* Used by stack_snapshot2 (to be removed) */
418 
419 #if CONFIG_EXCLAVES
420 static ctid_t *stackshot_exclave_inspect_ctids = NULL;
421 static size_t stackshot_exclave_inspect_ctid_count = 0;
422 static size_t stackshot_exclave_inspect_ctid_capacity = 0;
423 
424 static kern_return_t stackshot_exclave_kr = KERN_SUCCESS;
425 #endif /* CONFIG_EXCLAVES */
426 
427 #if DEBUG || DEVELOPMENT
428 TUNABLE(bool, disable_exclave_stackshot, "-disable_exclave_stackshot", false);
429 #else
430 const bool disable_exclave_stackshot = false;
431 #endif
432 
433 #pragma mark ---Stackshot Static Function Declarations---
434 
435 __private_extern__ void stackshot_init( void );
436 static boolean_t        memory_iszero(void *addr, size_t size);
437 static void             stackshot_cpu_do_work(void);
438 static kern_return_t    stackshot_finalize_kcdata(void);
439 static kern_return_t    stackshot_finalize_singlethreaded_kcdata(void);
440 static kern_return_t    stackshot_collect_kcdata(void);
441 static int              kdp_stackshot_kcdata_format(void);
442 static void             kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap);
443 static vm_offset_t      stackshot_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_flags, uint32_t *kdp_fault_result_flags);
444 static boolean_t        stackshot_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_result);
445 static int              stackshot_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results);
446 static boolean_t        stackshot_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results);
447 static uint64_t         proc_was_throttled_from_task(task_t task);
448 static void             stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_v2_t * waitinfo);
449 static int              stackshot_thread_has_valid_waitinfo(thread_t thread);
450 static void             stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_v2_t *tsinfo);
451 static int              stackshot_thread_has_valid_turnstileinfo(thread_t thread);
452 static uint32_t         get_stackshot_estsize(uint32_t prev_size_hint, uint32_t adj, uint64_t trace_flags, pid_t target_pid);
453 static kern_return_t    kdp_snapshot_preflight_internal(struct kdp_snapshot_args args);
454 
455 #if CONFIG_COALITIONS
456 static void             stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal);
457 static void             stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t coal);
458 #endif /* CONFIG_COALITIONS */
459 
460 #if CONFIG_THREAD_GROUPS
461 static void             stackshot_thread_group_count(void *arg, int i, struct thread_group *tg);
462 static void             stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg);
463 #endif /* CONFIG_THREAD_GROUPS */
464 
465 extern uint64_t         workqueue_get_task_ss_flags_from_pwq_state_kdp(void *proc);
466 
467 static kcdata_descriptor_t linked_kcdata_alloc_callback(kcdata_descriptor_t descriptor, size_t min_size);
468 
469 #pragma mark ---Stackshot Externs---
470 
471 struct proc;
472 extern int              proc_pid(struct proc *p);
473 extern uint64_t         proc_uniqueid(void *p);
474 extern uint64_t         proc_was_throttled(void *p);
475 extern uint64_t         proc_did_throttle(void *p);
476 extern int              proc_exiting(void *p);
477 extern int              proc_in_teardown(void *p);
478 static uint64_t         proc_did_throttle_from_task(task_t task);
479 extern void             proc_name_kdp(struct proc *p, char * buf, int size);
480 extern int              proc_threadname_kdp(void * uth, char * buf, size_t size);
481 extern void             proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime);
482 extern void             proc_archinfo_kdp(void* p, cpu_type_t* cputype, cpu_subtype_t* cpusubtype);
483 extern uint64_t         proc_getcsflags_kdp(void * p);
484 extern boolean_t        proc_binary_uuid_kdp(task_t task, uuid_t uuid);
485 extern uint32_t         proc_getuid(proc_t);
486 extern uint32_t         proc_getgid(proc_t);
487 extern void             proc_memstat_data_kdp(void *p, int32_t *current_memlimit, int32_t *prio_effective, int32_t *prio_requested, int32_t *prio_assertion);
488 extern int              memorystatus_get_pressure_status_kdp(void);
489 extern void             memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit, boolean_t *is_active, boolean_t *is_managed, boolean_t *has_assertion);
490 extern void             panic_stackshot_release_lock(void);
491 
492 extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */
493 
494 #if CONFIG_TELEMETRY
495 extern kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval);
496 #endif /* CONFIG_TELEMETRY */
497 
498 extern kern_return_t kern_stack_snapshot_with_reason(char* reason);
499 extern kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user);
500 
501 static size_t stackshot_plh_est_size(void);
502 
503 #if CONFIG_EXCLAVES
504 static kern_return_t collect_exclave_threads(uint64_t);
505 static kern_return_t stackshot_setup_exclave_waitlist(void);
506 static void stackshot_cleanup_exclave_waitlist(void);
507 #endif
508 
509 /*
510  * Validates that the given address for a word is both a valid page and has
511  * default caching attributes for the current map.
512  */
513 bool machine_trace_thread_validate_kva(vm_offset_t);
514 /*
515  * Validates a region that stackshot will potentially inspect.
516  */
517 static bool _stackshot_validate_kva(vm_offset_t, size_t);
518 /*
519  * Must be called whenever stackshot is re-driven.
520  */
521 static void _stackshot_validation_reset(void);
522 /*
523  * A kdp-safe strlen() call.  Returns:
524  *      -1 if we reach maxlen or a bad address before the end of the string, or
525  *      strlen(s)
526  */
527 static long _stackshot_strlen(const char *s, size_t maxlen);
528 
529 #define MAX_FRAMES 1000
530 #define STACKSHOT_PAGETABLE_BUFSZ 4000
531 #define MAX_LOADINFOS 500
532 #define MAX_DYLD_COMPACTINFO (20 * 1024)  // max bytes of compactinfo to include per proc/shared region
533 #define TASK_IMP_WALK_LIMIT 20
534 
535 typedef struct thread_snapshot *thread_snapshot_t;
536 typedef struct task_snapshot *task_snapshot_t;
537 
538 #if CONFIG_KDP_INTERACTIVE_DEBUGGING
539 extern kdp_send_t    kdp_en_send_pkt;
540 #endif
541 
542 /*
543  * Stackshot locking and other defines.
544  */
545 LCK_GRP_DECLARE(stackshot_subsys_lck_grp, "stackshot_subsys_lock");
546 LCK_MTX_DECLARE(stackshot_subsys_mutex, &stackshot_subsys_lck_grp);
547 
548 #define STACKSHOT_SUBSYS_LOCK() lck_mtx_lock(&stackshot_subsys_mutex)
549 #define STACKSHOT_SUBSYS_TRY_LOCK() lck_mtx_try_lock(&stackshot_subsys_mutex)
550 #define STACKSHOT_SUBSYS_UNLOCK() lck_mtx_unlock(&stackshot_subsys_mutex)
551 #define STACKSHOT_SUBSYS_ASSERT_LOCKED() lck_mtx_assert(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
552 
553 #define SANE_BOOTPROFILE_TRACEBUF_SIZE (64ULL * 1024ULL * 1024ULL)
554 #define SANE_TRACEBUF_SIZE (8ULL * 1024ULL * 1024ULL)
555 
556 #define TRACEBUF_SIZE_PER_GB (1024ULL * 1024ULL)
557 #define GIGABYTES (1024ULL * 1024ULL * 1024ULL)
558 
559 SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE;
560 
561 /*
562  * We currently set a ceiling of 3 milliseconds spent in the kdp fault path
563  * for non-panic stackshots where faulting is requested.
564  */
565 #define KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS (3 * NSEC_PER_MSEC)
566 
567 
568 #ifndef ROUNDUP
569 #define ROUNDUP(x, y)            ((((x)+(y)-1)/(y))*(y))
570 #endif
571 
572 #define STACKSHOT_QUEUE_LABEL_MAXSIZE  64
573 
574 #pragma mark ---Stackshot Useful Macros---
575 
576 #define kcd_end_address(kcd) ((void *)((uint64_t)((kcd)->kcd_addr_begin) + kcdata_memory_get_used_bytes((kcd))))
577 #define kcd_max_address(kcd) ((void *)((kcd)->kcd_addr_begin + (kcd)->kcd_length))
578 /*
579  * Use of the kcd_exit_on_error(action) macro requires a local
580  * 'kern_return_t error' variable and 'error_exit' label.
581  */
582 #define kcd_exit_on_error(action)                      \
583 	do {                                               \
584 	    if (KERN_SUCCESS != (error = (action))) {      \
585 	        STACKSHOT_TRACE(error);                    \
586 	        if (error == KERN_RESOURCE_SHORTAGE) {     \
587 	            error = KERN_INSUFFICIENT_BUFFER_SIZE; \
588 	        }                                          \
589 	        goto error_exit;                           \
590 	    }                                              \
591 	} while (0); /* end kcd_exit_on_error */
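/*
 * A minimal usage sketch of the kcd_exit_on_error() pattern (the helper name
 * example_emit_record and the record it emits are hypothetical; real call
 * sites appear throughout this file):
 */
#if 0 /* Illustrative sketch; not compiled. */
static kern_return_t
example_emit_record(kcdata_descriptor_t kcd)
{
	kern_return_t error = KERN_SUCCESS;         /* required local */
	mach_vm_address_t out_addr = 0;

	/* On failure this traces the error, maps KERN_RESOURCE_SHORTAGE to
	 * KERN_INSUFFICIENT_BUFFER_SIZE, and jumps to error_exit. */
	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS,
	    sizeof(struct mem_and_io_snapshot), &out_addr));
	/* ... fill in the record at out_addr ... */

error_exit:                                         /* required label */
	return error;
}
#endif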
592 
593 #if defined(__arm64__)
594 #define loop_wait_noguard() __builtin_arm_wfe()
595 #elif defined(__x86_64__)
596 #define loop_wait_noguard() __builtin_ia32_pause()
597 #else
598 #define loop_wait_noguard()
599 #endif /* __x86_64__ */
600 
601 #define loop_wait() { loop_wait_noguard(); stackshot_panic_guard(); }
602 
603 static inline void stackshot_panic_guard(void);
604 
605 static __attribute__((noreturn, noinline)) void
606 stackshot_panic_spin(void)
607 {
608 	if (stackshot_cpu_ctx.scc_can_work) {
609 		stackshot_cpu_ctx.scc_can_work = false;
610 		os_atomic_dec(&stackshot_ctx.sc_cpus_working, acquire);
611 	}
612 	if (stackshot_ctx.sc_calling_cpuid == cpu_number()) {
613 		while (os_atomic_load(&stackshot_ctx.sc_cpus_working, acquire) != 0) {
614 			loop_wait_noguard();
615 		}
616 		panic_stackshot_release_lock();
617 	}
618 	while (1) {
619 		loop_wait_noguard();
620 	}
621 }
622 
623 /**
624  * Immediately aborts if another CPU panicked during the stackshot.
625  */
626 static inline void
627 stackshot_panic_guard(void)
628 {
629 	if (__improbable(os_atomic_load(&stackshot_ctx.sc_state, relaxed) == SS_PANICKED)) {
630 		stackshot_panic_spin();
631 	}
632 }
633 
634 /*
635  * Signal that we panicked during a stackshot by setting an atomic flag and
636  * waiting for others to coalesce before continuing the panic. Other CPUs will
637  * spin on this as soon as they see it set in order to prevent multiple
638  * concurrent panics. The calling CPU (i.e. the one holding the debugger lock)
639  * will release it for us in `stackshot_panic_spin` so we can continue
640  * panicking.
641  *
642  * This is called from panic_trap_to_debugger.
643  */
644 void
645 stackshot_cpu_signal_panic(void)
646 {
647 	stackshot_state_t o_state;
648 	if (stackshot_active()) {
649 		/* Check if someone else panicked before we did. */
650 		o_state = os_atomic_xchg(&stackshot_ctx.sc_state, SS_PANICKED, seq_cst);
651 		if (o_state == SS_PANICKED) {
652 			stackshot_panic_spin();
653 		}
654 
655 		/* We're the first CPU to panic - wait for everyone to coalesce. */
656 		if (stackshot_cpu_ctx.scc_can_work) {
657 			stackshot_cpu_ctx.scc_can_work = false;
658 			os_atomic_dec(&stackshot_ctx.sc_cpus_working, acquire);
659 		}
660 		while (os_atomic_load(&stackshot_ctx.sc_cpus_working, seq_cst) != 0) {
661 			loop_wait_noguard();
662 		}
663 	}
664 }
665 
666 /*
667  * Sets the stackshot state to SS_ERRORED along with the error code.
668  * Only works if the current state is SS_RUNNING or SS_SETUP.
669  */
670 static inline void
671 stackshot_set_error(kern_return_t error)
672 {
673 	stackshot_state_t cur_state;
674 	stackshot_state_t err_state = SS_MKERR(error);
675 	if (__improbable(!os_atomic_cmpxchgv(&stackshot_ctx.sc_state, SS_RUNNING, err_state, &cur_state, seq_cst))) {
676 		if (cur_state == SS_SETUP) {
677 			os_atomic_cmpxchg(&stackshot_ctx.sc_state, SS_SETUP, err_state, seq_cst);
678 		} else {
679 			/* Our state is something other than SS_RUNNING or SS_SETUP... Check for panic. */
680 			stackshot_panic_guard();
681 		}
682 	}
683 }
684 
685 /* Returns an error code if the current stackshot context has errored out.
686  * Also functions as a panic guard.
687  */
688 __result_use_check
689 static inline kern_return_t
690 stackshot_status_check(void)
691 {
692 	stackshot_state_t state = os_atomic_load(&stackshot_ctx.sc_state, relaxed);
693 
694 	/* Check for panic */
695 	if (__improbable(SS_STATE(state) == SS_PANICKED)) {
696 		stackshot_panic_spin();
697 	}
698 
699 	/* Check for error */
700 	if (__improbable(SS_STATE(state) == SS_ERRORED)) {
701 		kern_return_t err = SS_ERRCODE(state);
702 		assert(err != KERN_SUCCESS); /* SS_ERRORED should always store an associated error code. */
703 		return err;
704 	}
705 
706 	return KERN_SUCCESS;
707 }
708 
709 #pragma mark ---Stackshot Tracing---
710 
711 #if STACKSHOT_COLLECTS_DIAGNOSTICS
712 static void
713 stackshot_trace(int line_no, mach_vm_address_t data)
714 {
715 	struct stackshot_trace_buffer *buffer = PERCPU_GET(stackshot_trace_buffer);
716 	buffer->sstb_entries[buffer->sstb_tail_idx] = (struct stackshot_trace_entry) {
717 		.sste_line_no = line_no,
718 		.sste_timestamp = mach_continuous_time(),
719 		.sste_data = data
720 	};
721 	buffer->sstb_tail_idx = (buffer->sstb_tail_idx + 1) % STACKSHOT_DEBUG_TRACEBUF_SIZE;
722 	buffer->sstb_size = MIN(buffer->sstb_size + 1, STACKSHOT_DEBUG_TRACEBUF_SIZE);
723 }
724 #define STACKSHOT_TRACE(data) stackshot_trace(__LINE__, (mach_vm_address_t) (data))
725 
726 #else /* STACKSHOT_COLLECTS_DIAGNOSTICS */
727 #define STACKSHOT_TRACE(data) ((void) data)
728 #endif /* !STACKSHOT_COLLECTS_DIAGNOSTICS */
729 
730 #pragma mark ---Stackshot Buffer Management---
731 
732 #define freelist_lock(buffer) while(!os_atomic_cmpxchg(&buffer->ssb_freelist_lock, 0, 1, acquire)) { loop_wait(); }
733 #define freelist_unlock(buffer) os_atomic_store(&buffer->ssb_freelist_lock, 0, release);
734 
735 /**
736  * Allocates some data from the shared stackshot buffer freelist.
737  * This should not be used directly; it is a last resort if we run out of space.
738  */
739 static void *
740 stackshot_freelist_alloc(
741 	size_t size,
742 	struct stackshot_buffer *buffer,
743 	kern_return_t *error)
744 {
745 	struct freelist_entry **cur_freelist, **best_freelist = NULL, *ret = NULL;
746 
747 	freelist_lock(buffer);
748 
749 	cur_freelist = &buffer->ssb_freelist;
750 
751 	while (*cur_freelist != NULL) {
752 		if (((*cur_freelist)->fl_size >= size) && ((best_freelist == NULL) || ((*best_freelist)->fl_size > (*cur_freelist)->fl_size))) {
753 			best_freelist = cur_freelist;
754 			if ((*best_freelist)->fl_size == size) {
755 				break;
756 			}
757 		}
758 		cur_freelist = &((*cur_freelist)->fl_next);
759 	}
760 
761 	/* If we found a freelist entry, update the freelist */
762 	if (best_freelist != NULL) {
763 		os_atomic_sub(&buffer->ssb_overhead, size, relaxed);
764 		ret = *best_freelist;
765 
766 		/* If there's enough unused space at the end of this entry, we should make a new one */
767 		if (((*best_freelist)->fl_size - size) > sizeof(struct freelist_entry)) {
768 			struct freelist_entry *new_freelist = (struct freelist_entry*) ((mach_vm_address_t) *best_freelist + size);
769 			*new_freelist = (struct freelist_entry) {
770 				.fl_next = (*best_freelist)->fl_next,
771 				.fl_size = (*best_freelist)->fl_size - size
772 			};
773 			(*best_freelist)->fl_next = new_freelist;
774 		}
775 
776 		/* Update previous entry with next or new entry */
777 		*best_freelist = (*best_freelist)->fl_next;
778 	}
779 
780 	freelist_unlock(buffer);
781 
782 	if (error != NULL) {
783 		if (ret == NULL) {
784 			*error = KERN_INSUFFICIENT_BUFFER_SIZE;
785 		} else {
786 			*error = KERN_SUCCESS;
787 		}
788 	}
789 
790 	return ret;
791 }
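/*
 * Worked example of the split above: if the best-fit entry is 256 bytes and
 * the request is 96 bytes, the 160 leftover bytes are large enough to hold a
 * struct freelist_entry, so a new 160-byte entry is carved out at offset 96
 * and takes the allocated entry's place in the list.
 */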
792 
793 /**
794  * Allocates some data from the shared stackshot buffer.
795  * Should not be used directly - see the `stackshot_alloc` and
796  * `stackshot_alloc_arr` macros.
797  */
798 static void *
799 stackshot_buffer_alloc(
800 	size_t size,
801 	struct stackshot_buffer *buffer,
802 	kern_return_t *error)
803 {
804 	size_t o_used, new_used;
805 
806 	stackshot_panic_guard();
807 	assert(!stackshot_ctx.sc_is_singlethreaded);
808 	assert(buffer->ssb_ptr != NULL);
809 
810 	os_atomic_rmw_loop(&buffer->ssb_used, o_used, new_used, relaxed, {
811 		new_used = o_used + size;
812 		if (new_used > buffer->ssb_size) {
813 		        os_atomic_rmw_loop_give_up(return stackshot_freelist_alloc(size, buffer, error));
814 		}
815 	});
816 
817 	if (error != NULL) {
818 		*error = KERN_SUCCESS;
819 	}
820 
821 	return (void*) ((mach_vm_address_t) buffer->ssb_ptr + o_used);
822 }
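/*
 * The fast path above is a lock-free bump allocator: the only shared mutable
 * state is ssb_used, advanced with an atomic read-modify-write loop.  A
 * standalone sketch of the same pattern using C11 atomics (the demo_* names
 * are illustrative assumptions, not part of this file):
 */
#if 0 /* Illustrative sketch; not compiled. */
#include <stdatomic.h>
#include <stddef.h>

struct demo_bump {
	char          *base;  /* start of the backing buffer */
	size_t         size;  /* total buffer size */
	_Atomic size_t used;  /* bytes handed out so far */
};

static void *
demo_bump_alloc(struct demo_bump *b, size_t len)
{
	size_t old_used = atomic_load_explicit(&b->used, memory_order_relaxed);

	for (;;) {
		if (old_used + len > b->size) {
			return NULL;    /* out of space; the kernel code falls back to the freelist here */
		}
		/* On failure old_used is refreshed with the current value and we retry. */
		if (atomic_compare_exchange_weak_explicit(&b->used, &old_used,
		    old_used + len, memory_order_relaxed, memory_order_relaxed)) {
			return b->base + old_used;
		}
	}
}
#endif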
823 
824 /**
825  * Finds the best stackshot buffer to use (prefer our cluster's buffer)
826  * and allocates from it.
827  * Should not be used directly - see the `stackshot_alloc` and
828  * `stackshot_alloc_arr` macros.
829  */
830 __result_use_check
831 static void *
832 stackshot_best_buffer_alloc(size_t size, kern_return_t *error)
833 {
834 #if defined(__AMP__)
835 	kern_return_t err;
836 	int           my_cluster;
837 	void         *ret = NULL;
838 #endif /* __AMP__ */
839 
840 #if STACKSHOT_COLLECTS_LATENCY_INFO
841 	stackshot_cpu_latency.total_buf += size;
842 #endif
843 
844 #if defined(__AMP__)
845 	/* First, try our cluster's buffer */
846 	my_cluster = cpu_cluster_id();
847 	ret = stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[my_cluster], &err);
848 
849 	/* Try other buffers now. */
850 	if (err != KERN_SUCCESS) {
851 		for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) {
852 			if ((buf_idx == my_cluster) || (stackshot_ctx.sc_buffers[buf_idx].ssb_ptr == NULL)) {
853 				continue;
854 			}
855 
856 			ret = stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[buf_idx], &err);
857 			if (err == KERN_SUCCESS) {
858 #if STACKSHOT_COLLECTS_LATENCY_INFO
859 				stackshot_cpu_latency.intercluster_buf_used += size;
860 #endif
861 				break;
862 			}
863 		}
864 	}
865 
866 	if (error != NULL) {
867 		*error = err;
868 	}
869 
870 	return ret;
871 #else /* __AMP__ */
872 	return stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[0], error);
873 #endif /* !__AMP__ */
874 }
875 
876 /**
877  * Frees some data from the shared stackshot buffer and adds it to the freelist.
878  */
879 static void
880 stackshot_buffer_free(
881 	void *ptr,
882 	struct stackshot_buffer *buffer,
883 	size_t size)
884 {
885 	stackshot_panic_guard();
886 
887 	/* This should never be called during a singlethreaded stackshot. */
888 	assert(!stackshot_ctx.sc_is_singlethreaded);
889 
890 	os_atomic_add(&buffer->ssb_overhead, size, relaxed);
891 
892 	/* Make sure we have enough space for the freelist entry */
893 	if (size < sizeof(struct freelist_entry)) {
894 		return;
895 	}
896 
897 	freelist_lock(buffer);
898 
899 	/* Create new freelist entry and push it to the front of the list */
900 	*((struct freelist_entry*) ptr) = (struct freelist_entry) {
901 		.fl_size = size,
902 		.fl_next = buffer->ssb_freelist
903 	};
904 	buffer->ssb_freelist = ptr;
905 
906 	freelist_unlock(buffer);
907 }
908 
909 /**
910  * Allocates some data from the stackshot buffer. Uses the bump allocator in
911  * multithreaded mode and endalloc in singlethreaded.
912  * err must ALWAYS be nonnull.
913  * Should not be used directly - see the macros in kern_stackshot.h.
914  */
915 void *
916 stackshot_alloc_with_size(size_t size, kern_return_t *err)
917 {
918 	void *ptr;
919 	assert(err != NULL);
920 	assert(stackshot_active());
921 
922 	stackshot_panic_guard();
923 
924 	if (stackshot_ctx.sc_is_singlethreaded) {
925 		ptr = kcdata_endalloc(stackshot_kcdata_p, size);
926 		if (ptr == NULL) {
927 			*err = KERN_INSUFFICIENT_BUFFER_SIZE;
928 		}
929 	} else {
930 		ptr = stackshot_best_buffer_alloc(size, err);
931 		if (ptr == NULL) {
932 			/* We should always return an error if we return a null ptr */
933 			assert3u(*err, !=, KERN_SUCCESS);
934 		}
935 	}
936 
937 	return ptr;
938 }
939 
940 /**
941  * Initializes a new kcdata buffer somewhere in a linked kcdata list.
942  * Allocates a buffer for the kcdata from the shared stackshot buffer.
943  *
944  * See `linked_kcdata_alloc_callback` for the implementation details of
945  * linked kcdata for stackshot.
946  */
947 __result_use_check
948 static kern_return_t
949 linked_kcdata_init(
950 	linked_kcdata_descriptor_t descriptor,
951 	size_t min_size,
952 	unsigned int data_type,
953 	unsigned int flags)
954 {
955 	void              *buf_ptr;
956 	kern_return_t      error;
957 	size_t             buf_size = MAX(min_size, stackshot_ctx.sc_min_kcdata_size);
958 
959 	buf_ptr = stackshot_alloc_arr(uint8_t, buf_size, &error);
960 	if (error != KERN_SUCCESS) {
961 		return error;
962 	}
963 
964 	error = kcdata_memory_static_init(&descriptor->kcdata, (mach_vm_address_t) buf_ptr, data_type, buf_size, flags);
965 	if (error != KERN_SUCCESS) {
966 		return error;
967 	}
968 
969 	descriptor->kcdata.kcd_alloc_callback = linked_kcdata_alloc_callback;
970 
971 	return KERN_SUCCESS;
972 }
973 
974 static void
975 stackshot_kcdata_free_unused(kcdata_descriptor_t descriptor)
976 {
977 	/*
978 	 * If we have free space at the end of the kcdata, we can add it to the
979 	 * freelist. We always add to *our* cluster's freelist, no matter where
980 	 * the data was originally allocated.
981 	 *
982 	 * Important Note: We do not use kcdata_memory_get_used_bytes here because
983 	 * that includes extra space for the end tag (which we do not care about).
984 	 */
985 	int    buffer;
986 	size_t used_size = descriptor->kcd_addr_end - descriptor->kcd_addr_begin;
987 	size_t free_size = (descriptor->kcd_length - used_size);
988 	if (free_size > 0) {
989 #if defined(__arm64__)
990 		buffer = cpu_cluster_id();
991 #else /* __arm64__ */
992 		buffer = 0;
993 #endif /* !__arm64__ */
994 		stackshot_buffer_free((void*) descriptor->kcd_addr_end, &stackshot_ctx.sc_buffers[buffer], free_size);
995 		descriptor->kcd_length = used_size;
996 	}
997 }
998 
999 /**
1000  * The callback for linked kcdata, which is called when one of the kcdata
1001  * buffers runs out of space. This allocates a new kcdata descriptor &
1002  * buffer in the linked list and sets it up.
1003  *
1004  * When kcdata calls this callback, it takes the returned descriptor
1005  * and copies it to its own descriptor (which will be the per-cpu kcdata
1006  * descriptor, in the case of stackshot).
1007  *
1008  * --- Stackshot linked kcdata details ---
1009  * The way stackshot allocates kcdata buffers (in a non-panic context) is via
1010  * a basic bump allocator (see `stackshot_buffer_alloc`) and a linked list of
1011  * kcdata structures. The kcdata are allocated with a reasonable size based on
1012  * some system heuristics (or more if whatever is being pushed into the buffer
1013  * is larger). When the current kcdata buffer runs out of space, it calls this
1014  * callback, which allocates a new linked kcdata object at the tail of the
1015  * current list.
1016  *
1017  * The per-cpu `stackshot_kcdata_p` descriptor is the "tail" of the list, but
1018  * is not actually part of the linked list (this simplifies the implementation,
1019  * since it avoids changing every kcdata call and a good deal of kcdata code;
1020  * the current in-use descriptor is always in the same place this way). When
1021  * it is filled up and this callback is called, the
1022  * `stackshot_kcdata_p` descriptor is copied to the *actual* tail of the list
1023  * (in stackshot_cpu_ctx.scc_kcdata_tail), and a new linked kcdata struct is
1024  * allocated at the tail.
1025  */
1026 static kcdata_descriptor_t
1027 linked_kcdata_alloc_callback(kcdata_descriptor_t descriptor, size_t min_size)
1028 {
1029 	kern_return_t error;
1030 	linked_kcdata_descriptor_t new_kcdata = NULL;
1031 
1032 	/* This callback should ALWAYS be coming from our per-cpu kcdata. If not, something has gone horribly wrong.*/
1033 	stackshot_panic_guard();
1034 	assert(descriptor == stackshot_kcdata_p);
1035 
1036 	/* Free the unused space in the buffer and copy it to the tail of the linked kcdata list. */
1037 	stackshot_kcdata_free_unused(descriptor);
1038 	stackshot_cpu_ctx.scc_kcdata_tail->kcdata = *descriptor;
1039 
1040 	/* Allocate another linked_kcdata and initialize it. */
1041 	new_kcdata = stackshot_alloc(struct linked_kcdata_descriptor, &error);
1042 	if (error != KERN_SUCCESS) {
1043 		return NULL;
1044 	}
1045 
1046 	/* It doesn't matter what we mark the data type as - we're throwing it away when we weave the data together anyway. */
1047 	error = linked_kcdata_init(new_kcdata, min_size, KCDATA_BUFFER_BEGIN_STACKSHOT, descriptor->kcd_flags);
1048 	if (error != KERN_SUCCESS) {
1049 		return NULL;
1050 	}
1051 
1052 	bzero(descriptor, sizeof(struct kcdata_descriptor));
1053 	stackshot_cpu_ctx.scc_kcdata_tail->next = new_kcdata;
1054 	stackshot_cpu_ctx.scc_kcdata_tail = new_kcdata;
1055 
1056 	return &new_kcdata->kcdata;
1057 }
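/*
 * A minimal sketch of how a finished per-CPU list can be walked once the CPU
 * is done with it (the real weaving into the final buffer happens in
 * stackshot_collect_kcdata(); the helper below is illustrative only):
 */
#if 0 /* Illustrative sketch; not compiled. */
static size_t
example_linked_kcdata_total_bytes(linked_kcdata_descriptor_t head)
{
	size_t total = 0;

	for (linked_kcdata_descriptor_t cur = head; cur != NULL; cur = cur->next) {
		/* Each node owns one kcdata buffer; kcd_addr_end marks the used portion. */
		total += (size_t)(cur->kcdata.kcd_addr_end - cur->kcdata.kcd_addr_begin);
	}
	return total;
}
#endif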
1058 
1059 /**
1060  * Allocates a new linked kcdata list for the current CPU and sets it up.
1061  * If there was a previous linked kcdata descriptor, you should call
1062  * `stackshot_finalize_linked_kcdata` first, or otherwise save it somewhere.
1063  */
1064 __result_use_check
1065 static kern_return_t
1066 stackshot_new_linked_kcdata(void)
1067 {
1068 	kern_return_t error;
1069 
1070 	stackshot_panic_guard();
1071 	assert(!stackshot_ctx.sc_panic_stackshot);
1072 
1073 	stackshot_cpu_ctx.scc_kcdata_head = stackshot_alloc(struct linked_kcdata_descriptor, &error);
1074 	if (error != KERN_SUCCESS) {
1075 		return error;
1076 	}
1077 
1078 	kcd_exit_on_error(linked_kcdata_init(stackshot_cpu_ctx.scc_kcdata_head, 0,
1079 	    KCDATA_BUFFER_BEGIN_STACKSHOT,
1080 	    KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER | KCFLAG_ALLOC_CALLBACK));
1081 
1082 	stackshot_cpu_ctx.scc_kcdata_tail = stackshot_cpu_ctx.scc_kcdata_head;
1083 	*stackshot_kcdata_p = stackshot_cpu_ctx.scc_kcdata_head->kcdata;
1084 
1085 error_exit:
1086 	return error;
1087 }
1088 
1089 /**
1090  * Finalizes the current linked kcdata structure for the CPU by updating the
1091  * tail of the list with the per-cpu kcdata descriptor.
1092  */
1093 static void
1094 stackshot_finalize_linked_kcdata(void)
1095 {
1096 	stackshot_panic_guard();
1097 	assert(!stackshot_ctx.sc_panic_stackshot);
1098 	stackshot_kcdata_free_unused(stackshot_kcdata_p);
1099 	if (stackshot_cpu_ctx.scc_kcdata_tail != NULL) {
1100 		stackshot_cpu_ctx.scc_kcdata_tail->kcdata = *stackshot_kcdata_p;
1101 	}
1102 	*stackshot_kcdata_p = (struct kcdata_descriptor){};
1103 }
1104 
1105 /*
1106  * Initialize the mutex governing access to the stack snapshot subsystem
1107  * and other stackshot related bits.
1108  */
1109 __private_extern__ void
1110 stackshot_init(void)
1111 {
1112 	mach_timebase_info_data_t timebase;
1113 
1114 	clock_timebase_info(&timebase);
1115 	stackshot_max_fault_time = ((KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS * timebase.denom) / timebase.numer);
1116 
1117 	max_tracebuf_size = MAX(max_tracebuf_size, ((ROUNDUP(max_mem, GIGABYTES) / GIGABYTES) * TRACEBUF_SIZE_PER_GB));
1118 
1119 	PE_parse_boot_argn("stackshot_maxsz", &max_tracebuf_size, sizeof(max_tracebuf_size));
1120 }
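/*
 * Worked example of the conversion in stackshot_init(): with the common Apple
 * Silicon timebase of numer = 125, denom = 3 (24 MHz), the 3 ms fault-path
 * ceiling becomes (3,000,000 ns * 3) / 125 = 72,000 absolute-time ticks.
 */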
1121 
1122 /*
1123  * Called with interrupts disabled after stackshot context has been
1124  * initialized.
1125  */
1126 static kern_return_t
1127 stackshot_trap(void)
1128 {
1129 	kern_return_t   rv;
1130 
1131 #if defined(__x86_64__)
1132 	/*
1133 	 * Since mp_rendezvous and stackshot both attempt to capture cpus then perform an
1134 	 * operation, it's essential to apply mutual exclusion to the other when one
1135 	 * mechanism is in operation, lest there be a deadlock as the mechanisms race to
1136 	 * capture CPUs.
1137 	 *
1138 	 * Further, we assert that invoking stackshot from mp_rendezvous*() is not
1139 	 * allowed, so we check to ensure there is no rendezvous in progress before
1140 	 * trying to grab the lock (if there is, a deadlock will occur when we try to
1141 	 * grab the lock).  This is accomplished by setting cpu_rendezvous_in_progress to
1142 	 * TRUE in the mp rendezvous action function.  If stackshot_trap() is called by
1143 	 * a subordinate of the call chain within the mp rendezvous action, this flag will
1144 	 * be set and can be used to detect the inevitable deadlock that would occur
1145 	 * if this thread tried to grab the rendezvous lock.
1146 	 */
1147 
1148 	if (current_cpu_datap()->cpu_rendezvous_in_progress == TRUE) {
1149 		panic("Calling stackshot from a rendezvous is not allowed!");
1150 	}
1151 
1152 	mp_rendezvous_lock();
1153 #endif
1154 
1155 	stackshot_stats.last_abs_start = mach_absolute_time();
1156 	stackshot_stats.last_abs_end = 0;
1157 
1158 	rv = DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, NULL, FALSE, 0, NULL);
1159 
1160 	stackshot_stats.last_abs_end = mach_absolute_time();
1161 	stackshot_stats.stackshots_taken++;
1162 	stackshot_stats.stackshots_duration += (stackshot_stats.last_abs_end - stackshot_stats.last_abs_start);
1163 
1164 #if defined(__x86_64__)
1165 	mp_rendezvous_unlock();
1166 #endif
1167 	return rv;
1168 }
1169 
1170 extern void stackshot_get_timing(uint64_t *last_abs_start, uint64_t *last_abs_end, uint64_t *count, uint64_t *total_duration);
1171 void
1172 stackshot_get_timing(uint64_t *last_abs_start, uint64_t *last_abs_end, uint64_t *count, uint64_t *total_duration)
1173 {
1174 	STACKSHOT_SUBSYS_LOCK();
1175 	*last_abs_start = stackshot_stats.last_abs_start;
1176 	*last_abs_end = stackshot_stats.last_abs_end;
1177 	*count = stackshot_stats.stackshots_taken;
1178 	*total_duration = stackshot_stats.stackshots_duration;
1179 	STACKSHOT_SUBSYS_UNLOCK();
1180 }
1181 
1182 kern_return_t
1183 stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, uint64_t delta_since_timestamp, uint32_t pagetable_mask, unsigned *bytes_traced)
1184 {
1185 	kern_return_t error = KERN_SUCCESS;
1186 	boolean_t istate;
1187 	struct kdp_snapshot_args args;
1188 
1189 	args = (struct kdp_snapshot_args) {
1190 		.pid =               pid,
1191 		.buffer =            buf,
1192 		.buffer_size =       size,
1193 		.flags =             flags,
1194 		.since_timestamp =   delta_since_timestamp,
1195 		.pagetable_mask =    pagetable_mask
1196 	};
1197 
1198 #if DEVELOPMENT || DEBUG
1199 	if (kern_feature_override(KF_STACKSHOT_OVRD) == TRUE) {
1200 		return KERN_NOT_SUPPORTED;
1201 	}
1202 #endif
1203 	if ((buf == NULL) || (size <= 0) || (bytes_traced == NULL)) {
1204 		return KERN_INVALID_ARGUMENT;
1205 	}
1206 
1207 	/* zero caller's buffer to match KMA_ZERO in other path */
1208 	bzero(buf, size);
1209 
1210 	/* cap in individual stackshot to max_tracebuf_size */
1211 	if (size > max_tracebuf_size) {
1212 		size = max_tracebuf_size;
1213 	}
1214 
1215 	/* Serialize tracing */
1216 	if (flags & STACKSHOT_TRYLOCK) {
1217 		if (!STACKSHOT_SUBSYS_TRY_LOCK()) {
1218 			return KERN_LOCK_OWNED;
1219 		}
1220 	} else {
1221 		STACKSHOT_SUBSYS_LOCK();
1222 	}
1223 
1224 #if CONFIG_EXCLAVES
1225 	assert(!stackshot_exclave_inspect_ctids);
1226 #endif
1227 
1228 	stackshot_initial_estimate = 0;
1229 	stackshot_duration_prior_abs = 0;
1230 	stackshot_duration_outer = NULL;
1231 
1232 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_KERN_RECORD) | DBG_FUNC_START,
1233 	    flags, size, pid, delta_since_timestamp);
1234 
1235 	/* Prepare the compressor for a stackshot */
1236 	error = vm_compressor_kdp_init();
1237 	if (error != KERN_SUCCESS) {
1238 		return error;
1239 	}
1240 
1241 #if STACKSHOT_COLLECTS_RDAR_126582377_DATA
1242 	// Opportunistically collect reports of the rdar://126582377 failure.
1243 	// If the allocation doesn't succeed, or if another CPU "steals" the
1244 	// allocated event first, that is acceptable.
1245 	ca_event_t new_event = CA_EVENT_ALLOCATE_FLAGS(bad_stackshot_upper16, Z_NOWAIT);
1246 	if (new_event) {
1247 		if (os_atomic_cmpxchg(&rdar_126582377_event, NULL, new_event, relaxed) == 0) {
1248 			// Already set up, so free it
1249 			CA_EVENT_DEALLOCATE(new_event);
1250 		}
1251 	}
1252 #endif
1253 
1254 	istate = ml_set_interrupts_enabled(FALSE);
1255 	uint64_t time_start      = mach_absolute_time();
1256 
1257 	/* Emit a SOCD tracepoint that we are initiating a stackshot */
1258 	SOCD_TRACE_XNU_START(STACKSHOT);
1259 
1260 	/* Preload trace parameters*/
1261 	error = kdp_snapshot_preflight_internal(args);
1262 
1263 	/*
1264 	 * Trap to the debugger to obtain a coherent stack snapshot; this populates
1265 	 * the trace buffer
1266 	 */
1267 	if (error == KERN_SUCCESS) {
1268 		error = stackshot_trap();
1269 	}
1270 
1271 	uint64_t time_end = mach_absolute_time();
1272 
1273 	/* Emit a SOCD tracepoint that we have completed the stackshot */
1274 	SOCD_TRACE_XNU_END(STACKSHOT);
1275 
1276 	ml_set_interrupts_enabled(istate);
1277 
1278 #if CONFIG_EXCLAVES
1279 	/* stackshot trap should only finish successfully or with no pending Exclave threads */
1280 	assert(error == KERN_SUCCESS || stackshot_exclave_inspect_ctids == NULL);
1281 #endif
1282 
1283 	/*
1284 	 * Stackshot is no longer active.
1285 	 * (We have to do this here for the special interrupt disable timeout case to work)
1286 	 */
1287 	os_atomic_store(&stackshot_ctx.sc_state, SS_INACTIVE, release);
1288 
1289 	/* Release kdp compressor buffers */
1290 	vm_compressor_kdp_teardown();
1291 
1292 	/* Collect multithreaded kcdata into one finalized buffer */
1293 	if (error == KERN_SUCCESS && !stackshot_ctx.sc_is_singlethreaded) {
1294 		error = stackshot_collect_kcdata();
1295 	}
1296 
1297 #if CONFIG_EXCLAVES
1298 	if (stackshot_exclave_inspect_ctids) {
1299 		if (error == KERN_SUCCESS) {
1300 			error = collect_exclave_threads(flags);
1301 		}
1302 		stackshot_cleanup_exclave_waitlist();
1303 	}
1304 #endif /* CONFIG_EXCLAVES */
1305 
1306 	if (error == KERN_SUCCESS) {
1307 		if (!stackshot_ctx.sc_is_singlethreaded) {
1308 			error = stackshot_finalize_kcdata();
1309 		} else {
1310 			error = stackshot_finalize_singlethreaded_kcdata();
1311 		}
1312 	}
1313 
1314 	if (stackshot_duration_outer) {
1315 		*stackshot_duration_outer = time_end - time_start;
1316 	}
1317 	*bytes_traced = kdp_stack_snapshot_bytes_traced();
1318 
1319 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_KERN_RECORD) | DBG_FUNC_END,
1320 	    error, (time_end - time_start), size, *bytes_traced);
1321 
1322 	STACKSHOT_SUBSYS_UNLOCK();
1323 	return error;
1324 }
1325 
1326 #if CONFIG_TELEMETRY
1327 kern_return_t
1328 stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval)
1329 {
1330 	int error = KERN_FAILURE;
1331 	uint32_t bytes_traced = 0;
1332 
1333 	/*
1334 	 * "Flags" is actually treated as an enumeration; make sure only one value
1335 	 * is passed at a time.
1336 	 */
1337 	bool set_mark = flags & STACKSHOT_SET_MICROSTACKSHOT_MARK;
1338 	flags &= ~STACKSHOT_SET_MICROSTACKSHOT_MARK;
1339 	if (__builtin_popcount(flags) != 1) {
1340 		return KERN_INVALID_ARGUMENT;
1341 	}
1342 
1343 	/*
1344 	 * Ensure that there's space to copyout to.
1345 	 */
1346 	if (tracebuf == USER_ADDR_NULL || tracebuf_size == 0) {
1347 		return KERN_INVALID_ARGUMENT;
1348 	}
1349 
1350 	STACKSHOT_SUBSYS_LOCK();
1351 
1352 	switch (flags) {
1353 	case STACKSHOT_GET_KERNEL_MICROSTACKSHOT:
1354 		/*
1355 		 * Kernel samples consume from their buffer, so using a mark is the only
1356 		 * allowed option.
1357 		 */
1358 		if (!set_mark) {
1359 			error = KERN_INVALID_ARGUMENT;
1360 			break;
1361 		}
1362 		bytes_traced = tracebuf_size;
1363 		error = telemetry_kernel_gather(tracebuf, &bytes_traced);
1364 		*retval = (int)bytes_traced;
1365 		break;
1366 	case STACKSHOT_GET_MICROSTACKSHOT: {
1367 		if (tracebuf_size > max_tracebuf_size) {
1368 			error = KERN_INVALID_ARGUMENT;
1369 			break;
1370 		}
1371 
1372 		bytes_traced = tracebuf_size;
1373 		error = telemetry_gather(tracebuf, &bytes_traced, set_mark);
1374 		*retval = (int)bytes_traced;
1375 		break;
1376 	}
1377 	default:
1378 		error = KERN_NOT_SUPPORTED;
1379 		break;
1380 	}
1381 
1382 	STACKSHOT_SUBSYS_UNLOCK();
1383 	return error;
1384 }
1385 #endif /* CONFIG_TELEMETRY */
1386 
1387 /**
1388  * Grabs the next work item from the stackshot work queue.
1389  */
1390 static struct stackshot_workitem *
1391 stackshot_get_workitem(struct stackshot_workqueue *queue)
1392 {
1393 	uint32_t old_count, new_count;
1394 
1395 	/* note: this relies on give_up not performing the write, just bailing out immediately */
1396 	os_atomic_rmw_loop(&queue->sswq_cur_item, old_count, new_count, acq_rel, {
1397 		if (old_count >= os_atomic_load(&queue->sswq_num_items, relaxed)) {
1398 		        os_atomic_rmw_loop_give_up(return NULL);
1399 		}
1400 		new_count = old_count + 1;
1401 	});
1402 
1403 	return &queue->sswq_items[old_count];
1404 };
1405 
1406 /**
1407  * Puts an item on the appropriate stackshot work queue.
1408  * We don't need a lock for this, but only because it's
1409  * only called by one writer.
1410  *
1411  * @returns
1412  * KERN_SUCCESS if the item fit in the queue, KERN_INSUFFICIENT_BUFFER_SIZE if not.
1413  */
1414 static kern_return_t
1415 stackshot_put_workitem(struct stackshot_workitem item)
1416 {
1417 	struct stackshot_workqueue *queue;
1418 
1419 	/* Tasks with more threads go in a higher queue; the highest queue takes tasks with >= STACKSHOT_HARDEST_THREADCOUNT threads */
1420 	size_t queue_idx = ((item.sswi_task->thread_count * (STACKSHOT_NUM_WORKQUEUES - 1)) / STACKSHOT_HARDEST_THREADCOUNT);
1421 	queue_idx = MIN(queue_idx, STACKSHOT_NUM_WORKQUEUES - 1);
1422 
1423 	queue = &stackshot_ctx.sc_workqueues[queue_idx];
1424 
1425 	size_t num_items = os_atomic_load(&queue->sswq_num_items, relaxed);
1426 
1427 	if (num_items >= queue->sswq_capacity) {
1428 		return KERN_INSUFFICIENT_BUFFER_SIZE;
1429 	}
1430 
1431 	queue->sswq_items[num_items] = item;
1432 	os_atomic_inc(&queue->sswq_num_items, release);
1433 
1434 	return KERN_SUCCESS;
1435 }
1436 
1437 #define calc_num_linked_kcdata_frames(size, kcdata_size) (1 + ((size) - 1) / (kcdata_size))
1438 #define calc_linked_kcdata_size(size, kcdata_size) (calc_num_linked_kcdata_frames((size), (kcdata_size)) * ((kcdata_size) + sizeof(struct linked_kcdata_descriptor)))
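/*
 * calc_num_linked_kcdata_frames is a ceiling division: e.g. a 10000-byte payload split into
 * 4096-byte kcdata frames needs 1 + ((10000 - 1) / 4096) = 3 frames; calc_linked_kcdata_size
 * then charges each frame a full kcdata_size plus one linked_kcdata_descriptor of overhead.
 */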
1439 
1440 #define TASK_UUID_AVG_SIZE (16 * sizeof(uuid_t)) /* Average space consumed by UUIDs/task */
1441 #define TASK_SHARED_CACHE_AVG_SIZE (128) /* Average space consumed by task shared cache info */
1442 #define sizeof_if_traceflag(a, flag) (((trace_flags & (flag)) != 0) ? sizeof(a) : 0)
1443 
1444 #define FUDGED_SIZE(size, adj) (((size) * ((adj) + 100)) / 100)
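/* FUDGED_SIZE scales size up by adj percent, e.g. FUDGED_SIZE(1000, 25) == (1000 * 125) / 100 == 1250. */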
1445 
1446 /*
1447  * Return the estimated size of a single task (including threads)
1448  * in a stackshot with the given flags.
1449  */
1450 static uint32_t
1451 get_stackshot_est_tasksize(uint64_t trace_flags)
1452 {
1453 	size_t total_size;
1454 	size_t threads_per_task = (((threads_count + terminated_threads_count) - 1) / (tasks_count + terminated_tasks_count)) + 1;
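	/* Rounded-up average number of threads per task, counting both live and terminated tasks and threads. */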
1455 	size_t est_thread_size = sizeof(struct thread_snapshot_v4) + 42 * sizeof(uintptr_t);
1456 	size_t est_task_size = sizeof(struct task_snapshot_v3) +
1457 	    TASK_UUID_AVG_SIZE +
1458 	    TASK_SHARED_CACHE_AVG_SIZE +
1459 	    sizeof_if_traceflag(struct io_stats_snapshot, STACKSHOT_INSTRS_CYCLES) +
1460 	    sizeof_if_traceflag(uint32_t, STACKSHOT_ASID) +
1461 	    sizeof_if_traceflag(sizeof(uintptr_t) * STACKSHOT_PAGETABLE_BUFSZ, STACKSHOT_PAGE_TABLES) +
1462 	    sizeof_if_traceflag(struct instrs_cycles_snapshot_v2, STACKSHOT_INSTRS_CYCLES) +
1463 	    sizeof(struct stackshot_cpu_architecture) +
1464 	    sizeof(struct stackshot_task_codesigning_info);
1465 
1466 #if STACKSHOT_COLLECTS_LATENCY_INFO
1467 	if (collect_latency_info) {
1468 		est_thread_size += sizeof(struct stackshot_latency_thread);
1469 		est_task_size += sizeof(struct stackshot_latency_task);
1470 	}
1471 #endif
1472 
1473 	total_size = est_task_size + threads_per_task * est_thread_size;
1474 
1475 	return total_size;
1476 }
1477 
1478 /*
1479  * Return the estimated size of a stackshot based on the
1480  * number of currently running threads and tasks.
1481  *
1482  * adj is an adjustment in units of percentage
1483  */
1484 static uint32_t
1485 get_stackshot_estsize(
1486 	uint32_t prev_size_hint,
1487 	uint32_t adj,
1488 	uint64_t trace_flags,
1489 	pid_t target_pid)
1490 {
1491 	vm_size_t thread_and_task_total;
1492 	uint64_t  size;
1493 	uint32_t  estimated_size;
1494 	bool      process_scoped = ((target_pid != -1) && ((trace_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0));
1495 
1496 	/*
1497 	 * We use the estimated task size (with a fudge factor) as the default
1498 	 * linked kcdata buffer size in an effort to reduce overhead (ideally, we want
1499 	 * each task to only need a single kcdata buffer.)
1500 	 */
1501 	uint32_t est_task_size = get_stackshot_est_tasksize(trace_flags);
1502 	uint32_t est_kcdata_size = FUDGED_SIZE(est_task_size, adj);
1503 	uint64_t est_preamble_size = calc_linked_kcdata_size(8192 * 4, est_kcdata_size);
1504 	uint64_t est_postamble_size = calc_linked_kcdata_size(8192 * 2, est_kcdata_size);
1505 	uint64_t est_extra_size = 0;
1506 
1507 	adj = MIN(adj, 100u);   /* no more than double our estimate */
1508 
1509 #if STACKSHOT_COLLECTS_LATENCY_INFO
1510 	est_extra_size += real_ncpus * sizeof(struct stackshot_latency_cpu);
1511 	est_extra_size += sizeof(struct stackshot_latency_collection_v2);
1512 #endif
1513 
1514 	est_extra_size += real_ncpus * MAX_FRAMES * sizeof(uintptr_t); /* Stacktrace buffers */
1515 	est_extra_size += FUDGED_SIZE(tasks_count, 10) * sizeof(uintptr_t) * STACKSHOT_NUM_WORKQUEUES; /* Work queues */
1516 	est_extra_size += sizeof_if_traceflag(sizeof(uintptr_t) * STACKSHOT_PAGETABLE_BUFSZ * real_ncpus, STACKSHOT_PAGE_TABLES);
1517 
1518 	thread_and_task_total = calc_linked_kcdata_size(est_task_size, est_kcdata_size);
1519 	if (!process_scoped) {
1520 		thread_and_task_total *= tasks_count;
1521 	}
1522 	size = thread_and_task_total + est_preamble_size + est_postamble_size + est_extra_size; /* estimate */
1523 	size = FUDGED_SIZE(size, adj); /* add adj */
1524 	size = MAX(size, prev_size_hint); /* allow hint to increase */
1525 	size += stackshot_plh_est_size(); /* add space for the port label hash */
1526 	size = MIN(size, VM_MAP_TRUNC_PAGE(UINT32_MAX, PAGE_MASK)); /* avoid overflow */
1527 	estimated_size = (uint32_t) VM_MAP_ROUND_PAGE(size, PAGE_MASK); /* round to pagesize */
1528 
1529 	return estimated_size;
1530 }
1531 
1532 /**
1533  * Copies a linked list of kcdata structures into a final kcdata structure.
1534  * Only used from stackshot_collect_kcdata.
1535  */
1536 __result_use_check
1537 static kern_return_t
1538 stackshot_copy_linked_kcdata(kcdata_descriptor_t final_kcdata, linked_kcdata_descriptor_t linked_kcdata)
1539 {
1540 	kern_return_t error = KERN_SUCCESS;
1541 
1542 	while (linked_kcdata) {
1543 		/* Walk linked kcdata list */
1544 		kcdata_descriptor_t cur_kcdata = &linked_kcdata->kcdata;
1545 		if ((cur_kcdata->kcd_addr_end - cur_kcdata->kcd_addr_begin) == 0) {
1546 			linked_kcdata = linked_kcdata->next;
1547 			continue;
1548 		}
1549 
1550 		/* Every item in the linked kcdata should have a header tag of type KCDATA_BUFFER_BEGIN_STACKSHOT. */
1551 		assert(((struct kcdata_item*) cur_kcdata->kcd_addr_begin)->type == KCDATA_BUFFER_BEGIN_STACKSHOT);
1552 		assert((final_kcdata->kcd_addr_begin + final_kcdata->kcd_length) > final_kcdata->kcd_addr_end);
1553 		size_t header_size = sizeof(kcdata_item_t) + kcdata_calc_padding(sizeof(kcdata_item_t));
1554 		size_t size = cur_kcdata->kcd_addr_end - cur_kcdata->kcd_addr_begin - header_size;
1555 		size_t free = (final_kcdata->kcd_length + final_kcdata->kcd_addr_begin) - final_kcdata->kcd_addr_end;
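		/* size excludes this sub-buffer's begin header (the final buffer already carries one); free is the space remaining in the final buffer. */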
1556 		if (free < size) {
1557 			error = KERN_INSUFFICIENT_BUFFER_SIZE;
1558 			goto error_exit;
1559 		}
1560 
1561 		/* Just memcpy the data over (and compress if we need to.) */
1562 		kcdata_compression_window_open(final_kcdata);
1563 		error = kcdata_memcpy(final_kcdata, final_kcdata->kcd_addr_end, (void*) (cur_kcdata->kcd_addr_begin + header_size), size);
1564 		if (error != KERN_SUCCESS) {
1565 			goto error_exit;
1566 		}
1567 		final_kcdata->kcd_addr_end += size;
1568 		kcdata_compression_window_close(final_kcdata);
1569 
1570 		linked_kcdata = linked_kcdata->next;
1571 	}
1572 
1573 error_exit:
1574 	return error;
1575 }
1576 
1577 /**
1578  * Copies the duration, latency, and diagnostic info into a final kcdata buffer.
1579  * Only used by stackshot_finalize_kcdata and stackshot_finalize_singlethreaded_kcdata.
1580  */
1581 __result_use_check
1582 static kern_return_t
1583 stackshot_push_duration_and_latency(kcdata_descriptor_t kcdata)
1584 {
1585 	kern_return_t error;
1586 	mach_vm_address_t out_addr;
1587 	bool use_fault_path = ((stackshot_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0);
1588 #if STACKSHOT_COLLECTS_LATENCY_INFO
1589 	size_t buffer_used = 0;
1590 	size_t buffer_overhead = 0;
1591 	struct stackshot_latency_buffer buffer_latency;
1592 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
1593 
1594 	if (use_fault_path) {
1595 		struct stackshot_fault_stats stats = (struct stackshot_fault_stats) {
1596 			.sfs_pages_faulted_in = 0,
1597 			.sfs_time_spent_faulting = 0,
1598 			.sfs_system_max_fault_time = stackshot_max_fault_time,
1599 			.sfs_stopped_faulting = false
1600 		};
1601 		percpu_foreach_base(base) {
1602 			struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
1603 			if (!cpu_ctx->scc_did_work) {
1604 				continue;
1605 			}
1606 			stats.sfs_pages_faulted_in += cpu_ctx->scc_fault_stats.sfs_pages_faulted_in;
1607 			stats.sfs_time_spent_faulting += cpu_ctx->scc_fault_stats.sfs_time_spent_faulting;
1608 			stats.sfs_stopped_faulting = stats.sfs_stopped_faulting || cpu_ctx->scc_fault_stats.sfs_stopped_faulting;
1609 		}
1610 		kcdata_push_data(kcdata, STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS,
1611 		    sizeof(struct stackshot_fault_stats), &stats);
1612 	}
1613 
1614 #if STACKSHOT_COLLECTS_LATENCY_INFO
1615 	int num_working_cpus = 0;
1616 	if (collect_latency_info) {
1617 		/* Add per-CPU latency info */
1618 		percpu_foreach(cpu_ctx, stackshot_cpu_ctx_percpu) {
1619 			if (cpu_ctx->scc_did_work) {
1620 				num_working_cpus++;
1621 			}
1622 		}
1623 		kcdata_compression_window_open(kcdata);
1624 		kcd_exit_on_error(kcdata_get_memory_addr_for_array(
1625 			    kcdata, STACKSHOT_KCTYPE_LATENCY_INFO_CPU, sizeof(struct stackshot_latency_cpu), num_working_cpus, &out_addr));
1626 		percpu_foreach_base(base) {
1627 			if (PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu)->scc_did_work) {
1628 				kcdata_memcpy(kcdata, out_addr, PERCPU_GET_WITH_BASE(base, stackshot_cpu_latency_percpu),
1629 				    sizeof(struct stackshot_latency_cpu));
1630 				out_addr += sizeof(struct stackshot_latency_cpu);
1631 			}
1632 		}
1633 		kcd_exit_on_error(kcdata_compression_window_close(kcdata));
1634 
1635 		kcdata_compression_window_open(kcdata);
1636 		kcd_exit_on_error(kcdata_get_memory_addr_for_array(
1637 			    kcdata, STACKSHOT_KCTYPE_LATENCY_INFO_BUFFER, sizeof(struct stackshot_latency_buffer), stackshot_ctx.sc_num_buffers, &out_addr));
1638 
1639 		/* Add up buffer info */
1640 		for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++, out_addr += sizeof(buffer_latency)) {
1641 			struct stackshot_buffer *buf = &stackshot_ctx.sc_buffers[buf_idx];
1642 			if (buf->ssb_ptr == NULL) {
1643 				kcdata_bzero(kcdata, out_addr, sizeof(struct stackshot_latency_buffer));
1644 				continue;
1645 			}
1646 
1647 #if defined(__arm64__)
1648 			ml_topology_cluster_t *cluster = &ml_get_topology_info()->clusters[buf_idx];
1649 			buffer_latency.cluster_type = cluster->cluster_type;
1650 #else /* __arm64__ */
1651 			buffer_latency.cluster_type = CLUSTER_TYPE_SMP;
1652 #endif /* !__arm64__ */
1653 			buffer_latency.size = buf->ssb_size;
1654 			buffer_latency.used = os_atomic_load(&buf->ssb_used, relaxed);
1655 			buffer_latency.overhead = os_atomic_load(&buf->ssb_overhead, relaxed);
1656 			kcd_exit_on_error(kcdata_memcpy(
1657 				    kcdata, out_addr, &buffer_latency, sizeof(buffer_latency)));
1658 
1659 			buffer_used += buffer_latency.used;
1660 			buffer_overhead += buffer_latency.overhead;
1661 		}
1662 		kcd_exit_on_error(kcdata_compression_window_close(kcdata));
1663 
1664 		stackshot_ctx.sc_latency.buffer_size = stackshot_ctx.sc_args.buffer_size;
1665 		stackshot_ctx.sc_latency.buffer_overhead = buffer_overhead;
1666 		stackshot_ctx.sc_latency.buffer_used = buffer_used;
1667 		stackshot_ctx.sc_latency.buffer_count = stackshot_ctx.sc_num_buffers;
1668 
1669 		/* Add overall latency info */
1670 		kcd_exit_on_error(kcdata_push_data(
1671 			    kcdata, STACKSHOT_KCTYPE_LATENCY_INFO,
1672 			    sizeof(stackshot_ctx.sc_latency), &stackshot_ctx.sc_latency));
1673 	}
1674 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
1675 
1676 	if ((stackshot_flags & STACKSHOT_DO_COMPRESS) == 0) {
1677 		assert(!stackshot_ctx.sc_panic_stackshot);
1678 		kcd_exit_on_error(kcdata_get_memory_addr(kcdata, STACKSHOT_KCTYPE_STACKSHOT_DURATION,
1679 		    sizeof(struct stackshot_duration_v2), &out_addr));
1680 		struct stackshot_duration_v2 *duration_p = (void *) out_addr;
1681 		memcpy(duration_p, &stackshot_ctx.sc_duration, sizeof(*duration_p));
1682 		stackshot_duration_outer = (unaligned_u64 *) &duration_p->stackshot_duration_outer;
1683 		kcd_exit_on_error(kcdata_add_uint64_with_description(kcdata, stackshot_tries, "stackshot_tries"));
1684 	} else {
1685 		kcd_exit_on_error(kcdata_push_data(kcdata, STACKSHOT_KCTYPE_STACKSHOT_DURATION, sizeof(stackshot_ctx.sc_duration), &stackshot_ctx.sc_duration));
1686 		stackshot_duration_outer = NULL;
1687 	}
1688 
1689 error_exit:
1690 	return error;
1691 }
1692 
1693 /**
1694  * Allocates the final kcdata buffer for a multithreaded stackshot,
1695  * where all of the per-task kcdata (and exclave kcdata) will end up.
1696  */
1697 __result_use_check
1698 static kern_return_t
1699 stackshot_alloc_final_kcdata(void)
1700 {
1701 	vm_offset_t   final_kcdata_buffer = 0;
1702 	kern_return_t error = KERN_SUCCESS;
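	/* Header tag precedence: delta snapshot, then compressed, then plain stackshot. */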
1703 	uint32_t hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
1704 	    : (stackshot_flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
1705 	    : KCDATA_BUFFER_BEGIN_STACKSHOT;
1706 
1707 	if (stackshot_ctx.sc_is_singlethreaded) {
1708 		return KERN_SUCCESS;
1709 	}
1710 
1711 	if ((error = kmem_alloc(kernel_map, &final_kcdata_buffer, stackshot_args.buffer_size,
1712 	    KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) {
1713 		os_log_error(OS_LOG_DEFAULT, "stackshot: final allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, stackshot_args.buffer_size, max_tracebuf_size, stackshot_tries);
1714 		return KERN_RESOURCE_SHORTAGE;
1715 	}
1716 
1717 	stackshot_ctx.sc_finalized_kcdata = kcdata_memory_alloc_init(final_kcdata_buffer, hdr_tag,
1718 	    stackshot_args.buffer_size, KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER);
1719 
1720 	if (stackshot_ctx.sc_finalized_kcdata == NULL) {
1721 		kmem_free(kernel_map, final_kcdata_buffer, stackshot_args.buffer_size);
1722 		return KERN_FAILURE;
1723 	}
1724 
1725 	return KERN_SUCCESS;
1726 }
1727 
1728 /**
1729  * Frees the final kcdata buffer.
1730  */
1731 static void
1732 stackshot_free_final_kcdata(void)
1733 {
1734 	if (stackshot_ctx.sc_is_singlethreaded || (stackshot_ctx.sc_finalized_kcdata == NULL)) {
1735 		return;
1736 	}
1737 
1738 	kmem_free(kernel_map, stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin, stackshot_args.buffer_size);
1739 	kcdata_memory_destroy(stackshot_ctx.sc_finalized_kcdata);
1740 	stackshot_ctx.sc_finalized_kcdata = NULL;
1741 }
1742 
1743 /**
1744  * Called once we exit the debugger trap to collate all of the separate linked
1745  * kcdata lists into one kcdata buffer. The calling thread will run this, and
1746  * it is guaranteed that nobody else is touching any stackshot state at this
1747  * point. In the case of a panic stackshot, this is never called since we only
1748  * use one thread.
1749  *
1750  * Called with interrupts enabled, stackshot subsys lock held.
1751  */
1752 __result_use_check
1753 static kern_return_t
1754 stackshot_collect_kcdata(void)
1755 {
1756 	kern_return_t error = 0;
1757 	uint32_t      hdr_tag;
1758 
1759 	assert(!stackshot_ctx.sc_panic_stackshot && !stackshot_ctx.sc_is_singlethreaded);
1760 	LCK_MTX_ASSERT(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1761 
1762 	/* Allocate our final kcdata buffer. */
1763 	kcd_exit_on_error(stackshot_alloc_final_kcdata());
1764 	assert(stackshot_ctx.sc_finalized_kcdata != NULL);
1765 
1766 	/* Setup compression if we need it. */
1767 	if (stackshot_flags & STACKSHOT_DO_COMPRESS) {
1768 		hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
1769 		    : KCDATA_BUFFER_BEGIN_STACKSHOT;
1770 		kcd_exit_on_error(kcdata_init_compress(stackshot_ctx.sc_finalized_kcdata, hdr_tag, kdp_memcpy, KCDCT_ZLIB));
1771 	}
1772 
1773 	/* Copy over all of the pre task-iteration kcdata (to preserve order as if it were single-threaded) */
1774 	kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, stackshot_ctx.sc_pretask_kcdata));
1775 
1776 	/* Set each queue's cur_item to 0. */
1777 	for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
1778 		os_atomic_store(&stackshot_ctx.sc_workqueues[i].sswq_cur_item, 0, relaxed);
1779 	}
1780 
1781 	/*
1782 	 * Iterate over work queue(s) and copy the kcdata in.
1783 	 */
1784 	while (true) {
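		/* On each pass, pick the pending workitem with the smallest sswi_idx across all queues so tasks are emitted in their original enumeration order. */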
1785 		struct stackshot_workitem  *next_item = NULL;
1786 		struct stackshot_workqueue *next_queue = NULL;
1787 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
1788 			struct stackshot_workqueue *queue = &stackshot_ctx.sc_workqueues[i];
1789 			size_t cur_item = os_atomic_load(&queue->sswq_cur_item, relaxed);
1790 
1791 			/* Check if we're done with this queue */
1792 			if (cur_item >= os_atomic_load(&queue->sswq_num_items, relaxed)) {
1793 				continue;
1794 			}
1795 
1796 			/* Check if this workitem should come next */
1797 			struct stackshot_workitem *item = &queue->sswq_items[cur_item];
1798 			if ((next_item == NULL) || (next_item->sswi_idx > item->sswi_idx)) {
1799 				next_item = item;
1800 				next_queue = queue;
1801 			}
1802 		}
1803 
1804 		/* Queues are empty. */
1805 		if (next_item == NULL) {
1806 			break;
1807 		}
1808 
1809 		assert(next_queue);
1810 		assert(next_item->sswi_data != NULL);
1811 
1812 		os_atomic_inc(&next_queue->sswq_cur_item, relaxed);
1813 		kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, next_item->sswi_data));
1814 	}
1815 
1816 	/* Write post-task kcdata */
1817 	kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, stackshot_ctx.sc_posttask_kcdata));
1818 error_exit:
1819 	if (error != KERN_SUCCESS) {
1820 		stackshot_free_final_kcdata();
1821 	}
1822 	return error;
1823 }
1824 
1825 
1826 /**
1827  * Called at the very end of stackshot data generation, to write final timing
1828  * data to the kcdata structure and close compression. Only called for
1829  * multi-threaded stackshots; see stackshot_finalize_singlethreaded_kcdata for
1830  * single-threaded variant.
1831  *
1832  * Called with interrupts enabled, stackshot subsys lock held.
1833  */
1834 __result_use_check
1835 static kern_return_t
1836 stackshot_finalize_kcdata(void)
1837 {
1838 	kern_return_t error = 0;
1839 
1840 	assert(!stackshot_ctx.sc_panic_stackshot && !stackshot_ctx.sc_is_singlethreaded);
1841 	LCK_MTX_ASSERT(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1842 
1843 	assert(stackshot_ctx.sc_finalized_kcdata != NULL);
1844 
1845 	/* Write stackshot timing info */
1846 	kcd_exit_on_error(stackshot_push_duration_and_latency(stackshot_ctx.sc_finalized_kcdata));
1847 
1848 	/* Note: at most one more call that pushes data may be made after kcd_finalize_compression */
1849 	kcd_finalize_compression(stackshot_ctx.sc_finalized_kcdata);
1850 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_ctx.sc_finalized_kcdata, stackshot_flags, "stackshot_out_flags"));
1851 	kcd_exit_on_error(kcdata_write_buffer_end(stackshot_ctx.sc_finalized_kcdata));
1852 
1853 	stackshot_ctx.sc_bytes_traced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_ctx.sc_finalized_kcdata);
1854 	stackshot_ctx.sc_bytes_uncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_ctx.sc_finalized_kcdata);
1855 
1856 	if (os_atomic_load(&stackshot_ctx.sc_retval, relaxed) == KERN_SUCCESS) {
1857 		/* releases and zeros done */
1858 		kcd_exit_on_error(kcdata_finish(stackshot_ctx.sc_finalized_kcdata));
1859 	}
1860 
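	/* Copy the collated kcdata back into the caller's original stackshot buffer; the intermediate buffer is freed below. */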
1861 	memcpy(stackshot_args.buffer, (void*) stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin, stackshot_args.buffer_size);
1862 
1863 	/* Fix duration_outer offset */
1864 	if (stackshot_duration_outer != NULL) {
1865 		stackshot_duration_outer = (unaligned_u64*) ((mach_vm_address_t) stackshot_args.buffer + ((mach_vm_address_t) stackshot_duration_outer - stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin));
1866 	}
1867 
1868 error_exit:
1869 	stackshot_free_final_kcdata();
1870 	return error;
1871 }
1872 
1873 /**
1874  * Finalizes the kcdata for a singlethreaded stackshot.
1875  *
1876  * May be called from interrupt/panic context.
1877  */
1878 __result_use_check
1879 static kern_return_t
1880 stackshot_finalize_singlethreaded_kcdata(void)
1881 {
1882 	kern_return_t error;
1883 
1884 	assert(stackshot_ctx.sc_is_singlethreaded);
1885 
1886 	kcd_exit_on_error(stackshot_push_duration_and_latency(stackshot_ctx.sc_finalized_kcdata));
1887 	/* Note: at most one more call that pushes data may be made after kcd_finalize_compression */
1888 	kcd_finalize_compression(stackshot_ctx.sc_finalized_kcdata);
1889 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_ctx.sc_finalized_kcdata, stackshot_flags, "stackshot_out_flags"));
1890 	kcd_exit_on_error(kcdata_write_buffer_end(stackshot_ctx.sc_finalized_kcdata));
1891 
1892 	stackshot_ctx.sc_bytes_traced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_ctx.sc_finalized_kcdata);
1893 	stackshot_ctx.sc_bytes_uncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_ctx.sc_finalized_kcdata);
1894 
1895 	kcd_exit_on_error(kcdata_finish(stackshot_ctx.sc_finalized_kcdata));
1896 
1897 	if (stackshot_ctx.sc_panic_stackshot) {
1898 		*stackshot_args.descriptor = *stackshot_ctx.sc_finalized_kcdata;
1899 	}
1900 
1901 error_exit:
1902 	return error;
1903 }
1904 
1905 /*
1906  * stackshot_remap_buffer:	Utility function to remap bytes_traced bytes starting at stackshotbuf
1907  *				into the current task's user space and subsequently copy out the address
1908  *				at which the buffer has been mapped in user space to out_buffer_addr.
1909  *
1910  * Inputs:			stackshotbuf - pointer to the original buffer in the kernel's address space
1911  *				bytes_traced - length of the buffer to remap starting from stackshotbuf
1912  *				out_buffer_addr - user address to which the address of the newly mapped buffer is copied out
1913  *				out_size_addr - user address to which the size of the mapped buffer is copied out
1914  *
1915  * Outputs:			ENOSPC if there is not enough free space in the task's address space to remap the buffer
1916  *				EINVAL for all other errors returned by task_remap_buffer/mach_vm_remap
1917  *				an error from copyout
1918  */
1919 static kern_return_t
1920 stackshot_remap_buffer(void *stackshotbuf, uint32_t bytes_traced, uint64_t out_buffer_addr, uint64_t out_size_addr)
1921 {
1922 	int                     error = 0;
1923 	mach_vm_offset_t        stackshotbuf_user_addr = (mach_vm_offset_t)NULL;
1924 	vm_prot_t               cur_prot = VM_PROT_NONE, max_prot = VM_PROT_NONE;
1925 
1926 	error = mach_vm_remap(current_map(), &stackshotbuf_user_addr, bytes_traced, 0,
1927 	    VM_FLAGS_ANYWHERE, kernel_map, (mach_vm_offset_t)stackshotbuf, FALSE,
1928 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT);
1929 	/*
1930 	 * If the call to mach_vm_remap fails, we return its error
1931 	 */
1932 	if (error == KERN_SUCCESS) {
1933 		/* If the user addr somehow didn't get set, we should make sure that we fail, and (eventually)
1934 		 * panic on development kernels to find out why
1935 		 */
1936 		if (stackshotbuf_user_addr == (mach_vm_offset_t)NULL) {
1937 #if DEVELOPMENT || DEBUG
1938 			os_log_error(OS_LOG_DEFAULT, "stackshot: mach_vm_remap succeeded with NULL\n");
1939 #endif // DEVELOPMENT || DEBUG
1940 			return KERN_FAILURE;
1941 		}
1942 
1943 		/*
1944 		 * If we fail to copy out the address or size of the new buffer, we remove the buffer mapping that
1945 		 * we just made in the task's user space.
1946 		 */
1947 		error = copyout(CAST_DOWN(void *, &stackshotbuf_user_addr), (user_addr_t)out_buffer_addr, sizeof(stackshotbuf_user_addr));
1948 		if (error != KERN_SUCCESS) {
1949 			mach_vm_deallocate(get_task_map(current_task()), stackshotbuf_user_addr, (mach_vm_size_t)bytes_traced);
1950 			return error;
1951 		}
1952 		error = copyout(&bytes_traced, (user_addr_t)out_size_addr, sizeof(bytes_traced));
1953 		if (error != KERN_SUCCESS) {
1954 			mach_vm_deallocate(get_task_map(current_task()), stackshotbuf_user_addr, (mach_vm_size_t)bytes_traced);
1955 			return error;
1956 		}
1957 	}
1958 	return error;
1959 }
1960 
1961 #if CONFIG_EXCLAVES
1962 
1963 /*
1964  * Allocates an array for exclaves inspection from the stackshot buffer. This
1965  * state must be cleaned up by calling `stackshot_cleanup_exclave_waitlist`
1966  * after the stackshot is finished.
1967  */
1968 static kern_return_t
1969 stackshot_setup_exclave_waitlist(void)
1970 {
1971 	kern_return_t error = KERN_SUCCESS;
1972 	size_t exclave_threads_max = exclaves_ipc_buffer_count();
1973 	size_t waitlist_size = 0;
1974 
1975 	assert(!stackshot_exclave_inspect_ctids);
1976 
1977 	if (exclaves_inspection_is_initialized() && exclave_threads_max) {
1978 		if (os_mul_overflow(exclave_threads_max, sizeof(ctid_t), &waitlist_size)) {
1979 			error = KERN_INVALID_ARGUMENT;
1980 			goto error;
1981 		}
1982 		stackshot_exclave_inspect_ctids = stackshot_alloc_with_size(waitlist_size, &error);
1983 		if (!stackshot_exclave_inspect_ctids) {
1984 			goto error;
1985 		}
1986 		stackshot_exclave_inspect_ctid_count = 0;
1987 		stackshot_exclave_inspect_ctid_capacity = exclave_threads_max;
1988 	}
1989 
1990 error:
1991 	return error;
1992 }
1993 
1994 static void
1995 stackshot_cleanup_exclave_waitlist(void)
1996 {
1997 	stackshot_exclave_inspect_ctids = NULL;
1998 	stackshot_exclave_inspect_ctid_capacity = 0;
1999 	stackshot_exclave_inspect_ctid_count = 0;
2000 }
2001 
2002 static kern_return_t
2003 collect_exclave_threads(uint64_t ss_flags)
2004 {
2005 	size_t i;
2006 	ctid_t ctid;
2007 	thread_t thread;
2008 	kern_return_t kr = KERN_SUCCESS;
2009 	STACKSHOT_SUBSYS_ASSERT_LOCKED();
2010 
2011 	lck_mtx_lock(&exclaves_collect_mtx);
2012 
2013 	if (stackshot_exclave_inspect_ctid_count == 0) {
2014 		/* Nothing to do */
2015 		goto out;
2016 	}
2017 
2018 	// When asking for ASIDs, make sure we get all exclave ASIDs and mappings as well
2019 	exclaves_stackshot_raw_addresses = (ss_flags & STACKSHOT_ASID);
2020 	exclaves_stackshot_all_address_spaces = (ss_flags & (STACKSHOT_ASID | STACKSHOT_EXCLAVES));
2021 
2022 	/* This error is intentionally ignored: we are now committed to collecting
2023 	 * these threads, or at least properly waking them. If this fails, the first
2024 	 * collected thread should also fail to append to the kcdata, and will abort
2025 	 * further collection, properly clearing the AST and waking these threads.
2026 	 */
2027 	kcdata_add_container_marker(stackshot_ctx.sc_finalized_kcdata, KCDATA_TYPE_CONTAINER_BEGIN,
2028 	    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
2029 
2030 	for (i = 0; i < stackshot_exclave_inspect_ctid_count; ++i) {
2031 		ctid = stackshot_exclave_inspect_ctids[i];
2032 		thread = ctid_get_thread(ctid);
2033 		assert(thread);
2034 		exclaves_inspection_queue_add(&exclaves_inspection_queue_stackshot, &thread->th_exclaves_inspection_queue_stackshot);
2035 	}
2036 	exclaves_inspection_begin_collecting();
2037 	exclaves_inspection_wait_complete(&exclaves_inspection_queue_stackshot);
2038 	kr = stackshot_exclave_kr; /* Read the result of work done on our behalf, by collection thread */
2039 	if (kr != KERN_SUCCESS) {
2040 		goto out;
2041 	}
2042 
2043 	kr = kcdata_add_container_marker(stackshot_ctx.sc_finalized_kcdata, KCDATA_TYPE_CONTAINER_END,
2044 	    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
2045 	if (kr != KERN_SUCCESS) {
2046 		goto out;
2047 	}
2048 out:
2049 	lck_mtx_unlock(&exclaves_collect_mtx);
2050 	return kr;
2051 }
2052 
2053 static kern_return_t
2054 stackshot_exclaves_process_stacktrace(const address_v__opt_s *_Nonnull st, void *kcdata_ptr)
2055 {
2056 	kern_return_t error = KERN_SUCCESS;
2057 	exclave_ecstackentry_addr_t * addr = NULL;
2058 	__block size_t count = 0;
2059 
2060 	if (!st->has_value) {
2061 		goto error_exit;
2062 	}
2063 
2064 	address__v_visit(&st->value, ^(size_t __unused i, const stackshottypes_address_s __unused item) {
2065 		count++;
2066 	});
2067 
2068 	kcdata_compression_window_open(kcdata_ptr);
2069 	kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_IPCSTACKENTRY_ECSTACK,
2070 	    sizeof(exclave_ecstackentry_addr_t), count, (mach_vm_address_t*)&addr));
2071 
2072 	address__v_visit(&st->value, ^(size_t i, const stackshottypes_address_s item) {
2073 		addr[i] = (exclave_ecstackentry_addr_t)item;
2074 	});
2075 
2076 	kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2077 
2078 error_exit:
2079 	return error;
2080 }
2081 
2082 static kern_return_t
2083 stackshot_exclaves_process_ipcstackentry(uint64_t index, const stackshottypes_ipcstackentry_s *_Nonnull ise, void *kcdata_ptr)
2084 {
2085 	kern_return_t error = KERN_SUCCESS;
2086 
2087 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2088 	    STACKSHOT_KCCONTAINER_EXCLAVE_IPCSTACKENTRY, index));
2089 
2090 	struct exclave_ipcstackentry_info info = { 0 };
2091 	info.eise_asid = ise->asid;
2092 
2093 	info.eise_tnid = ise->tnid;
2094 
2095 	if (ise->invocationid.has_value) {
2096 		info.eise_flags |= kExclaveIpcStackEntryHaveInvocationID;
2097 		info.eise_invocationid = ise->invocationid.value;
2098 	} else {
2099 		info.eise_invocationid = 0;
2100 	}
2101 
2102 	info.eise_flags |= (ise->stacktrace.has_value ? kExclaveIpcStackEntryHaveStack : 0);
2103 
2104 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_IPCSTACKENTRY_INFO, sizeof(struct exclave_ipcstackentry_info), &info));
2105 
2106 	if (ise->stacktrace.has_value) {
2107 		kcd_exit_on_error(stackshot_exclaves_process_stacktrace(&ise->stacktrace, kcdata_ptr));
2108 	}
2109 
2110 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2111 	    STACKSHOT_KCCONTAINER_EXCLAVE_IPCSTACKENTRY, index));
2112 
2113 error_exit:
2114 	return error;
2115 }
2116 
2117 static kern_return_t
2118 stackshot_exclaves_process_ipcstack(const stackshottypes_ipcstackentry_v__opt_s *_Nonnull ipcstack, void *kcdata_ptr)
2119 {
2120 	__block kern_return_t kr = KERN_SUCCESS;
2121 
2122 	if (!ipcstack->has_value) {
2123 		goto error_exit;
2124 	}
2125 
2126 	stackshottypes_ipcstackentry__v_visit(&ipcstack->value, ^(size_t i, const stackshottypes_ipcstackentry_s *_Nonnull item) {
2127 		if (kr == KERN_SUCCESS) {
2128 		        kr = stackshot_exclaves_process_ipcstackentry(i, item, kcdata_ptr);
2129 		}
2130 	});
2131 
2132 error_exit:
2133 	return kr;
2134 }
2135 
2136 static kern_return_t
2137 stackshot_exclaves_process_stackshotentry(const stackshot_stackshotentry_s *_Nonnull se, void *kcdata_ptr)
2138 {
2139 	kern_return_t error = KERN_SUCCESS;
2140 
2141 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2142 	    STACKSHOT_KCCONTAINER_EXCLAVE_SCRESULT, se->scid));
2143 
2144 	struct exclave_scresult_info info = { 0 };
2145 	info.esc_id = se->scid;
2146 	info.esc_flags = se->ipcstack.has_value ? kExclaveScresultHaveIPCStack : 0;
2147 
2148 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_SCRESULT_INFO, sizeof(struct exclave_scresult_info), &info));
2149 
2150 	if (se->ipcstack.has_value) {
2151 		kcd_exit_on_error(stackshot_exclaves_process_ipcstack(&se->ipcstack, kcdata_ptr));
2152 	}
2153 
2154 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2155 	    STACKSHOT_KCCONTAINER_EXCLAVE_SCRESULT, se->scid));
2156 
2157 error_exit:
2158 	return error;
2159 }
2160 
2161 static kern_return_t
2162 stackshot_exclaves_process_textlayout_segments(const stackshottypes_textlayout_s *_Nonnull tl, void *kcdata_ptr, bool want_raw_addresses)
2163 {
2164 	kern_return_t error = KERN_SUCCESS;
2165 	__block struct exclave_textlayout_segment_v2 * info = NULL;
2166 
2167 	__block size_t count = 0;
2168 	stackshottypes_textsegment__v_visit(&tl->textsegments, ^(size_t __unused i, const stackshottypes_textsegment_s __unused *_Nonnull item) {
2169 		count++;
2170 	});
2171 
2172 	if (!count) {
2173 		goto error_exit;
2174 	}
2175 
2176 	kcdata_compression_window_open(kcdata_ptr);
2177 	kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGMENTS,
2178 	    sizeof(struct exclave_textlayout_segment_v2), count, (mach_vm_address_t*)&info));
2179 
2180 	stackshottypes_textsegment__v_visit(&tl->textsegments, ^(size_t __unused i, const stackshottypes_textsegment_s *_Nonnull item) {
2181 		memcpy(&info->layoutSegment_uuid, item->uuid, sizeof(uuid_t));
2182 		info->layoutSegment_loadAddress = item->loadaddress;
2183 		if (want_raw_addresses) {
2184 		        info->layoutSegment_rawLoadAddress = item->rawloadaddress.has_value ? item->rawloadaddress.value: 0;
2185 		} else {
2186 		        info->layoutSegment_rawLoadAddress = 0;
2187 		}
2188 		info++;
2189 	});
2190 
2191 	kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2192 
2193 error_exit:
2194 	return error;
2195 }
2196 
2197 static kern_return_t
2198 stackshot_exclaves_process_textlayout(const stackshottypes_textlayout_s *_Nonnull tl, void *kcdata_ptr, bool want_raw_addresses)
2199 {
2200 	kern_return_t error = KERN_SUCCESS;
2201 	__block struct exclave_textlayout_info info = { 0 };
2202 
2203 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2204 	    STACKSHOT_KCCONTAINER_EXCLAVE_TEXTLAYOUT, tl->textlayoutid));
2205 
2206 	// tightbeam optional interfaces don't have enough const.
2207 	u32__opt_s sharedcacheindex_opt = tl->sharedcacheindex;
2208 	const uint32_t *sharedcache_index = u32__opt_get(&sharedcacheindex_opt);
2209 
2210 	info.layout_id = tl->textlayoutid;
2211 
2212 	info.etl_flags =
2213 	    (want_raw_addresses ? 0 : kExclaveTextLayoutLoadAddressesUnslid) |
2214 	    (sharedcache_index == NULL ? 0 : kExclaveTextLayoutHasSharedCache);
2215 	info.sharedcache_index = (sharedcache_index == NULL) ? UINT32_MAX : *sharedcache_index;
2216 
2217 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_INFO, sizeof(struct exclave_textlayout_info), &info));
2218 	kcd_exit_on_error(stackshot_exclaves_process_textlayout_segments(tl, kcdata_ptr, want_raw_addresses));
2219 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2220 	    STACKSHOT_KCCONTAINER_EXCLAVE_TEXTLAYOUT, tl->textlayoutid));
2221 error_exit:
2222 	return error;
2223 }
2224 
2225 static kern_return_t
2226 stackshot_exclaves_process_addressspace(const stackshottypes_addressspace_s *_Nonnull as, void *kcdata_ptr, bool want_raw_addresses)
2227 {
2228 	kern_return_t error = KERN_SUCCESS;
2229 	struct exclave_addressspace_info info = { 0 };
2230 	__block size_t name_len = 0;
2231 	uint8_t * name = NULL;
2232 
2233 	u8__v_visit(&as->name, ^(size_t __unused i, const uint8_t __unused item) {
2234 		name_len++;
2235 	});
2236 
2237 	info.eas_id = as->asid;
2238 
2239 	if (want_raw_addresses && as->rawaddressslide.has_value) {
2240 		info.eas_flags = kExclaveAddressSpaceHaveSlide;
2241 		info.eas_slide = as->rawaddressslide.value;
2242 	} else {
2243 		info.eas_flags = 0;
2244 		info.eas_slide = UINT64_MAX;
2245 	}
2246 
2247 	info.eas_layoutid = as->textlayoutid; // text layout for this address space
2248 	info.eas_asroot = as->asroot.has_value ? as->asroot.value : 0;
2249 
2250 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2251 	    STACKSHOT_KCCONTAINER_EXCLAVE_ADDRESSSPACE, as->asid));
2252 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_ADDRESSSPACE_INFO, sizeof(struct exclave_addressspace_info), &info));
2253 
2254 	if (name_len > 0) {
2255 		kcdata_compression_window_open(kcdata_ptr);
2256 		kcd_exit_on_error(kcdata_get_memory_addr(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_ADDRESSSPACE_NAME, name_len + 1, (mach_vm_address_t*)&name));
2257 
2258 		u8__v_visit(&as->name, ^(size_t i, const uint8_t item) {
2259 			name[i] = item;
2260 		});
2261 		name[name_len] = 0;
2262 
2263 		kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2264 	}
2265 
2266 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2267 	    STACKSHOT_KCCONTAINER_EXCLAVE_ADDRESSSPACE, as->asid));
2268 error_exit:
2269 	return error;
2270 }
2271 
2272 kern_return_t
2273 stackshot_exclaves_process_stackshot(const stackshot_stackshotresult_s *result, void *kcdata_ptr, bool want_raw_addresses);
2274 
2275 kern_return_t
2276 stackshot_exclaves_process_stackshot(const stackshot_stackshotresult_s *result, void *kcdata_ptr, bool want_raw_addresses)
2277 {
2278 	__block kern_return_t kr = KERN_SUCCESS;
2279 
2280 	stackshot_stackshotentry__v_visit(&result->stackshotentries, ^(size_t __unused i, const stackshot_stackshotentry_s *_Nonnull item) {
2281 		if (kr == KERN_SUCCESS) {
2282 		        kr = stackshot_exclaves_process_stackshotentry(item, kcdata_ptr);
2283 		}
2284 	});
2285 
2286 	stackshottypes_addressspace__v_visit(&result->addressspaces, ^(size_t __unused i, const stackshottypes_addressspace_s *_Nonnull item) {
2287 		if (kr == KERN_SUCCESS) {
2288 		        kr = stackshot_exclaves_process_addressspace(item, kcdata_ptr, want_raw_addresses);
2289 		}
2290 	});
2291 
2292 	stackshottypes_textlayout__v_visit(&result->textlayouts, ^(size_t __unused i, const stackshottypes_textlayout_s *_Nonnull item) {
2293 		if (kr == KERN_SUCCESS) {
2294 		        kr = stackshot_exclaves_process_textlayout(item, kcdata_ptr, want_raw_addresses);
2295 		}
2296 	});
2297 
2298 	return kr;
2299 }
2300 
2301 kern_return_t
2302 stackshot_exclaves_process_result(kern_return_t collect_kr, const stackshot_stackshotresult_s *result, bool want_raw_addresses);
2303 
2304 kern_return_t
2305 stackshot_exclaves_process_result(kern_return_t collect_kr, const stackshot_stackshotresult_s *result, bool want_raw_addresses)
2306 {
2307 	kern_return_t kr = KERN_SUCCESS;
2308 	if (result == NULL) {
2309 		return collect_kr;
2310 	}
2311 
2312 	kr = stackshot_exclaves_process_stackshot(result, stackshot_ctx.sc_finalized_kcdata, want_raw_addresses);
2313 
2314 	stackshot_exclave_kr = kr;
2315 
2316 	return kr;
2317 }
2318 
2319 
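/*
 * Takes a reference on each thread in the exclave inspection waitlist and sets its
 * TH_EXCLAVES_INSPECTION_STACKSHOT bit. Called while debug mode is active.
 */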
2320 static void
2321 commit_exclaves_ast(void)
2322 {
2323 	size_t i = 0;
2324 	thread_t thread = NULL;
2325 	size_t count;
2326 
2327 	assert(debug_mode_active());
2328 
2329 	count = os_atomic_load(&stackshot_exclave_inspect_ctid_count, acquire);
2330 
2331 	if (stackshot_exclave_inspect_ctids) {
2332 		for (i = 0; i < count; ++i) {
2333 			thread = ctid_get_thread(stackshot_exclave_inspect_ctids[i]);
2334 			assert(thread);
2335 			thread_reference(thread);
2336 			os_atomic_or(&thread->th_exclaves_inspection_state, TH_EXCLAVES_INSPECTION_STACKSHOT, relaxed);
2337 		}
2338 	}
2339 }
2340 
2341 #endif /* CONFIG_EXCLAVES */
2342 
2343 kern_return_t
2344 kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user)
2345 {
2346 	int error = 0;
2347 	boolean_t prev_interrupt_state;
2348 	bool did_copyout = false;
2349 	uint32_t bytes_traced = 0;
2350 	uint32_t stackshot_estimate = 0;
2351 	struct kdp_snapshot_args snapshot_args;
2352 
2353 	void * buf_to_free = NULL;
2354 	int size_to_free = 0;
2355 	bool is_traced = false;    /* has FUNC_START tracepoint fired? */
2356 	uint64_t tot_interrupts_off_abs = 0; /* sum(time with interrupts off) */
2357 
2358 	/* Parsed arguments */
2359 	uint64_t                out_buffer_addr;
2360 	uint64_t                out_size_addr;
2361 	uint32_t                size_hint = 0;
2362 
2363 	snapshot_args.pagetable_mask = STACKSHOT_PAGETABLES_MASK_ALL;
2364 
2365 	if (stackshot_config == NULL) {
2366 		return KERN_INVALID_ARGUMENT;
2367 	}
2368 #if DEVELOPMENT || DEBUG
2369 	/* TBD: ask stackshot clients to avoid issuing stackshots in this
2370 	 * configuration in lieu of the kernel feature override.
2371 	 */
2372 	if (kern_feature_override(KF_STACKSHOT_OVRD) == TRUE) {
2373 		return KERN_NOT_SUPPORTED;
2374 	}
2375 #endif
2376 
2377 	switch (stackshot_config_version) {
2378 	case STACKSHOT_CONFIG_TYPE:
2379 		if (stackshot_config_size != sizeof(stackshot_config_t)) {
2380 			return KERN_INVALID_ARGUMENT;
2381 		}
2382 		stackshot_config_t *config = (stackshot_config_t *) stackshot_config;
2383 		out_buffer_addr = config->sc_out_buffer_addr;
2384 		out_size_addr = config->sc_out_size_addr;
2385 		snapshot_args.pid = config->sc_pid;
2386 		snapshot_args.flags = config->sc_flags;
2387 		snapshot_args.since_timestamp = config->sc_delta_timestamp;
2388 		if (config->sc_size <= max_tracebuf_size) {
2389 			size_hint = config->sc_size;
2390 		}
2391 		/*
2392 		 * Retain the pre-sc_pagetable_mask behavior of STACKSHOT_PAGE_TABLES,
2393 		 * dump every level if the pagetable_mask is not set
2394 		 */
2395 		if (snapshot_args.flags & STACKSHOT_PAGE_TABLES && config->sc_pagetable_mask) {
2396 			snapshot_args.pagetable_mask = config->sc_pagetable_mask;
2397 		}
2398 		break;
2399 	default:
2400 		return KERN_NOT_SUPPORTED;
2401 	}
2402 
2403 	/*
2404 	 * Currently saving a kernel buffer and trylock are only supported from the
2405 	 * internal/KEXT API.
2406 	 */
2407 	if (stackshot_from_user) {
2408 		if (snapshot_args.flags & (STACKSHOT_TRYLOCK | STACKSHOT_SAVE_IN_KERNEL_BUFFER | STACKSHOT_FROM_PANIC)) {
2409 			return KERN_NO_ACCESS;
2410 		}
2411 #if !DEVELOPMENT && !DEBUG
2412 		if (snapshot_args.flags & (STACKSHOT_DO_COMPRESS)) {
2413 			return KERN_NO_ACCESS;
2414 		}
2415 #endif
2416 	} else {
2417 		if (!(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2418 			return KERN_NOT_SUPPORTED;
2419 		}
2420 	}
2421 
2422 	if (!((snapshot_args.flags & STACKSHOT_KCDATA_FORMAT) || (snapshot_args.flags & STACKSHOT_RETRIEVE_EXISTING_BUFFER))) {
2423 		return KERN_NOT_SUPPORTED;
2424 	}
2425 
2426 	/* Compressed delta stackshots or page dumps are not yet supported */
2427 	if (((snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) || (snapshot_args.flags & STACKSHOT_PAGE_TABLES))
2428 	    && (snapshot_args.flags & STACKSHOT_DO_COMPRESS)) {
2429 		return KERN_NOT_SUPPORTED;
2430 	}
2431 
2432 	/*
2433 	 * If we're not saving the buffer in a kernel buffer, we need somewhere to copy it out to.
2434 	 */
2435 	if ((!out_buffer_addr || !out_size_addr) && !(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2436 		return KERN_INVALID_ARGUMENT;
2437 	}
2438 
2439 	if (snapshot_args.since_timestamp != 0 && ((snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) == 0)) {
2440 		return KERN_INVALID_ARGUMENT;
2441 	}
2442 
2443 	/* EXCLAVES and SKIP_EXCLAVES conflict */
2444 	if ((snapshot_args.flags & (STACKSHOT_EXCLAVES | STACKSHOT_SKIP_EXCLAVES)) == (STACKSHOT_EXCLAVES | STACKSHOT_SKIP_EXCLAVES)) {
2445 		return KERN_INVALID_ARGUMENT;
2446 	}
2447 
2448 #if CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS
2449 	if (!mt_core_supported) {
2450 		snapshot_args.flags &= ~STACKSHOT_INSTRS_CYCLES;
2451 	}
2452 #else /* CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS */
2453 	snapshot_args.flags &= ~STACKSHOT_INSTRS_CYCLES;
2454 #endif /* !CONFIG_PERVASIVE_CPI || !CONFIG_CPU_COUNTERS */
2455 
2456 	STACKSHOT_TESTPOINT(TP_WAIT_START_STACKSHOT);
2457 	STACKSHOT_SUBSYS_LOCK();
2458 
2459 	stackshot_tries = 0;
2460 
2461 	if (snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER) {
2462 		/*
2463 		 * Don't overwrite an existing stackshot
2464 		 */
2465 		if (kernel_stackshot_buf != NULL) {
2466 			error = KERN_MEMORY_PRESENT;
2467 			goto error_early_exit;
2468 		}
2469 	} else if (snapshot_args.flags & STACKSHOT_RETRIEVE_EXISTING_BUFFER) {
2470 		if ((kernel_stackshot_buf == NULL) || (kernel_stackshot_buf_size <= 0)) {
2471 			error = KERN_NOT_IN_SET;
2472 			goto error_early_exit;
2473 		}
2474 		error = stackshot_remap_buffer(kernel_stackshot_buf, kernel_stackshot_buf_size,
2475 		    out_buffer_addr, out_size_addr);
2476 		/*
2477 		 * If we successfully remapped the buffer into the user's address space, we
2478 		 * set buf_to_free and size_to_free so the prior kernel mapping will be removed
2479 		 * and then clear the kernel stackshot pointer and associated size.
2480 		 */
2481 		if (error == KERN_SUCCESS) {
2482 			did_copyout = true;
2483 			buf_to_free = kernel_stackshot_buf;
2484 			size_to_free = (int) VM_MAP_ROUND_PAGE(kernel_stackshot_buf_size, PAGE_MASK);
2485 			kernel_stackshot_buf = NULL;
2486 			kernel_stackshot_buf_size = 0;
2487 		}
2488 
2489 		goto error_early_exit;
2490 	}
2491 
2492 	if (snapshot_args.flags & STACKSHOT_GET_BOOT_PROFILE) {
2493 		void *bootprofile = NULL;
2494 		uint32_t len = 0;
2495 #if CONFIG_TELEMETRY
2496 		bootprofile_get(&bootprofile, &len);
2497 #endif
2498 		if (!bootprofile || !len) {
2499 			error = KERN_NOT_IN_SET;
2500 			goto error_early_exit;
2501 		}
2502 		error = stackshot_remap_buffer(bootprofile, len, out_buffer_addr, out_size_addr);
2503 		if (error == KERN_SUCCESS) {
2504 			did_copyout = true;
2505 		}
2506 		goto error_early_exit;
2507 	}
2508 
2509 	stackshot_duration_prior_abs = 0;
2510 	stackshot_initial_estimate_adj = os_atomic_load(&stackshot_estimate_adj, relaxed);
2511 	snapshot_args.buffer_size = stackshot_estimate =
2512 	    get_stackshot_estsize(size_hint, stackshot_initial_estimate_adj, snapshot_args.flags, snapshot_args.pid);
2513 	stackshot_initial_estimate = stackshot_estimate;
2514 
2515 	// ensure at least one attempt, even if the initial size from estimate was too big
2516 	snapshot_args.buffer_size = MIN(snapshot_args.buffer_size, max_tracebuf_size);
2517 
2518 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD) | DBG_FUNC_START,
2519 	    snapshot_args.flags, snapshot_args.buffer_size, snapshot_args.pid, snapshot_args.since_timestamp);
2520 	is_traced = true;
2521 
2522 #if CONFIG_EXCLAVES
2523 	assert(!stackshot_exclave_inspect_ctids);
2524 #endif
2525 
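	/* Retry loop: double the buffer size (capped at max_tracebuf_size) whenever the snapshot does not fit. */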
2526 	for (; snapshot_args.buffer_size <= max_tracebuf_size; snapshot_args.buffer_size = MIN(snapshot_args.buffer_size << 1, max_tracebuf_size)) {
2527 		stackshot_tries++;
2528 		if ((error = kmem_alloc(kernel_map, (vm_offset_t *)&snapshot_args.buffer, snapshot_args.buffer_size,
2529 		    KMA_ZERO | KMA_DATA_SHARED, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) {
2530 			os_log_error(OS_LOG_DEFAULT, "stackshot: initial allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, snapshot_args.buffer_size, max_tracebuf_size, stackshot_tries);
2531 			error = KERN_RESOURCE_SHORTAGE;
2532 			goto error_exit;
2533 		}
2534 
2535 		uint32_t hdr_tag = (snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2536 		    : (snapshot_args.flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
2537 		    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2538 		#pragma unused(hdr_tag)
2539 
2540 		stackshot_duration_outer = NULL;
2541 
2542 		/* if compression was requested, allocate the extra zlib scratch area */
2543 		if (snapshot_args.flags & STACKSHOT_DO_COMPRESS) {
2544 			hdr_tag = (snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2545 			    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2546 			if (error != KERN_SUCCESS) {
2547 				os_log_error(OS_LOG_DEFAULT, "failed to initialize compression: %d!\n",
2548 				    (int) error);
2549 				goto error_exit;
2550 			}
2551 		}
2552 
2553 		/* Prepare the compressor for a stackshot */
2554 		error = vm_compressor_kdp_init();
2555 		if (error != KERN_SUCCESS) {
2556 			goto error_exit;
2557 		}
2558 
2559 		/*
2560 		 * Disable interrupts and save the current interrupt state.
2561 		 */
2562 		prev_interrupt_state = ml_set_interrupts_enabled(FALSE);
2563 		uint64_t time_start  = mach_absolute_time();
2564 
2565 		/* Emit a SOCD tracepoint that we are initiating a stackshot */
2566 		SOCD_TRACE_XNU_START(STACKSHOT);
2567 
2568 		/*
2569 		 * Load stackshot parameters.
2570 		 */
2571 		error = kdp_snapshot_preflight_internal(snapshot_args);
2572 
2573 		if (error == KERN_SUCCESS) {
2574 			error = stackshot_trap();
2575 		}
2576 
2577 		/* Emit a SOCD tracepoint that we have completed the stackshot */
2578 		SOCD_TRACE_XNU_END(STACKSHOT);
2579 		ml_set_interrupts_enabled(prev_interrupt_state);
2580 
2581 #if CONFIG_EXCLAVES
2582 		/* stackshot trap should only finish successfully or with no pending Exclave threads */
2583 		assert(error == KERN_SUCCESS || stackshot_exclave_inspect_ctids == NULL);
2584 #endif
2585 
2586 		/*
2587 		 * Stackshot is no longer active.
2588 		 * (We have to do this here for the special interrupt disable timeout case to work)
2589 		 */
2590 		os_atomic_store(&stackshot_ctx.sc_state, SS_INACTIVE, release);
2591 
2592 		/* Release compressor kdp buffers */
2593 		vm_compressor_kdp_teardown();
2594 
2595 		/* Record duration that interrupts were disabled */
2596 		uint64_t time_end = mach_absolute_time();
2597 		tot_interrupts_off_abs += (time_end - time_start);
2598 
2599 		/* Collect multithreaded kcdata into one finalized buffer */
2600 		if (error == KERN_SUCCESS && !stackshot_ctx.sc_is_singlethreaded) {
2601 			error = stackshot_collect_kcdata();
2602 		}
2603 
2604 #if CONFIG_EXCLAVES
2605 		if (stackshot_exclave_inspect_ctids) {
2606 			if (error == KERN_SUCCESS) {
2607 				if (stackshot_exclave_inspect_ctid_count > 0) {
2608 					STACKSHOT_TESTPOINT(TP_START_COLLECTION);
2609 				}
2610 				error = collect_exclave_threads(snapshot_args.flags);
2611 			}
2612 			stackshot_cleanup_exclave_waitlist();
2613 		}
2614 #endif /* CONFIG_EXCLAVES */
2615 
2616 		if (error == KERN_SUCCESS) {
2617 			if (stackshot_ctx.sc_is_singlethreaded) {
2618 				error = stackshot_finalize_singlethreaded_kcdata();
2619 			} else {
2620 				error = stackshot_finalize_kcdata();
2621 			}
2622 
2623 			if ((error != KERN_SUCCESS) && (error != KERN_INSUFFICIENT_BUFFER_SIZE)) {
2624 				goto error_exit;
2625 			}
2626 			if (error == KERN_INSUFFICIENT_BUFFER_SIZE && snapshot_args.buffer_size == max_tracebuf_size) {
2627 				os_log_error(OS_LOG_DEFAULT, "stackshot: final buffer size was insufficient at maximum size: "
2628 				    "try %llu, estimate %u, flags %llu, pid %d, "
2629 				    "tasks: %d, terminated_tasks %d, threads: %d, terminated_threads: %d\n",
2630 				    stackshot_tries, snapshot_args.buffer_size, snapshot_args.flags, snapshot_args.pid,
2631 				    tasks_count, terminated_tasks_count,
2632 				    threads_count, terminated_threads_count);
2633 				error = KERN_RESOURCE_SHORTAGE;
2634 				goto error_exit;
2635 			}
2636 		}
2637 
2638 		/* record the duration that interrupts were disabled + kcdata was being finalized */
2639 		if (stackshot_duration_outer) {
2640 			*stackshot_duration_outer = mach_absolute_time() - time_start;
2641 		}
2642 
2643 		if (error != KERN_SUCCESS) {
2644 			os_log_error(OS_LOG_DEFAULT, "stackshot: debugger call failed: %d, try %llu, buffer %u estimate %u\n", (int)error, stackshot_tries, snapshot_args.buffer_size, stackshot_estimate);
2645 			kmem_free(kernel_map, (vm_offset_t)snapshot_args.buffer, snapshot_args.buffer_size);
2646 			snapshot_args.buffer = NULL;
2647 			if (error == KERN_INSUFFICIENT_BUFFER_SIZE) {
2648 				/*
2649 				 * If we didn't allocate a big enough buffer, deallocate and try again.
2650 				 */
2651 				KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD_SHORT) | DBG_FUNC_NONE,
2652 				    time_end - time_start, stackshot_estimate, snapshot_args.buffer_size);
2653 				stackshot_duration_prior_abs += (time_end - time_start);
2654 				if (snapshot_args.buffer_size == max_tracebuf_size) {
2655 					os_log_error(OS_LOG_DEFAULT, "stackshot: initial buffer size was insufficient at maximum size: "
2656 					    "try %llu, estimate %u, flags %llu, pid %d, "
2657 					    "tasks: %d, terminated_tasks %d, threads: %d, terminated_threads: %d\n",
2658 					    stackshot_tries, snapshot_args.buffer_size, snapshot_args.flags, snapshot_args.pid,
2659 					    tasks_count, terminated_tasks_count,
2660 					    threads_count, terminated_threads_count);
2661 					error = KERN_RESOURCE_SHORTAGE;
2662 					goto error_exit;
2663 				}
2664 				continue;
2665 			} else {
2666 				goto error_exit;
2667 			}
2668 		}
2669 
2670 		bytes_traced = kdp_stack_snapshot_bytes_traced();
2671 		if (bytes_traced <= 0) {
2672 			error = KERN_ABORTED;
2673 			goto error_exit;
2674 		}
2675 
2676 		if (!(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2677 			error = stackshot_remap_buffer(snapshot_args.buffer, bytes_traced, out_buffer_addr, out_size_addr);
2678 			if (error == KERN_SUCCESS) {
2679 				did_copyout = true;
2680 			}
2681 			goto error_exit;
2682 		}
2683 
2684 		if (!(snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT)) {
2685 			os_log_info(OS_LOG_DEFAULT, "stackshot: succeeded, traced %u bytes to %u buffer (estimate %u) try %llu\n", bytes_traced, snapshot_args.buffer_size, stackshot_estimate, stackshot_tries);
2686 		}
2687 
2688 		/*
2689 		 * Save the stackshot in the kernel buffer.
2690 		 */
2691 		kernel_stackshot_buf = snapshot_args.buffer;
2692 		kernel_stackshot_buf_size =  bytes_traced;
2693 		/*
2694 		 * Figure out whether we used all the pages in the buffer. If not, set buf_to_free to the beginning of
2695 		 * the next page after the end of the stackshot so that kmem_free clips the buffer, and update
2696 		 * size_to_free for kmem_free accordingly.
2697 		 */
2698 		size_to_free = snapshot_args.buffer_size - (int) VM_MAP_ROUND_PAGE(bytes_traced, PAGE_MASK);
2699 
2700 		assert(size_to_free >= 0);
2701 
2702 		if (size_to_free != 0) {
2703 			buf_to_free = (void *)((uint64_t)snapshot_args.buffer + snapshot_args.buffer_size - size_to_free);
2704 		}
2705 
2706 		snapshot_args.buffer = NULL;
2707 		snapshot_args.buffer_size = 0;
2708 		goto error_exit;
2709 	}
2710 
2711 error_exit:
2712 	if (is_traced) {
2713 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD) | DBG_FUNC_END,
2714 		    error, tot_interrupts_off_abs, snapshot_args.buffer_size, bytes_traced);
2715 	}
2716 
2717 error_early_exit:
2718 	if (snapshot_args.buffer != NULL) {
2719 		kmem_free(kernel_map, (vm_offset_t)snapshot_args.buffer, snapshot_args.buffer_size);
2720 	}
2721 	if (buf_to_free != NULL) {
2722 		kmem_free(kernel_map, (vm_offset_t)buf_to_free, size_to_free);
2723 	}
2724 
2725 	if (error == KERN_SUCCESS && !(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER) && !did_copyout) {
2726 		/* If we return success, we must have done the copyout to userspace. If
2727 		 * we somehow did not, we need to indicate failure instead.
2728 		 */
2729 #if DEVELOPMENT || DEBUG
2730 		os_log_error(OS_LOG_DEFAULT, "stackshot: reached end without doing copyout\n");
2731 #endif // DEVELOPMENT || DEBUG
2732 		error = KERN_FAILURE;
2733 	}
2734 
2735 	STACKSHOT_SUBSYS_UNLOCK();
2736 	STACKSHOT_TESTPOINT(TP_STACKSHOT_DONE);
2737 
2738 	return error;
2739 }
2740 
2741 /*
2742  * Set up state and parameters for a stackshot.
2743  * (This runs on the calling CPU before other CPUs enter the debugger trap.)
2744  * Called when interrupts are disabled, but we're not in the debugger trap yet.
2745  */
2746 __result_use_check
2747 static kern_return_t
2748 kdp_snapshot_preflight_internal(struct kdp_snapshot_args args)
2749 {
2750 	kern_return_t error = KERN_SUCCESS;
2751 	uint64_t microsecs = 0, secs = 0;
2752 	bool is_panic = ((args.flags & STACKSHOT_FROM_PANIC) != 0);
2753 	bool process_scoped = (args.pid != -1) &&
2754 	    ((args.flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0);
2755 	bool is_singlethreaded = stackshot_single_thread || (process_scoped || is_panic || ((args.flags & STACKSHOT_PAGE_TABLES) != 0));
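	/*
	 * Editor's note: the stackshot falls back to the single-threaded path whenever any of these hold:
	 * the stackshot_single_thread override is set, the request is scoped to one process (pid != -1
	 * without STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL), this is a panic stackshot, or page tables
	 * were requested (STACKSHOT_PAGE_TABLES). Other requests may fan work out across CPUs/clusters.
	 */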
2756 	clock_get_calendar_microtime((clock_sec_t *)&secs, (clock_usec_t *)&microsecs);
2757 
2758 	cur_stackshot_ctx_idx = (is_panic ? STACKSHOT_CTX_IDX_PANIC : STACKSHOT_CTX_IDX_NORMAL);
2759 
2760 	/* Setup overall state */
2761 	stackshot_ctx = (struct stackshot_context) {
2762 		.sc_args               = args,
2763 		.sc_state              = SS_SETUP,
2764 		.sc_bytes_traced       = 0,
2765 		.sc_bytes_uncompressed = 0,
2766 		.sc_microsecs          = microsecs + (secs * USEC_PER_SEC),
2767 		.sc_panic_stackshot    = is_panic,
2768 		.sc_is_singlethreaded  = is_singlethreaded,
2769 		.sc_cpus_working       = 0,
2770 		.sc_retval             = 0,
2771 		.sc_calling_cpuid      = cpu_number(),
2772 		.sc_main_cpuid         = is_singlethreaded ? cpu_number() : -1,
2773 		.sc_min_kcdata_size    = get_stackshot_est_tasksize(args.flags),
2774 		.sc_enable_faulting    = false,
2775 	};
2776 
2777 	if (!stackshot_ctx.sc_panic_stackshot) {
2778 #if defined(__AMP__)
2779 		/* On AMP systems, we want to split the buffers up by cluster to avoid cache line effects. */
2780 		stackshot_ctx.sc_num_buffers = is_singlethreaded ? 1 : ml_get_cluster_count();
2781 #else /* __AMP__ */
2782 		stackshot_ctx.sc_num_buffers = 1;
2783 #endif /* !__AMP__ */
2784 
2785 		/*
2786 		 * Set all buffer sizes to zero. We'll use each buffer's ssb_size to track how many CPUs in
2787 		 * the corresponding cluster are participating in the stackshot.
2788 		 */
2789 		bzero(stackshot_ctx.sc_buffers, sizeof(stackshot_ctx.sc_buffers));
2790 
2791 		/* Setup per-cpu state */
2792 		percpu_foreach_base(base) {
2793 			*PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu) = (struct stackshot_cpu_context) { 0 };
2794 		}
2795 
2796 		if (is_singlethreaded) {
2797 			/* If the stackshot is singlethreaded, set up the kcdata - we don't bother with linked-list kcdata in singlethreaded mode. */
2798 			uint32_t hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2799 			    : (stackshot_flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
2800 			    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2801 			kcdata_memory_static_init(stackshot_kcdata_p, (mach_vm_address_t) stackshot_args.buffer, hdr_tag,
2802 			    stackshot_args.buffer_size, KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER);
2803 			if (stackshot_flags & STACKSHOT_DO_COMPRESS) {
2804 				hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2805 				    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2806 				kcd_exit_on_error(kcdata_init_compress(stackshot_kcdata_p, hdr_tag, kdp_memcpy, KCDCT_ZLIB));
2807 			}
2808 			stackshot_cpu_ctx.scc_stack_buffer = kcdata_endalloc(stackshot_kcdata_p, sizeof(uintptr_t) * MAX_FRAMES);
2809 		}
2810 	} else {
2811 		/*
2812 		 * If this is a panic stackshot, we need to handle things differently.
2813 		 * The panic code hands us a kcdata descriptor to work with instead of
2814 		 * us making one ourselves.
2815 		 */
2816 		*stackshot_kcdata_p = *stackshot_args.descriptor;
2817 		stackshot_cpu_ctx = (struct stackshot_cpu_context) {
2818 			.scc_can_work = true,
2819 			.scc_stack_buffer = kcdata_endalloc(stackshot_kcdata_p, sizeof(uintptr_t) * MAX_FRAMES)
2820 		};
2821 #if STACKSHOT_COLLECTS_LATENCY_INFO
2822 		*(PERCPU_GET(stackshot_trace_buffer)) = (struct stackshot_trace_buffer) {};
2823 #endif
2824 	}
2825 
2826 	/* Set up our cpu state */
2827 	stackshot_cpu_preflight();
2828 
2829 error_exit:
2830 	return error;
2831 }
2832 
2833 /*
2834  * The old function signature for kdp_snapshot_preflight, used in the panic path.
2835  * Called when interrupts are disabled, but we're not in the debugger trap yet.
2836  */
2837 void
2838 kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint64_t flags,
2839     kcdata_descriptor_t data_p, uint64_t since_timestamp, uint32_t pagetable_mask)
2840 {
2841 	__assert_only kern_return_t err;
2842 	err = kdp_snapshot_preflight_internal((struct kdp_snapshot_args) {
2843 		.pid = pid,
2844 		.buffer = tracebuf,
2845 		.buffer_size = tracebuf_size,
2846 		.flags = flags,
2847 		.descriptor = data_p,
2848 		.since_timestamp = since_timestamp,
2849 		.pagetable_mask = pagetable_mask
2850 	});
2851 
2852 
2853 	/* This shouldn't ever return an error in the panic path. */
2854 	assert(err == KERN_SUCCESS);
2855 }
2856 
2857 static void
2858 stackshot_reset_state(void)
2859 {
2860 	stackshot_ctx = (struct stackshot_context) { 0 };
2861 }
2862 
2863 void
2864 panic_stackshot_reset_state(void)
2865 {
2866 	stackshot_reset_state();
2867 }
2868 
2869 boolean_t
2870 stackshot_active(void)
2871 {
2872 	return os_atomic_load(&stackshot_ctx.sc_state, relaxed) != SS_INACTIVE;
2873 }
2874 
2875 boolean_t
2876 panic_stackshot_active(void)
2877 {
2878 	return os_atomic_load(&stackshot_contexts[STACKSHOT_CTX_IDX_PANIC].sc_state, relaxed) != SS_INACTIVE;
2879 }
2880 
2881 uint32_t
2882 kdp_stack_snapshot_bytes_traced(void)
2883 {
2884 	return stackshot_ctx.sc_bytes_traced;
2885 }
2886 
2887 uint32_t
2888 kdp_stack_snapshot_bytes_uncompressed(void)
2889 {
2890 	return stackshot_ctx.sc_bytes_uncompressed;
2891 }
2892 
2893 static boolean_t
2894 memory_iszero(void *addr, size_t size)
2895 {
2896 	char *data = (char *)addr;
2897 	for (size_t i = 0; i < size; i++) {
2898 		if (data[i] != 0) {
2899 			return FALSE;
2900 		}
2901 	}
2902 	return TRUE;
2903 }
2904 
2905 static void
2906 _stackshot_validation_reset(void)
2907 {
2908 	percpu_foreach_base(base) {
2909 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
2910 		cpu_ctx->scc_validation_state.last_valid_page_kva = -1;
2911 		cpu_ctx->scc_validation_state.last_valid_size = 0;
2912 	}
2913 }
2914 
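/*
 * Editor's note (descriptive): _stackshot_validate_kva() keeps a small per-CPU cache of the last
 * kernel virtual page that passed ml_validate_nofault(). Stack walks and string copies tend to probe
 * the same page repeatedly, so a hit on the cached page skips the revalidation; a miss falls through
 * to ml_validate_nofault() and refreshes the cache. _stackshot_validation_reset() above clears the
 * cache on every CPU.
 */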
2915 static bool
2916 _stackshot_validate_kva(vm_offset_t addr, size_t size)
2917 {
2918 	vm_offset_t page_addr = atop_kernel(addr);
2919 	if (stackshot_cpu_ctx.scc_validation_state.last_valid_page_kva == page_addr &&
2920 	    stackshot_cpu_ctx.scc_validation_state.last_valid_size <= size) {
2921 		return true;
2922 	}
2923 
2924 	if (ml_validate_nofault(addr, size)) {
2925 		stackshot_cpu_ctx.scc_validation_state.last_valid_page_kva = page_addr;
2926 		stackshot_cpu_ctx.scc_validation_state.last_valid_size = size;
2927 		return true;
2928 	}
2929 	return false;
2930 }
2931 
2932 static long
2933 _stackshot_strlen(const char *s, size_t maxlen)
2934 {
2935 	size_t len = 0;
2936 	for (len = 0; _stackshot_validate_kva((vm_offset_t)s, 1); len++, s++) {
2937 		if (*s == 0) {
2938 			return len;
2939 		}
2940 		if (len >= maxlen) {
2941 			return -1;
2942 		}
2943 	}
2944 	return -1; /* failed before end of string */
2945 }
2946 
2947 
2948 static size_t
2949 stackshot_plh_est_size(void)
2950 {
2951 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
2952 	size_t size = STASKSHOT_PLH_SIZE(stackshot_port_label_size);
2953 
2954 	if (size == 0) {
2955 		return 0;
2956 	}
2957 #define SIZE_EST(x) ROUNDUP((x), sizeof (uintptr_t))
2958 	return SIZE_EST(size * sizeof(*plh->plh_array)) +
2959 	       SIZE_EST(size * sizeof(*plh->plh_chains)) +
2960 	       SIZE_EST(size * sizeof(*stackshot_cpu_ctx.scc_plh_gen.pgs_gen) * real_ncpus) +
2961 	       SIZE_EST((1ul << STACKSHOT_PLH_SHIFT) * sizeof(*plh->plh_hash));
2962 #undef SIZE_EST
2963 }
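/*
 * Editor's note (descriptive, hedged): the estimate above mirrors the four allocations made by
 * stackshot_plh_setup() below -- plh_array, plh_chains, the per-CPU pgs_gen generation arrays
 * (hence the real_ncpus factor), and the (1 << STACKSHOT_PLH_SHIFT)-entry plh_hash bucket array --
 * with each term rounded up to pointer alignment by SIZE_EST, presumably to match the granularity
 * of stackshot_alloc_with_size().
 */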
2964 
2965 static void
2966 stackshot_plh_reset(void)
2967 {
2968 	stackshot_ctx.sc_plh = (struct port_label_hash){.plh_size = 0};  /* structure assignment */
2969 }
2970 
2971 static kern_return_t
2972 stackshot_plh_setup(void)
2973 {
2974 	kern_return_t error;
2975 	size_t size;
2976 	bool percpu_alloc_failed = false;
2977 	struct port_label_hash plh = {
2978 		.plh_size = STASKSHOT_PLH_SIZE(stackshot_port_label_size),
2979 		.plh_count = 0,
2980 	};
2981 
2982 	stackshot_plh_reset();
2983 
2984 	percpu_foreach_base(base) {
2985 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
2986 		cpu_ctx->scc_plh_gen = (struct _stackshot_plh_gen_state){
2987 			.pgs_gen = NULL,
2988 			.pgs_curgen = 1,
2989 			.pgs_curgen_min = STACKSHOT_PLH_SIZE_MAX,
2990 			.pgs_curgen_max = 0,
2991 		};
2992 	}
2993 
2994 	size = plh.plh_size;
2995 	if (size == 0) {
2996 		return KERN_SUCCESS;
2997 	}
2998 	plh.plh_array = stackshot_alloc_with_size(size * sizeof(*plh.plh_array), &error);
2999 	plh.plh_chains = stackshot_alloc_with_size(size * sizeof(*plh.plh_chains), &error);
3000 	percpu_foreach_base(base) {
3001 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
3002 		cpu_ctx->scc_plh_gen.pgs_gen = stackshot_alloc_with_size(size * sizeof(*cpu_ctx->scc_plh_gen.pgs_gen), &error);
3003 		if (cpu_ctx->scc_plh_gen.pgs_gen == NULL) {
3004 			percpu_alloc_failed = true;
3005 			break;
3006 		}
3007 		for (int x = 0; x < size; x++) {
3008 			cpu_ctx->scc_plh_gen.pgs_gen[x] = 0;
3009 		}
3010 	}
3011 	plh.plh_hash = stackshot_alloc_with_size((1ul << STACKSHOT_PLH_SHIFT) * sizeof(*plh.plh_hash), &error);
3012 	if (error != KERN_SUCCESS) {
3013 		return error;
3014 	}
3015 	if (plh.plh_array == NULL || plh.plh_chains == NULL || percpu_alloc_failed || plh.plh_hash == NULL) {
3016 		PLH_STAT_OP(os_atomic_inc(&stackshot_ctx.sc_plh.plh_bad, relaxed));
3017 		return KERN_SUCCESS;
3018 	}
3019 	for (int x = 0; x < size; x++) {
3020 		plh.plh_array[x] = NULL;
3021 		plh.plh_chains[x] = -1;
3022 	}
3023 	for (int x = 0; x < (1ul << STACKSHOT_PLH_SHIFT); x++) {
3024 		plh.plh_hash[x] = -1;
3025 	}
3026 	stackshot_ctx.sc_plh = plh;  /* structure assignment */
3027 	return KERN_SUCCESS;
3028 }
3029 
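/*
 * Editor's note (illustrative): stackshot_plh_hash() folds the stripped port-label pointer down to
 * STACKSHOT_PLH_SHIFT bits by XORing the value with right-shifted copies of itself (shifts of 16x,
 * 8x, 4x, 2x and 1x the bucket-shift width) and then masking off the low bits, so every bit of the
 * pointer influences the bucket index even though pool-allocated labels share their low alignment
 * bits. For illustration only, assuming STACKSHOT_PLH_SHIFT == 7, a 64-bit pointer would be XORed
 * with itself shifted right by 112 (skipped by the PLH_HASH_STEP guard, since 112 >= 64), 56, 28,
 * 14 and 7 bits, and the low 7 bits of the result would select one of 128 buckets.
 */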
3030 static int16_t
3031 stackshot_plh_hash(struct ipc_service_port_label *ispl)
3032 {
3033 	uintptr_t ptr = VM_KERNEL_STRIP_PTR((uintptr_t)ispl);
3034 
3035 	static_assert(STACKSHOT_PLH_SHIFT < 16, "plh_hash must fit in 15 bits");
3036 #define PLH_HASH_STEP(ptr, x) \
3037 	    ((((x) * STACKSHOT_PLH_SHIFT) < (sizeof(ispl) * CHAR_BIT)) ? ((ptr) >> ((x) * STACKSHOT_PLH_SHIFT)) : 0)
3038 	ptr ^= PLH_HASH_STEP(ptr, 16);
3039 	ptr ^= PLH_HASH_STEP(ptr, 8);
3040 	ptr ^= PLH_HASH_STEP(ptr, 4);
3041 	ptr ^= PLH_HASH_STEP(ptr, 2);
3042 	ptr ^= PLH_HASH_STEP(ptr, 1);
3043 #undef PLH_HASH_STEP
3044 	return (int16_t)(ptr & ((1ul << STACKSHOT_PLH_SHIFT) - 1));
3045 }
3046 
3047 enum stackshot_plh_lookup_type {
3048 	STACKSHOT_PLH_LOOKUP_UNKNOWN,
3049 	STACKSHOT_PLH_LOOKUP_SEND,
3050 	STACKSHOT_PLH_LOOKUP_RECEIVE,
3051 };
3052 
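/*
 * Editor's note (descriptive, hedged): each CPU tracks which port-label slots it has referenced
 * since its last reset using a small generation counter. A lookup stamps pgs_gen[slot] with the
 * CPU's current generation and widens the [pgs_curgen_min, pgs_curgen_max] window;
 * kdp_stackshot_plh_record() then emits only the slots stamped with the current generation, and
 * stackshot_plh_resetgen() starts a new generation (zeroing the stamps when the 8-bit counter
 * wraps). Presumably this lets each portion of the stackshot record just the labels it referenced
 * without clearing the shared hash table.
 */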
3053 static void
3054 stackshot_plh_resetgen(void)
3055 {
3056 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3057 	uint16_t plh_size = stackshot_ctx.sc_plh.plh_size;
3058 
3059 	if (pgs->pgs_curgen_min == STACKSHOT_PLH_SIZE_MAX && pgs->pgs_curgen_max == 0) {
3060 		return;  // no lookups, nothing using the current generation
3061 	}
3062 	pgs->pgs_curgen++;
3063 	pgs->pgs_curgen_min = STACKSHOT_PLH_SIZE_MAX;
3064 	pgs->pgs_curgen_max = 0;
3065 	if (pgs->pgs_curgen == 0) { // wrapped, zero the array and increment the generation
3066 		for (int x = 0; x < plh_size; x++) {
3067 			pgs->pgs_gen[x] = 0;
3068 		}
3069 		pgs->pgs_curgen = 1;
3070 	}
3071 }
3072 
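/*
 * Editor's note (descriptive): the port label hash is a fixed-capacity hash table with separate
 * chaining stored in parallel arrays. plh_hash[bucket] holds the index of the first slot in that
 * bucket's chain (or -1), plh_chains[slot] holds the next slot in the chain (or -1), and
 * plh_array[slot] holds the ipc_service_port_label pointer stored in that slot. Lookup walks the
 * chain selected by stackshot_plh_hash(); on a miss the label is appended at index plh_count and
 * pushed onto the front of its bucket's chain. The returned port label id is slot + 1, keeping 0
 * distinct from any valid id.
 */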
3073 static int16_t
3074 stackshot_plh_lookup_locked(struct ipc_service_port_label *ispl, enum stackshot_plh_lookup_type type)
3075 {
3076 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3077 	int depth;
3078 	int16_t cur;
3079 	if (ispl == NULL) {
3080 		return STACKSHOT_PORTLABELID_NONE;
3081 	}
3082 	switch (type) {
3083 	case STACKSHOT_PLH_LOOKUP_SEND:
3084 		PLH_STAT_OP(os_atomic_inc(&plh->plh_lookup_send, relaxed));
3085 		break;
3086 	case STACKSHOT_PLH_LOOKUP_RECEIVE:
3087 		PLH_STAT_OP(os_atomic_inc(&plh->plh_lookup_receive, relaxed));
3088 		break;
3089 	default:
3090 		break;
3091 	}
3092 	PLH_STAT_OP(os_atomic_inc(&plh->plh_lookups, relaxed));
3093 	if (plh->plh_size == 0) {
3094 		return STACKSHOT_PORTLABELID_MISSING;
3095 	}
3096 	int16_t hash = stackshot_plh_hash(ispl);
3097 	assert(hash >= 0 && hash < (1ul << STACKSHOT_PLH_SHIFT));
3098 	depth = 0;
3099 	for (cur = plh->plh_hash[hash]; cur >= 0; cur = plh->plh_chains[cur]) {
3100 		/* cur must be in-range, and chain depth can never be above our # allocated */
3101 		if (cur >= plh->plh_count || depth > plh->plh_count || depth > plh->plh_size) {
3102 			PLH_STAT_OP(os_atomic_inc(&plh->plh_bad, relaxed));
3103 			PLH_STAT_OP(os_atomic_add(&plh->plh_bad_depth, depth, relaxed));
3104 			return STACKSHOT_PORTLABELID_MISSING;
3105 		}
3106 		assert(cur < plh->plh_count);
3107 		if (plh->plh_array[cur] == ispl) {
3108 			PLH_STAT_OP(os_atomic_inc(&plh->plh_found, relaxed));
3109 			PLH_STAT_OP(os_atomic_add(&plh->plh_found_depth, depth, relaxed));
3110 			goto found;
3111 		}
3112 		depth++;
3113 	}
3114 	/* not found in hash table, so alloc and insert it */
3115 	if (cur != -1) {
3116 		PLH_STAT_OP(os_atomic_inc(&plh->plh_bad, relaxed));
3117 		PLH_STAT_OP(os_atomic_add(&plh->plh_bad_depth, depth, relaxed));
3118 		return STACKSHOT_PORTLABELID_MISSING; /* bad end of chain */
3119 	}
3120 	PLH_STAT_OP(os_atomic_inc(&plh->plh_insert, relaxed));
3121 	PLH_STAT_OP(os_atomic_add(&plh->plh_insert_depth, depth, relaxed));
3122 	if (plh->plh_count >= plh->plh_size) {
3123 		return STACKSHOT_PORTLABELID_MISSING; /* no space */
3124 	}
3125 	cur = plh->plh_count;
3126 	plh->plh_count++;
3127 	plh->plh_array[cur] = ispl;
3128 	plh->plh_chains[cur] = plh->plh_hash[hash];
3129 	plh->plh_hash[hash] = cur;
3130 found:  ;
3131 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3132 	pgs->pgs_gen[cur] = pgs->pgs_curgen;
3133 	if (pgs->pgs_curgen_min > cur) {
3134 		pgs->pgs_curgen_min = cur;
3135 	}
3136 	if (pgs->pgs_curgen_max < cur) {
3137 		pgs->pgs_curgen_max = cur;
3138 	}
3139 	return cur + 1;   /* offset to avoid 0 */
3140 }
3141 
3142 static kern_return_t
3143 kdp_stackshot_plh_record_locked(void)
3144 {
3145 	kern_return_t error = KERN_SUCCESS;
3146 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3147 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3148 	uint16_t count = plh->plh_count;
3149 	uint8_t curgen = pgs->pgs_curgen;
3150 	int16_t curgen_min = pgs->pgs_curgen_min;
3151 	int16_t curgen_max = pgs->pgs_curgen_max;
3152 	if (curgen_min <= curgen_max && curgen_max < count &&
3153 	    count <= plh->plh_size && plh->plh_size <= STACKSHOT_PLH_SIZE_MAX) {
3154 		struct ipc_service_port_label **arr = plh->plh_array;
3155 		size_t ispl_size, max_namelen;
3156 		kdp_ipc_splabel_size(&ispl_size, &max_namelen);
3157 		for (int idx = curgen_min; idx <= curgen_max; idx++) {
3158 			struct ipc_service_port_label *ispl = arr[idx];
3159 			struct portlabel_info spl = {
3160 				.portlabel_id = (idx + 1),
3161 			};
3162 			const char *name = NULL;
3163 			long name_sz = 0;
3164 			if (pgs->pgs_gen[idx] != curgen) {
3165 				continue;
3166 			}
3167 			if (_stackshot_validate_kva((vm_offset_t)ispl, ispl_size)) {
3168 				kdp_ipc_fill_splabel(ispl, &spl, &name);
3169 #if STACKSHOT_COLLECTS_RDAR_126582377_DATA
3170 			} else {
3171 				if (ispl != NULL && (vm_offset_t)ispl >> 48 == 0x0000) {
3172 					ca_event_t event_to_send = os_atomic_xchg(&rdar_126582377_event, NULL, relaxed);
3173 					if (event_to_send) {
3174 						CA_EVENT_SEND(event_to_send);
3175 					}
3176 				}
3177 #endif
3178 			}
3179 
3180 			kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
3181 			    STACKSHOT_KCCONTAINER_PORTLABEL, idx + 1));
3182 			if (name != NULL && (name_sz = _stackshot_strlen(name, max_namelen)) > 0) {   /* validates the kva */
3183 				kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_PORTLABEL_NAME, name_sz + 1, name));
3184 			} else {
3185 				spl.portlabel_flags |= STACKSHOT_PORTLABEL_READFAILED;
3186 			}
3187 			kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_PORTLABEL, sizeof(spl), &spl));
3188 			kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
3189 			    STACKSHOT_KCCONTAINER_PORTLABEL, idx + 1));
3190 		}
3191 	}
3192 
3193 error_exit:
3194 	return error;
3195 }
3196 
3197 // record any PLH entries referenced since the last stackshot_plh_resetgen() call
3198 static kern_return_t
3199 kdp_stackshot_plh_record(void)
3200 {
3201 	kern_return_t error;
3202 	plh_lock(&stackshot_ctx.sc_plh);
3203 	error = kdp_stackshot_plh_record_locked();
3204 	plh_unlock(&stackshot_ctx.sc_plh);
3205 	return error;
3206 }
3207 
3208 static int16_t
3209 stackshot_plh_lookup(struct ipc_service_port_label *ispl, enum stackshot_plh_lookup_type type)
3210 {
3211 	int16_t result;
3212 	plh_lock(&stackshot_ctx.sc_plh);
3213 	result = stackshot_plh_lookup_locked(ispl, type);
3214 	plh_unlock(&stackshot_ctx.sc_plh);
3215 	return result;
3216 }
3217 
3218 #if DEVELOPMENT || DEBUG
3219 static kern_return_t
3220 kdp_stackshot_plh_stats(void)
3221 {
3222 	kern_return_t error = KERN_SUCCESS;
3223 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3224 
3225 #define PLH_STAT(x) do { if (os_atomic_load(&plh->x, relaxed) != 0) { \
3226 	kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, os_atomic_load(&plh->x, relaxed), "stackshot_" #x)); \
3227 } } while (0)
3228 	PLH_STAT(plh_size);
3229 	PLH_STAT(plh_lookups);
3230 	PLH_STAT(plh_found);
3231 	PLH_STAT(plh_found_depth);
3232 	PLH_STAT(plh_insert);
3233 	PLH_STAT(plh_insert_depth);
3234 	PLH_STAT(plh_bad);
3235 	PLH_STAT(plh_bad_depth);
3236 	PLH_STAT(plh_lookup_send);
3237 	PLH_STAT(plh_lookup_receive);
3238 #undef PLH_STAT
3239 
3240 error_exit:
3241 	return error;
3242 }
3243 #endif /* DEVELOPMENT || DEBUG */
3244 
3245 /*
3246  * This function can be called from stackshot / kdp context or
3247  * from telemetry / current task context
3248  */
3249 uint64_t
3250 kcdata_get_task_ss_flags(task_t task, bool from_stackshot)
3251 {
3252 	uint64_t ss_flags = 0;
3253 	boolean_t task_64bit_addr = task_has_64Bit_addr(task);
3254 	void *bsd_info = get_bsdtask_info(task);
3255 
3256 	if (task_64bit_addr) {
3257 		ss_flags |= kUser64_p;
3258 	}
3259 	if (!task->active || task_is_a_corpse(task) || proc_exiting(bsd_info)) {
3260 		ss_flags |= kTerminatedSnapshot;
3261 	}
3262 	if (task->pidsuspended) {
3263 		ss_flags |= kPidSuspended;
3264 	}
3265 	if (task->frozen) {
3266 		ss_flags |= kFrozen;
3267 	}
3268 	if (task->effective_policy.tep_darwinbg == 1) {
3269 		ss_flags |= kTaskDarwinBG;
3270 	}
3271 	if (task->requested_policy.trp_ext_darwinbg == 1) {
3272 		ss_flags |= kTaskExtDarwinBG;
3273 	}
3274 	if (task->requested_policy.trp_role == TASK_FOREGROUND_APPLICATION) {
3275 		ss_flags |= kTaskIsForeground;
3276 	}
3277 	if (task->requested_policy.trp_boosted == 1) {
3278 		ss_flags |= kTaskIsBoosted;
3279 	}
3280 	if (task->effective_policy.tep_sup_active == 1) {
3281 		ss_flags |= kTaskIsSuppressed;
3282 	}
3283 #if CONFIG_MEMORYSTATUS
3284 
3285 	boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE;
3286 	boolean_t is_active = FALSE, is_managed = FALSE, has_assertion = FALSE;
3287 	memorystatus_proc_flags_unsafe(bsd_info, &dirty, &dirty_tracked, &allow_idle_exit, &is_active, &is_managed, &has_assertion);
3288 	if (dirty) {
3289 		ss_flags |= kTaskIsDirty;
3290 	}
3291 	if (dirty_tracked) {
3292 		ss_flags |= kTaskIsDirtyTracked;
3293 	}
3294 	if (allow_idle_exit) {
3295 		ss_flags |= kTaskAllowIdleExit;
3296 	}
3297 	if (is_active) {
3298 		ss_flags |= kTaskIsActive;
3299 	}
3300 	if (is_managed) {
3301 		ss_flags |= kTaskIsManaged;
3302 	}
3303 	if (has_assertion) {
3304 		ss_flags |= kTaskHasAssertion;
3305 	}
3306 
3307 #endif
3308 	if (task->effective_policy.tep_tal_engaged) {
3309 		ss_flags |= kTaskTALEngaged;
3310 	}
3311 
3312 	if (from_stackshot) {
3313 		ss_flags |= workqueue_get_task_ss_flags_from_pwq_state_kdp(bsd_info);
3314 	}
3315 
3316 #if IMPORTANCE_INHERITANCE
3317 	if (task->task_imp_base) {
3318 		if (task->task_imp_base->iit_donor) {
3319 			ss_flags |= kTaskIsImpDonor;
3320 		}
3321 		if (task->task_imp_base->iit_live_donor) {
3322 			ss_flags |= kTaskIsLiveImpDonor;
3323 		}
3324 	}
3325 #endif
3326 
3327 	if (task->effective_policy.tep_runaway_mitigation) {
3328 		ss_flags |= kTaskRunawayMitigation;
3329 	}
3330 
3331 	if (task->t_flags & TF_TELEMETRY) {
3332 		ss_flags |= kTaskRsrcFlagged;
3333 	}
3334 
3335 	return ss_flags;
3336 }
3337 
3338 static kern_return_t
3339 kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_u64 *task_snap_ss_flags)
3340 {
3341 	kern_return_t error = KERN_SUCCESS;
3342 
3343 	uint64_t shared_cache_slide = 0;
3344 	uint64_t shared_cache_first_mapping = 0;
3345 	uint32_t shared_cache_id = 0;
3346 	struct dyld_shared_cache_loadinfo shared_cache_data = {0};
3347 
3348 
3349 	assert(task_snap_ss_flags != NULL);
3350 
3351 	/* Get basic info about the shared region pointer, regardless of any failures */
3352 	if (task->shared_region == NULL) {
3353 		*task_snap_ss_flags |= kTaskSharedRegionNone;
3354 	} else if (task->shared_region == primary_system_shared_region) {
3355 		*task_snap_ss_flags |= kTaskSharedRegionSystem;
3356 	} else {
3357 		*task_snap_ss_flags |= kTaskSharedRegionOther;
3358 	}
3359 
3360 	if (task->shared_region && _stackshot_validate_kva((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) {
3361 		struct vm_shared_region *sr = task->shared_region;
3362 		shared_cache_first_mapping = sr->sr_base_address + sr->sr_first_mapping;
3363 
3364 		shared_cache_id = sr->sr_id;
3365 	} else {
3366 		*task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable;
3367 		goto error_exit;
3368 	}
3369 
3370 	/*
3371 	 * We haven't copied in the shared region UUID yet as part of setup.
3372 	 * This seems to happen infrequently with DriverKit processes on certain
3373 	 * configurations, even once the process has already been set up.
3374 	 * rdar://139753101
3375 	 */
3376 	if (!shared_cache_first_mapping || !task->shared_region->sr_uuid_copied) {
3377 		*task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable;
3378 		goto error_exit;
3379 	}
3380 
3381 
3382 	/*
3383 	 * No refcounting here, but we are in debugger context, so that should be safe.
3384 	 */
3385 	shared_cache_slide = task->shared_region->sr_slide;
3386 
3387 	if (task->shared_region == primary_system_shared_region) {
3388 		/* skip adding shared cache info -- it's the same as the system level one */
3389 		goto error_exit;
3390 	}
3391 	/*
3392 	 * New-style shared cache reference: for non-primary shared regions,
3393 	 * just include the ID of the shared cache we're attached to.  Consumers
3394 	 * should use the following info from the task's ts_ss_flags as well:
3395 	 *
3396 	 * kTaskSharedRegionNone - task is not attached to a shared region
3397 	 * kTaskSharedRegionSystem - task is attached to the shared region
3398 	 *     with kSharedCacheSystemPrimary set in sharedCacheFlags.
3399 	 * kTaskSharedRegionOther - task is attached to the shared region with
3400 	 *     sharedCacheID matching the STACKSHOT_KCTYPE_SHAREDCACHE_ID entry.
3401 	 */
3402 	kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_ID, sizeof(shared_cache_id), &shared_cache_id));
3403 
3404 	/*
3405 	 * For backwards compatibility; this should eventually be removed.
3406 	 *
3407 	 * Historically, this data was in a dyld_uuid_info_64 structure, but the
3408 	 * naming of both the structure and fields for this use wasn't great.  The
3409 	 * dyld_shared_cache_loadinfo structure has better names, but the same
3410 	 * layout and content as the original.
3411 	 *
3412 	 * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
3413 	 * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
3414 	 * entries; here, it's the slid first mapping, and we leave it that way
3415 	 * for backwards compatibility.
3416 	 */
3417 	shared_cache_data.sharedCacheSlide = shared_cache_slide;
3418 	kdp_memcpy(&shared_cache_data.sharedCacheUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid));
3419 	shared_cache_data.sharedCacheUnreliableSlidBaseAddress = shared_cache_first_mapping;
3420 	shared_cache_data.sharedCacheSlidFirstMapping = shared_cache_first_mapping;
3421 	kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(shared_cache_data), &shared_cache_data));
3422 
3423 error_exit:
3424 	return error;
3425 }
3426 
3427 static kern_return_t
3428 kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 *task_snap_ss_flags)
3429 {
3430 	bool save_loadinfo_p         = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0);
3431 	bool save_kextloadinfo_p     = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0);
3432 	bool save_compactinfo_p      = ((trace_flags & STACKSHOT_SAVE_DYLD_COMPACTINFO) != 0);
3433 	bool should_fault            = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING);
3434 
3435 	kern_return_t error        = KERN_SUCCESS;
3436 	mach_vm_address_t out_addr = 0;
3437 
3438 	mach_vm_address_t dyld_compactinfo_addr = 0;
3439 	uint32_t dyld_compactinfo_size = 0;
3440 
3441 	uint32_t uuid_info_count         = 0;
3442 	mach_vm_address_t uuid_info_addr = 0;
3443 	uint64_t uuid_info_timestamp     = 0;
3444 	#pragma unused(uuid_info_timestamp)
3445 	kdp_fault_result_flags_t kdp_fault_results = 0;
3446 
3447 
3448 	assert(task_snap_ss_flags != NULL);
3449 
3450 	int task_pid     = pid_from_task(task);
3451 	boolean_t task_64bit_addr = task_has_64Bit_addr(task);
3452 
3453 	if ((save_loadinfo_p || save_compactinfo_p) && have_pmap && task->active && task_pid > 0) {
3454 		/* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */
3455 		if (task_64bit_addr) {
3456 			struct user64_dyld_all_image_infos task_image_infos;
3457 			if (stackshot_copyin(task->map, task->all_image_info_addr, &task_image_infos,
3458 			    sizeof(struct user64_dyld_all_image_infos), should_fault, &kdp_fault_results)) {
3459 				uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
3460 				uuid_info_addr = task_image_infos.uuidArray;
3461 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION) {
3462 					uuid_info_timestamp = task_image_infos.timestamp;
3463 				}
3464 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_COMPACTINFO_MINIMUM_VERSION) {
3465 					dyld_compactinfo_addr = task_image_infos.compact_dyld_image_info_addr;
3466 					dyld_compactinfo_size = task_image_infos.compact_dyld_image_info_size;
3467 				}
3468 
3469 			}
3470 		} else {
3471 			struct user32_dyld_all_image_infos task_image_infos;
3472 			if (stackshot_copyin(task->map, task->all_image_info_addr, &task_image_infos,
3473 			    sizeof(struct user32_dyld_all_image_infos), should_fault, &kdp_fault_results)) {
3474 				uuid_info_count = task_image_infos.uuidArrayCount;
3475 				uuid_info_addr = task_image_infos.uuidArray;
3476 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION) {
3477 					uuid_info_timestamp = task_image_infos.timestamp;
3478 				}
3479 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_COMPACTINFO_MINIMUM_VERSION) {
3480 					dyld_compactinfo_addr = task_image_infos.compact_dyld_image_info_addr;
3481 					dyld_compactinfo_size = task_image_infos.compact_dyld_image_info_size;
3482 				}
3483 			}
3484 		}
3485 
3486 		/*
3487 		 * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
3488 		 * this data structure), we zero the uuid_info_count so that we won't even try to save load info
3489 		 * for this task.
3490 		 */
3491 		if (!uuid_info_addr) {
3492 			uuid_info_count = 0;
3493 		}
3494 
3495 		if (!dyld_compactinfo_addr) {
3496 			dyld_compactinfo_size = 0;
3497 		}
3498 
3499 	}
3500 
3501 	if (have_pmap && task_pid == 0) {
3502 		if (save_kextloadinfo_p && _stackshot_validate_kva((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader))) {
3503 			uuid_info_count = gLoadedKextSummaries->numSummaries + 1; /* include main kernel UUID */
3504 		} else {
3505 			uuid_info_count = 1; /* include kernelcache UUID (embedded) or kernel UUID (desktop) */
3506 		}
3507 	}
3508 
3509 	if (save_compactinfo_p && task_pid > 0) {
3510 		if (dyld_compactinfo_size == 0) {
3511 			*task_snap_ss_flags |= kTaskDyldCompactInfoNone;
3512 		} else if (dyld_compactinfo_size > MAX_DYLD_COMPACTINFO) {
3513 			*task_snap_ss_flags |= kTaskDyldCompactInfoTooBig;
3514 		} else {
3515 			kdp_fault_result_flags_t ci_kdp_fault_results = 0;
3516 
3517 			/* Open a compression window to avoid overflowing the stack */
3518 			kcdata_compression_window_open(kcd);
3519 			kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_DYLD_COMPACTINFO,
3520 			    dyld_compactinfo_size, &out_addr));
3521 
3522 			if (!stackshot_copyin(task->map, dyld_compactinfo_addr, (void *)out_addr,
3523 			    dyld_compactinfo_size, should_fault, &ci_kdp_fault_results)) {
3524 				bzero((void *)out_addr, dyld_compactinfo_size);
3525 			}
3526 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
3527 				*task_snap_ss_flags |= kTaskDyldCompactInfoMissing;
3528 			}
3529 
3530 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) {
3531 				*task_snap_ss_flags |= kTaskDyldCompactInfoTriedFault;
3532 			}
3533 
3534 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) {
3535 				*task_snap_ss_flags |= kTaskDyldCompactInfoFaultedIn;
3536 			}
3537 
3538 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
3539 		}
3540 	}
3541 	if (save_loadinfo_p && task_pid > 0 && (uuid_info_count < MAX_LOADINFOS)) {
3542 		uint32_t copied_uuid_count = 0;
3543 		uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
3544 		uint32_t uuid_info_array_size = 0;
3545 
3546 		/* Open a compression window to avoid overflowing the stack */
3547 		kcdata_compression_window_open(kcd);
3548 
3549 		/* If we found some UUID information, first try to copy it in -- this will only be non-zero if we had a pmap above */
3550 		if (uuid_info_count > 0) {
3551 			uuid_info_array_size = uuid_info_count * uuid_info_size;
3552 
3553 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
3554 			    uuid_info_size, uuid_info_count, &out_addr));
3555 
3556 			if (!stackshot_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) {
3557 				bzero((void *)out_addr, uuid_info_array_size);
3558 			} else {
3559 				copied_uuid_count = uuid_info_count;
3560 			}
3561 		}
3562 
3563 		uuid_t binary_uuid;
3564 		if (!copied_uuid_count && proc_binary_uuid_kdp(task, binary_uuid)) {
3565 			/* We failed to copy in the UUID information; try to store the UUID of the main binary we have in the proc instead. */
3566 			if (uuid_info_array_size == 0) {
3567 				/* We just need to store one UUID */
3568 				uuid_info_array_size = uuid_info_size;
3569 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
3570 				    uuid_info_size, 1, &out_addr));
3571 			}
3572 
3573 			if (task_64bit_addr) {
3574 				struct user64_dyld_uuid_info *uuid_info = (struct user64_dyld_uuid_info *)out_addr;
3575 				uint64_t image_load_address = task->mach_header_vm_address;
3576 
3577 				kdp_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
3578 				kdp_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
3579 			} else {
3580 				struct user32_dyld_uuid_info *uuid_info = (struct user32_dyld_uuid_info *)out_addr;
3581 				uint32_t image_load_address = (uint32_t) task->mach_header_vm_address;
3582 
3583 				kdp_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
3584 				kdp_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
3585 			}
3586 		}
3587 
3588 		kcd_exit_on_error(kcdata_compression_window_close(kcd));
3589 	} else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
3590 		uintptr_t image_load_address;
3591 
3592 		do {
3593 #if defined(__arm64__)
3594 			if (kernelcache_uuid_valid && !save_kextloadinfo_p) {
3595 				struct dyld_uuid_info_64 kc_uuid = {0};
3596 				kc_uuid.imageLoadAddress = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
3597 				kdp_memcpy(&kc_uuid.imageUUID, &kernelcache_uuid, sizeof(uuid_t));
3598 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64), &kc_uuid));
3599 				break;
3600 			}
3601 #endif /* defined(__arm64__) */
3602 
3603 			if (!kernel_uuid || !_stackshot_validate_kva((vm_offset_t)kernel_uuid, sizeof(uuid_t))) {
3604 				/* Kernel UUID not found or inaccessible */
3605 				break;
3606 			}
3607 
3608 			uint32_t uuid_type = KCDATA_TYPE_LIBRARY_LOADINFO;
3609 			if ((sizeof(kernel_uuid_info) == sizeof(struct user64_dyld_uuid_info))) {
3610 				uuid_type = KCDATA_TYPE_LIBRARY_LOADINFO64;
3611 #if  defined(__arm64__)
3612 				kc_format_t primary_kc_type = KCFormatUnknown;
3613 				if (PE_get_primary_kc_format(&primary_kc_type) && (primary_kc_type == KCFormatFileset)) {
3614 					/* return TEXT_EXEC based load information on arm devices running with fileset kernelcaches */
3615 					uuid_type = STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC;
3616 				}
3617 #endif
3618 			}
3619 
3620 			/*
3621 			 * The element count of the array can vary - avoid overflowing the
3622 			 * stack by opening a window.
3623 			 */
3624 			kcdata_compression_window_open(kcd);
3625 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, uuid_type,
3626 			    sizeof(kernel_uuid_info), uuid_info_count, &out_addr));
3627 			kernel_uuid_info *uuid_info_array = (kernel_uuid_info *)out_addr;
3628 
3629 			image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(vm_kernel_stext);
3630 #if defined(__arm64__)
3631 			if (uuid_type == STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC) {
3632 				/* If we're reporting TEXT_EXEC load info, populate the TEXT_EXEC base instead */
3633 				extern vm_offset_t segTEXTEXECB;
3634 				image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(segTEXTEXECB);
3635 			}
3636 #endif
3637 			uuid_info_array[0].imageLoadAddress = image_load_address;
3638 			kdp_memcpy(&uuid_info_array[0].imageUUID, kernel_uuid, sizeof(uuid_t));
3639 
3640 			if (save_kextloadinfo_p &&
3641 			    _stackshot_validate_kva((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader)) &&
3642 			    _stackshot_validate_kva((vm_offset_t)(&gLoadedKextSummaries->summaries[0]),
3643 			    gLoadedKextSummaries->entry_size * gLoadedKextSummaries->numSummaries)) {
3644 				uint32_t kexti;
3645 				for (kexti = 0; kexti < gLoadedKextSummaries->numSummaries; kexti++) {
3646 					image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].address);
3647 #if defined(__arm64__)
3648 					if (uuid_type == STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC) {
3649 						/* If we're reporting TEXT_EXEC load info, populate the TEXT_EXEC base instead */
3650 						image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].text_exec_address);
3651 					}
3652 #endif
3653 					uuid_info_array[kexti + 1].imageLoadAddress = image_load_address;
3654 					kdp_memcpy(&uuid_info_array[kexti + 1].imageUUID, &gLoadedKextSummaries->summaries[kexti].uuid, sizeof(uuid_t));
3655 				}
3656 			}
3657 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
3658 		} while (0);
3659 	}
3660 
3661 error_exit:
3662 	if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
3663 		*task_snap_ss_flags |= kTaskUUIDInfoMissing;
3664 	}
3665 
3666 	if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) {
3667 		*task_snap_ss_flags |= kTaskUUIDInfoTriedFault;
3668 	}
3669 
3670 	if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) {
3671 		*task_snap_ss_flags |= kTaskUUIDInfoFaultedIn;
3672 	}
3673 
3674 	return error;
3675 }
3676 
3677 uint64_t kdp_task_exec_meta_flags(task_t task);
3678 
3679 uint64_t
3680 kdp_task_exec_meta_flags(task_t task)
3681 {
3682 	uint64_t flags = 0;
3683 
3684 #if CONFIG_ROSETTA
3685 	if (task_is_translated(task)) {
3686 		flags |= kTaskExecTranslated;
3687 	}
3688 #endif /* CONFIG_ROSETTA */
3689 
3690 	if (task_has_hardened_heap(task)) {
3691 		flags |= kTaskExecHardenedHeap;
3692 	}
3693 
3694 
3695 	return flags;
3696 }
3697 
3698 /* Compute the set of flags that kdp_task_exec_meta_flags can return based on the kernel config */
3699 static uint64_t
3700 stackshot_available_task_exec_flags(void)
3701 {
3702 	uint64_t flags_mask = 0;
3703 
3704 #if CONFIG_ROSETTA
3705 	flags_mask |= kTaskExecTranslated;
3706 #endif /* CONFIG_ROSETTA */
3707 
3708 	flags_mask |= kTaskExecHardenedHeap;
3709 
3710 
3711 	return flags_mask;
3712 }
3713 
3714 static kern_return_t
3715 kcdata_record_task_exec_meta(kcdata_descriptor_t kcd, task_t task)
3716 {
3717 	struct task_exec_meta tem = {};
3718 	kern_return_t error = KERN_SUCCESS;
3719 
3720 	tem.tem_flags = kdp_task_exec_meta_flags(task);
3721 
3722 	if (tem.tem_flags != 0) {
3723 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_TASK_EXEC_META, sizeof(struct task_exec_meta), &tem));
3724 	}
3725 
3726 error_exit:
3727 	return error;
3728 }
3729 
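/*
 * Editor's note (descriptive, inferred from the usage pattern in this file): several of the records
 * below are too large to stage in a local variable on the limited, debugger-context kernel stack.
 * The kcdata_compression_window_open() / kcdata_get_memory_addr...() / kcdata_compression_window_close()
 * sequence instead reserves space directly in the kcdata buffer, lets the record be filled in place,
 * and compresses the completed window when STACKSHOT_DO_COMPRESS is in effect.
 */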
3730 static kern_return_t
3731 kcdata_record_task_iostats(kcdata_descriptor_t kcd, task_t task)
3732 {
3733 	kern_return_t error = KERN_SUCCESS;
3734 	mach_vm_address_t out_addr = 0;
3735 
3736 	/* I/O statistics, recorded only if any counters are non-zero */
3737 	assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES);
3738 	if (task->task_io_stats && !memory_iszero(task->task_io_stats, sizeof(struct io_stat_info))) {
3739 		/* struct io_stats_snapshot is quite large - avoid overflowing the stack. */
3740 		kcdata_compression_window_open(kcd);
3741 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr));
3742 		struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr;
3743 		_iostat->ss_disk_reads_count = task->task_io_stats->disk_reads.count;
3744 		_iostat->ss_disk_reads_size = task->task_io_stats->disk_reads.size;
3745 		_iostat->ss_disk_writes_count = (task->task_io_stats->total_io.count - task->task_io_stats->disk_reads.count);
3746 		_iostat->ss_disk_writes_size = (task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size);
3747 		_iostat->ss_paging_count = task->task_io_stats->paging.count;
3748 		_iostat->ss_paging_size = task->task_io_stats->paging.size;
3749 		_iostat->ss_non_paging_count = (task->task_io_stats->total_io.count - task->task_io_stats->paging.count);
3750 		_iostat->ss_non_paging_size = (task->task_io_stats->total_io.size - task->task_io_stats->paging.size);
3751 		_iostat->ss_metadata_count = task->task_io_stats->metadata.count;
3752 		_iostat->ss_metadata_size = task->task_io_stats->metadata.size;
3753 		_iostat->ss_data_count = (task->task_io_stats->total_io.count - task->task_io_stats->metadata.count);
3754 		_iostat->ss_data_size = (task->task_io_stats->total_io.size - task->task_io_stats->metadata.size);
3755 		for (int i = 0; i < IO_NUM_PRIORITIES; i++) {
3756 			_iostat->ss_io_priority_count[i] = task->task_io_stats->io_priority[i].count;
3757 			_iostat->ss_io_priority_size[i] = task->task_io_stats->io_priority[i].size;
3758 		}
3759 		kcd_exit_on_error(kcdata_compression_window_close(kcd));
3760 	}
3761 
3762 
3763 error_exit:
3764 	return error;
3765 }
3766 
3767 #if CONFIG_PERVASIVE_CPI
3768 static kern_return_t
3769 kcdata_record_task_instrs_cycles(kcdata_descriptor_t kcd, task_t task)
3770 {
3771 	struct instrs_cycles_snapshot_v2 instrs_cycles = { 0 };
3772 	struct recount_usage usage = { 0 };
3773 	struct recount_usage perf_only = { 0 };
3774 	recount_task_terminated_usage_perf_only(task, &usage, &perf_only);
3775 	instrs_cycles.ics_instructions = recount_usage_instructions(&usage);
3776 	instrs_cycles.ics_cycles = recount_usage_cycles(&usage);
3777 	instrs_cycles.ics_p_instructions = recount_usage_instructions(&perf_only);
3778 	instrs_cycles.ics_p_cycles = recount_usage_cycles(&perf_only);
3779 
3780 	return kcdata_push_data(kcd, STACKSHOT_KCTYPE_INSTRS_CYCLES, sizeof(instrs_cycles), &instrs_cycles);
3781 }
3782 #endif /* CONFIG_PERVASIVE_CPI */
3783 
3784 static kern_return_t
3785 kcdata_record_task_cpu_architecture(kcdata_descriptor_t kcd, task_t task)
3786 {
3787 	struct stackshot_cpu_architecture cpu_architecture = {0};
3788 	int32_t cputype;
3789 	int32_t cpusubtype;
3790 
3791 	proc_archinfo_kdp(get_bsdtask_info(task), &cputype, &cpusubtype);
3792 	cpu_architecture.cputype = cputype;
3793 	cpu_architecture.cpusubtype = cpusubtype;
3794 
3795 	return kcdata_push_data(kcd, STACKSHOT_KCTYPE_TASK_CPU_ARCHITECTURE, sizeof(struct stackshot_cpu_architecture), &cpu_architecture);
3796 }
3797 
3798 static kern_return_t
3799 kcdata_record_task_codesigning_info(kcdata_descriptor_t kcd, task_t task)
3800 {
3801 	struct stackshot_task_codesigning_info codesigning_info = {};
3802 	void * bsdtask_info = NULL;
3803 	uint32_t trust = 0;
3804 	kern_return_t ret = 0;
3805 	pmap_t pmap = get_task_pmap(task);
3806 	uint64_t cs_auxiliary_info = 0;
3807 	if (task != kernel_task) {
3808 		bsdtask_info = get_bsdtask_info(task);
3809 		codesigning_info.csflags = proc_getcsflags_kdp(bsdtask_info);
3810 		ret = get_trust_level_kdp(pmap, &trust);
3811 		if (ret != KERN_SUCCESS) {
3812 			trust = KCDATA_INVALID_CS_TRUST_LEVEL;
3813 		}
3814 		codesigning_info.cs_trust_level = trust;
3815 		cs_auxiliary_info = task_get_cs_auxiliary_info_kdp(task);
3816 	} else {
3817 		return KERN_SUCCESS;
3818 	}
3819 	ret = kcdata_push_data(kcd, STACKSHOT_KCTYPE_CODESIGNING_INFO, sizeof(struct stackshot_task_codesigning_info), &codesigning_info);
3820 	if (ret != KERN_SUCCESS) {
3821 		return ret;
3822 	}
3823 	return kcdata_push_data(kcd, TASK_CRASHINFO_CS_AUXILIARY_INFO, sizeof(cs_auxiliary_info), &cs_auxiliary_info);
3824 }
3825 
3826 static kern_return_t
3827 kcdata_record_task_jit_address_range(kcdata_descriptor_t kcd, task_t task)
3828 {
3829 	uint64_t jit_start_addr = 0;
3830 	uint64_t jit_end_addr = 0;
3831 	struct crashinfo_jit_address_range range = {};
3832 	kern_return_t ret = 0;
3833 	pmap_t pmap = get_task_pmap(task);
3834 	if (task == kernel_task || NULL == pmap) {
3835 		return KERN_SUCCESS;
3836 	}
3837 	ret = get_jit_address_range_kdp(pmap, (uintptr_t*)&jit_start_addr, (uintptr_t*)&jit_end_addr);
3838 	if (KERN_SUCCESS == ret) {
3839 		range.start_address = jit_start_addr;
3840 		range.end_address = jit_end_addr;
3841 		return kcdata_push_data(kcd, TASK_CRASHINFO_JIT_ADDRESS_RANGE, sizeof(struct crashinfo_jit_address_range), &range);
3842 	} else {
3843 		return KERN_SUCCESS;
3844 	}
3845 }
3846 
3847 #if CONFIG_TASK_SUSPEND_STATS
3848 static kern_return_t
3849 kcdata_record_task_suspension_info(kcdata_descriptor_t kcd, task_t task)
3850 {
3851 	kern_return_t ret = KERN_SUCCESS;
3852 	struct stackshot_suspension_info suspension_info = {};
3853 	task_suspend_stats_data_t suspend_stats;
3854 	task_suspend_source_array_t suspend_sources;
3855 	struct stackshot_suspension_source suspension_sources[TASK_SUSPEND_SOURCES_MAX];
3856 	int i;
3857 
3858 	if (task == kernel_task) {
3859 		return KERN_SUCCESS;
3860 	}
3861 
3862 	ret = task_get_suspend_stats_kdp(task, &suspend_stats);
3863 	if (ret != KERN_SUCCESS) {
3864 		return ret;
3865 	}
3866 
3867 	suspension_info.tss_count = suspend_stats.tss_count;
3868 	suspension_info.tss_duration = suspend_stats.tss_duration;
3869 	suspension_info.tss_last_end = suspend_stats.tss_last_end;
3870 	suspension_info.tss_last_start = suspend_stats.tss_last_start;
3871 	ret = kcdata_push_data(kcd, STACKSHOT_KCTYPE_SUSPENSION_INFO, sizeof(suspension_info), &suspension_info);
3872 	if (ret != KERN_SUCCESS) {
3873 		return ret;
3874 	}
3875 
3876 	ret = task_get_suspend_sources_kdp(task, suspend_sources);
3877 	if (ret != KERN_SUCCESS) {
3878 		return ret;
3879 	}
3880 
3881 	for (i = 0; i < TASK_SUSPEND_SOURCES_MAX; ++i) {
3882 		suspension_sources[i].tss_pid = suspend_sources[i].tss_pid;
3883 		strlcpy(suspension_sources[i].tss_procname, suspend_sources[i].tss_procname, sizeof(suspend_sources[i].tss_procname));
3884 		suspension_sources[i].tss_tid = suspend_sources[i].tss_tid;
3885 		suspension_sources[i].tss_time = suspend_sources[i].tss_time;
3886 	}
3887 	return kcdata_push_array(kcd, STACKSHOT_KCTYPE_SUSPENSION_SOURCE, sizeof(suspension_sources[0]), TASK_SUSPEND_SOURCES_MAX, &suspension_sources);
3888 }
3889 #endif /* CONFIG_TASK_SUSPEND_STATS */
3890 
3891 static kern_return_t
3892 kcdata_record_transitioning_task_snapshot(kcdata_descriptor_t kcd, task_t task, unaligned_u64 task_snap_ss_flags, uint64_t transition_type)
3893 {
3894 	kern_return_t error                 = KERN_SUCCESS;
3895 	mach_vm_address_t out_addr          = 0;
3896 	struct transitioning_task_snapshot * cur_tsnap = NULL;
3897 
3898 	int task_pid           = pid_from_task(task);
3899 	/* Is returning -1 OK for a terminating task? */
3900 	uint64_t task_uniqueid = get_task_uniqueid(task);
3901 
3902 	if (task_pid && (task_did_exec_internal(task) || task_is_exec_copy_internal(task))) {
3903 		/*
3904 		 * If this task is part of an exec transition (it did exec, or is the exec copy of
3905 		 * another task), show the pid as negative.
3906 		 */
3907 		task_pid = 0 - task_pid;
3908 	}
3909 
3910 	/* the transitioning_task_snapshot struct is large - avoid overflowing the stack */
3911 	kcdata_compression_window_open(kcd);
3912 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TRANSITIONING_TASK_SNAPSHOT, sizeof(struct transitioning_task_snapshot), &out_addr));
3913 	cur_tsnap = (struct transitioning_task_snapshot *)out_addr;
3914 	bzero(cur_tsnap, sizeof(*cur_tsnap));
3915 
3916 	cur_tsnap->tts_unique_pid = task_uniqueid;
3917 	cur_tsnap->tts_ss_flags = kcdata_get_task_ss_flags(task, true);
3918 	cur_tsnap->tts_ss_flags |= task_snap_ss_flags;
3919 	cur_tsnap->tts_transition_type = transition_type;
3920 	cur_tsnap->tts_pid = task_pid;
3921 
3922 	/* Add the BSD process identifiers */
3923 	if (task_pid != -1 && get_bsdtask_info(task) != NULL) {
3924 		proc_name_kdp(get_bsdtask_info(task), cur_tsnap->tts_p_comm, sizeof(cur_tsnap->tts_p_comm));
3925 	} else {
3926 		cur_tsnap->tts_p_comm[0] = '\0';
3927 	}
3928 
3929 	kcd_exit_on_error(kcdata_compression_window_close(kcd));
3930 
3931 error_exit:
3932 	return error;
3933 }
3934 
3935 static kern_return_t
3936 #if STACKSHOT_COLLECTS_LATENCY_INFO
3937 kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags, struct stackshot_latency_task *latency_info)
3938 #else
3939 kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags)
3940 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3941 {
3942 	bool collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
3943 	bool collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
3944 #if CONFIG_PERVASIVE_CPI
3945 	bool collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
3946 #endif /* CONFIG_PERVASIVE_CPI */
3947 #if __arm64__
3948 	bool collect_asid            = ((trace_flags & STACKSHOT_ASID) != 0);
3949 #endif
3950 	bool collect_pagetables      = ((trace_flags & STACKSHOT_PAGE_TABLES) != 0);
3951 
3952 
3953 	kern_return_t error                 = KERN_SUCCESS;
3954 	mach_vm_address_t out_addr          = 0;
3955 	struct task_snapshot_v3 * cur_tsnap = NULL;
3956 #if CONFIG_MEMORYSTATUS
3957 	mach_vm_address_t memorystatus_addr = 0;
3958 	struct task_memorystatus_snapshot *memorystatus_snapshot = NULL;
3959 #endif /* CONFIG_MEMORYSTATUS */
3960 #if STACKSHOT_COLLECTS_LATENCY_INFO
3961 	latency_info->cur_tsnap_latency = mach_absolute_time();
3962 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3963 
3964 	int task_pid           = pid_from_task(task);
3965 	uint64_t task_uniqueid = get_task_uniqueid(task);
3966 	void *bsd_info = get_bsdtask_info(task);
3967 	uint64_t proc_starttime_secs = 0;
3968 
3969 	if (task_pid && (task_did_exec_internal(task) || task_is_exec_copy_internal(task))) {
3970 		/*
3971 		 * If this task is part of an exec transition (it did exec, or is the exec copy of
3972 		 * another task), show the pid as negative.
3973 		 */
3974 		task_pid = 0 - task_pid;
3975 	}
3976 
3977 	/* the task_snapshot_v3 struct is large - avoid overflowing the stack */
3978 	kcdata_compression_window_open(kcd);
3979 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v3), &out_addr));
3980 	cur_tsnap = (struct task_snapshot_v3 *)out_addr;
3981 	bzero(cur_tsnap, sizeof(*cur_tsnap));
3982 
3983 	cur_tsnap->ts_unique_pid = task_uniqueid;
3984 	cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task, true);
3985 	cur_tsnap->ts_ss_flags |= task_snap_ss_flags;
3986 
3987 	struct recount_usage term_usage = { 0 };
3988 	recount_task_terminated_usage(task, &term_usage);
3989 	struct recount_times_mach term_times = recount_usage_times_mach(&term_usage);
3990 	cur_tsnap->ts_user_time_in_terminated_threads = term_times.rtm_user;
3991 	cur_tsnap->ts_system_time_in_terminated_threads = term_times.rtm_system;
3992 
3993 	proc_starttime_kdp(bsd_info, &proc_starttime_secs, NULL, NULL);
3994 	cur_tsnap->ts_p_start_sec = proc_starttime_secs;
3995 	cur_tsnap->ts_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
3996 	cur_tsnap->ts_max_resident_size = get_task_resident_max(task);
3997 	cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task);
3998 	cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task);
3999 
4000 	cur_tsnap->ts_suspend_count = task->suspend_count;
4001 	cur_tsnap->ts_faults = counter_load(&task->faults);
4002 	cur_tsnap->ts_pageins = counter_load(&task->pageins);
4003 	cur_tsnap->ts_cow_faults = counter_load(&task->cow_faults);
4004 	cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ?
4005 	    LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.tep_latency_qos);
4006 	cur_tsnap->ts_pid = task_pid;
4007 
4008 	/* Add the BSD process identifiers */
4009 	if (task_pid != -1 && bsd_info != NULL) {
4010 		proc_name_kdp(bsd_info, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm));
4011 		cur_tsnap->ts_uid = proc_getuid(bsd_info);
4012 		cur_tsnap->ts_gid = proc_getgid(bsd_info);
4013 	} else {
4014 		cur_tsnap->ts_p_comm[0] = '\0';
4015 		cur_tsnap->ts_uid = UINT32_MAX;
4016 		cur_tsnap->ts_gid = UINT32_MAX;
4017 #if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG)
4018 		if (task->task_imp_base != NULL) {
4019 			kdp_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0],
4020 			    MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm)));
4021 		}
4022 #endif /* IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) */
4023 	}
4024 
4025 #if CONFIG_MEMORYSTATUS
4026 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_MEMORYSTATUS, sizeof(struct task_memorystatus_snapshot), &memorystatus_addr));
4027 	memorystatus_snapshot = (struct task_memorystatus_snapshot *)memorystatus_addr;
4028 	bzero(memorystatus_snapshot, sizeof(*memorystatus_snapshot));
4029 
4030 
4031 	int32_t current_memlimit = 0, effectiveprio = 0, requestedprio = 0, assertionprio = 0;
4032 	proc_memstat_data_kdp(bsd_info, &current_memlimit, &effectiveprio, &requestedprio, &assertionprio);
4033 	memorystatus_snapshot->tms_current_memlimit = current_memlimit;
4034 	memorystatus_snapshot->tms_effectivepriority = effectiveprio;
4035 	memorystatus_snapshot->tms_requestedpriority = requestedprio;
4036 	memorystatus_snapshot->tms_assertionpriority = assertionprio;
4037 #endif /* CONFIG_MEMORYSTATUS */
4038 
4039 	kcd_exit_on_error(kcdata_compression_window_close(kcd));
4040 
4041 #if CONFIG_COALITIONS
4042 	if (task_pid != -1 && bsd_info != NULL &&
4043 	    (task->coalition[COALITION_TYPE_JETSAM] != NULL)) {
4044 		/*
4045 		 * The jetsam coalition ID is always saved, even if
4046 		 * STACKSHOT_SAVE_JETSAM_COALITIONS is not set.
4047 		 */
4048 		uint64_t jetsam_coal_id = coalition_id(task->coalition[COALITION_TYPE_JETSAM]);
4049 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_JETSAM_COALITION, sizeof(jetsam_coal_id), &jetsam_coal_id));
4050 	}
4051 #endif /* CONFIG_COALITIONS */
4052 
4053 #if __arm64__
4054 	if (collect_asid && have_pmap) {
4055 		uint32_t asid = PMAP_VASID(task->map->pmap);
4056 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_ASID, sizeof(asid), &asid));
4057 	}
4058 #endif
4059 
4060 #if STACKSHOT_COLLECTS_LATENCY_INFO
4061 	latency_info->cur_tsnap_latency = mach_absolute_time() - latency_info->cur_tsnap_latency;
4062 	latency_info->pmap_latency = mach_absolute_time();
4063 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4064 
4065 	if (collect_pagetables && have_pmap) {
4066 #if SCHED_HYGIENE_DEBUG
4067 		// pagetable dumps can be large; reset the interrupt timeout to avoid a panic
4068 		ml_spin_debug_clear_self();
4069 #endif
4070 		assert(stackshot_ctx.sc_is_singlethreaded);
4071 		size_t bytes_dumped = 0;
4072 		error = pmap_dump_page_tables(task->map->pmap, kcd_end_address(kcd), kcd_max_address(kcd), stackshot_args.pagetable_mask, &bytes_dumped);
4073 		if (error != KERN_SUCCESS) {
4074 			goto error_exit;
4075 		} else {
4076 			/* Variable size array - better not have it on the stack. */
4077 			kcdata_compression_window_open(kcd);
4078 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, STACKSHOT_KCTYPE_PAGE_TABLES,
4079 			    sizeof(uint64_t), (uint32_t)(bytes_dumped / sizeof(uint64_t)), &out_addr));
4080 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
4081 		}
4082 	}
4083 
4084 #if STACKSHOT_COLLECTS_LATENCY_INFO
4085 	latency_info->pmap_latency = mach_absolute_time() - latency_info->pmap_latency;
4086 	latency_info->bsd_proc_ids_latency = mach_absolute_time();
4087 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4088 
4089 #if STACKSHOT_COLLECTS_LATENCY_INFO
4090 	latency_info->bsd_proc_ids_latency = mach_absolute_time() - latency_info->bsd_proc_ids_latency;
4091 	latency_info->end_latency = mach_absolute_time();
4092 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4093 
4094 	if (collect_iostats) {
4095 		kcd_exit_on_error(kcdata_record_task_iostats(kcd, task));
4096 	}
4097 
4098 #if CONFIG_PERVASIVE_CPI
4099 	if (collect_instrs_cycles) {
4100 		kcd_exit_on_error(kcdata_record_task_instrs_cycles(kcd, task));
4101 	}
4102 #endif /* CONFIG_PERVASIVE_CPI */
4103 
4104 	kcd_exit_on_error(kcdata_record_task_cpu_architecture(kcd, task));
4105 	kcd_exit_on_error(kcdata_record_task_codesigning_info(kcd, task));
4106 	kcd_exit_on_error(kcdata_record_task_jit_address_range(kcd, task));
4107 
4108 #if CONFIG_TASK_SUSPEND_STATS
4109 	kcd_exit_on_error(kcdata_record_task_suspension_info(kcd, task));
4110 #endif /* CONFIG_TASK_SUSPEND_STATS */
4111 
4112 #if STACKSHOT_COLLECTS_LATENCY_INFO
4113 	latency_info->end_latency = mach_absolute_time() - latency_info->end_latency;
4114 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4115 
4116 error_exit:
4117 	return error;
4118 }
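/*
 * The kcd_exit_on_error() macro used throughout this file pairs with the
 * error_exit: label in each recording function.  A minimal sketch of the
 * presumed shape (the real definition lives elsewhere in the stackshot
 * sources and may differ in detail):
 *
 *	#define kcd_exit_on_error(action)        \
 *		do {                                 \
 *			error = (action);                \
 *			if (error != KERN_SUCCESS) {     \
 *				goto error_exit;             \
 *			}                                \
 *		} while (0)
 *
 * so every kcdata call is checked and the function unwinds through
 * error_exit with the failing kern_return_t.
 */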
4119 
4120 static kern_return_t
4121 kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags)
4122 {
4123 #if !CONFIG_PERVASIVE_CPI
4124 #pragma unused(trace_flags)
4125 #endif /* !CONFIG_PERVASIVE_CPI */
4126 	kern_return_t error                       = KERN_SUCCESS;
4127 	struct task_delta_snapshot_v2 * cur_tsnap = NULL;
4128 	mach_vm_address_t out_addr                = 0;
4129 	(void) trace_flags;
4130 #if __arm64__
4131 	boolean_t collect_asid                    = ((trace_flags & STACKSHOT_ASID) != 0);
4132 #endif
4133 #if CONFIG_PERVASIVE_CPI
4134 	boolean_t collect_instrs_cycles           = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
4135 #endif /* CONFIG_PERVASIVE_CPI */
4136 
4137 	uint64_t task_uniqueid = get_task_uniqueid(task);
4138 
4139 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT, sizeof(struct task_delta_snapshot_v2), &out_addr));
4140 
4141 	cur_tsnap = (struct task_delta_snapshot_v2 *)out_addr;
4142 
4143 	cur_tsnap->tds_unique_pid = task_uniqueid;
4144 	cur_tsnap->tds_ss_flags = kcdata_get_task_ss_flags(task, true);
4145 	cur_tsnap->tds_ss_flags |= task_snap_ss_flags;
4146 
4147 	struct recount_usage usage = { 0 };
4148 	recount_task_terminated_usage(task, &usage);
4149 	struct recount_times_mach term_times = recount_usage_times_mach(&usage);
4150 
4151 	cur_tsnap->tds_user_time_in_terminated_threads = term_times.rtm_user;
4152 	cur_tsnap->tds_system_time_in_terminated_threads = term_times.rtm_system;
4153 
4154 	cur_tsnap->tds_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
4155 
4156 	cur_tsnap->tds_max_resident_size = get_task_resident_max(task);
4157 	cur_tsnap->tds_suspend_count = task->suspend_count;
4158 	cur_tsnap->tds_faults            = counter_load(&task->faults);
4159 	cur_tsnap->tds_pageins           = counter_load(&task->pageins);
4160 	cur_tsnap->tds_cow_faults        = counter_load(&task->cow_faults);
4161 	cur_tsnap->tds_was_throttled     = (uint32_t)proc_was_throttled_from_task(task);
4162 	cur_tsnap->tds_did_throttle      = (uint32_t)proc_did_throttle_from_task(task);
4163 	cur_tsnap->tds_latency_qos       = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED)
4164 	    ? LATENCY_QOS_TIER_UNSPECIFIED
4165 	    : ((0xFF << 16) | task->effective_policy.tep_latency_qos);
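	/*
	 * Encoding note: a specified latency QoS tier is tagged with 0xFF in bits
	 * 16..23 so readers can tell an encoded tier from a raw value; e.g. tier 2
	 * is reported as (0xFF << 16) | 2 == 0x00FF0002, while the unspecified
	 * tier is passed through unchanged.
	 */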
4166 
4167 #if __arm64__
4168 	if (collect_asid && have_pmap) {
4169 		uint32_t asid = PMAP_VASID(task->map->pmap);
4170 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
4171 		kdp_memcpy((void*)out_addr, &asid, sizeof(asid));
4172 	}
4173 #endif
4174 
4175 #if CONFIG_PERVASIVE_CPI
4176 	if (collect_instrs_cycles) {
4177 		kcd_exit_on_error(kcdata_record_task_instrs_cycles(kcd, task));
4178 	}
4179 #endif /* CONFIG_PERVASIVE_CPI */
4180 
4181 error_exit:
4182 	return error;
4183 }
4184 
4185 static kern_return_t
4186 kcdata_record_thread_iostats(kcdata_descriptor_t kcd, thread_t thread)
4187 {
4188 	kern_return_t error = KERN_SUCCESS;
4189 	mach_vm_address_t out_addr = 0;
4190 
4191 	/* I/O Statistics */
4192 	assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES);
4193 	if (thread->thread_io_stats && !memory_iszero(thread->thread_io_stats, sizeof(struct io_stat_info))) {
4194 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr));
4195 		struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr;
4196 		_iostat->ss_disk_reads_count = thread->thread_io_stats->disk_reads.count;
4197 		_iostat->ss_disk_reads_size = thread->thread_io_stats->disk_reads.size;
4198 		_iostat->ss_disk_writes_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->disk_reads.count);
4199 		_iostat->ss_disk_writes_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->disk_reads.size);
4200 		_iostat->ss_paging_count = thread->thread_io_stats->paging.count;
4201 		_iostat->ss_paging_size = thread->thread_io_stats->paging.size;
4202 		_iostat->ss_non_paging_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->paging.count);
4203 		_iostat->ss_non_paging_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->paging.size);
4204 		_iostat->ss_metadata_count = thread->thread_io_stats->metadata.count;
4205 		_iostat->ss_metadata_size = thread->thread_io_stats->metadata.size;
4206 		_iostat->ss_data_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->metadata.count);
4207 		_iostat->ss_data_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->metadata.size);
4208 		for (int i = 0; i < IO_NUM_PRIORITIES; i++) {
4209 			_iostat->ss_io_priority_count[i] = thread->thread_io_stats->io_priority[i].count;
4210 			_iostat->ss_io_priority_size[i] = thread->thread_io_stats->io_priority[i].size;
4211 		}
4212 	}
4213 
4214 error_exit:
4215 	return error;
4216 }
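/*
 * In kcdata_record_thread_iostats() above, only reads, paging and metadata
 * I/O are tracked directly; their complements are derived from total_io:
 *
 *	ss_disk_writes_count = total_io.count - disk_reads.count
 *	ss_non_paging_size   = total_io.size  - paging.size
 *	ss_data_count        = total_io.count - metadata.count
 *
 * e.g. a thread with 10 total I/Os, 4 of them disk reads, reports 6 writes.
 */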
4217 
4218 bool
4219 machine_trace_thread_validate_kva(vm_offset_t addr)
4220 {
4221 	return _stackshot_validate_kva(addr, sizeof(uintptr_t));
4222 }
4223 
4224 struct _stackshot_backtrace_context {
4225 	vm_map_t sbc_map;
4226 	vm_offset_t sbc_prev_page;
4227 	vm_offset_t sbc_prev_kva;
4228 	uint32_t sbc_flags;
4229 	bool sbc_allow_faulting;
4230 };
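/*
 * sbc_prev_page/sbc_prev_kva act as a one-entry translation cache for
 * _stackshot_backtrace_copy() below: consecutive reads from the same user
 * page reuse the previous phystokv() mapping instead of re-walking the pmap.
 * sbc_flags accumulates the kThread*BT fault flags so the caller can fold
 * them into the thread snapshot's ss_flags.
 */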
4231 
4232 static errno_t
4233 _stackshot_backtrace_copy(void *vctx, void *dst, user_addr_t src, size_t size)
4234 {
4235 	struct _stackshot_backtrace_context *ctx = vctx;
4236 	size_t map_page_mask = 0;
4237 	size_t __assert_only map_page_size = kdp_vm_map_get_page_size(ctx->sbc_map,
4238 	    &map_page_mask);
4239 	assert(size < map_page_size);
4240 	if (src & (size - 1)) {
4241 		// The source should be aligned to the size passed in, like a stack
4242 		// frame or word.
4243 		return EINVAL;
4244 	}
4245 
4246 	vm_offset_t src_page = src & ~map_page_mask;
4247 	vm_offset_t src_kva = 0;
4248 
4249 	if (src_page != ctx->sbc_prev_page) {
4250 		uint32_t res = 0;
4251 		uint32_t flags = 0;
4252 		vm_offset_t src_pa = stackshot_find_phys(ctx->sbc_map, src,
4253 		    ctx->sbc_allow_faulting, &res);
4254 
4255 		flags |= (res & KDP_FAULT_RESULT_PAGED_OUT) ? kThreadTruncatedBT : 0;
4256 		flags |= (res & KDP_FAULT_RESULT_TRIED_FAULT) ? kThreadTriedFaultBT : 0;
4257 		flags |= (res & KDP_FAULT_RESULT_FAULTED_IN) ? kThreadFaultedBT : 0;
4258 		ctx->sbc_flags |= flags;
4259 		if (src_pa == 0) {
4260 			return EFAULT;
4261 		}
4262 
4263 		src_kva = phystokv(src_pa);
4264 		ctx->sbc_prev_page = src_page;
4265 		ctx->sbc_prev_kva = (src_kva & ~map_page_mask);
4266 	} else {
4267 		src_kva = ctx->sbc_prev_kva + (src & map_page_mask);
4268 	}
4269 
4270 #if KASAN
4271 	/*
4272 	 * KASan does not monitor accesses to userspace pages. Therefore, it is
4273 	 * pointless to maintain a shadow map for them. Instead, they are all
4274 	 * mapped to a single, always valid shadow map page. This approach saves
4275 	 * a considerable amount of shadow map pages which are limited and
4276 	 * precious.
4277 	 */
4278 	kasan_notify_address_nopoison(src_kva, size);
4279 #endif
4280 
4281 	memcpy(dst, (const void *)src_kva, size);
4282 
4283 	return 0;
4284 }
4285 
4286 static kern_return_t
4287 kcdata_record_thread_snapshot(kcdata_descriptor_t kcd, thread_t thread, task_t task, uint64_t trace_flags, boolean_t have_pmap, boolean_t thread_on_core)
4288 {
4289 	boolean_t dispatch_p              = ((trace_flags & STACKSHOT_GET_DQ) != 0);
4290 	boolean_t active_kthreads_only_p  = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
4291 	boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
4292 	boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
4293 #if CONFIG_PERVASIVE_CPI
4294 	boolean_t collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
4295 #endif /* CONFIG_PERVASIVE_CPI */
4296 	kern_return_t error        = KERN_SUCCESS;
4297 
4298 #if STACKSHOT_COLLECTS_LATENCY_INFO
4299 	struct stackshot_latency_thread latency_info;
4300 	latency_info.cur_thsnap1_latency = mach_absolute_time();
4301 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4302 
4303 	mach_vm_address_t out_addr = 0;
4304 	int saved_count            = 0;
4305 
4306 	struct thread_snapshot_v4 * cur_thread_snap = NULL;
4307 	char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE];
4308 
4309 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v4), &out_addr));
4310 	cur_thread_snap = (struct thread_snapshot_v4 *)out_addr;
4311 
4312 	/* Populate the thread snapshot header */
4313 	cur_thread_snap->ths_ss_flags = 0;
4314 	cur_thread_snap->ths_thread_id = thread_tid(thread);
4315 	cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event);
4316 	cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation);
4317 	cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix;
4318 
4319 	if (IPC_VOUCHER_NULL != thread->ith_voucher) {
4320 		cur_thread_snap->ths_voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher);
4321 	} else {
4322 		cur_thread_snap->ths_voucher_identifier = 0;
4323 	}
4324 
4325 #if STACKSHOT_COLLECTS_LATENCY_INFO
4326 	latency_info.cur_thsnap1_latency = mach_absolute_time() - latency_info.cur_thsnap1_latency;
4327 	latency_info.dispatch_serial_latency = mach_absolute_time();
4328 	latency_info.dispatch_label_latency = 0;
4329 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4330 
4331 	cur_thread_snap->ths_dqserialnum = 0;
4332 	if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) {
4333 		uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
4334 		if (dqkeyaddr != 0) {
4335 			uint64_t dqaddr = 0;
4336 			boolean_t copyin_ok = stackshot_copyin_word(task, dqkeyaddr, &dqaddr, FALSE, NULL);
4337 			if (copyin_ok && dqaddr != 0) {
4338 				uint64_t dqserialnumaddr = dqaddr + get_task_dispatchqueue_serialno_offset(task);
4339 				uint64_t dqserialnum = 0;
4340 				copyin_ok = stackshot_copyin_word(task, dqserialnumaddr, &dqserialnum, FALSE, NULL);
4341 				if (copyin_ok) {
4342 					cur_thread_snap->ths_ss_flags |= kHasDispatchSerial;
4343 					cur_thread_snap->ths_dqserialnum = dqserialnum;
4344 				}
4345 
4346 #if STACKSHOT_COLLECTS_LATENCY_INFO
4347 				latency_info.dispatch_serial_latency = mach_absolute_time() - latency_info.dispatch_serial_latency;
4348 				latency_info.dispatch_label_latency = mach_absolute_time();
4349 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4350 
4351 				/* try copying in the queue label */
4352 				uint64_t label_offs = get_task_dispatchqueue_label_offset(task);
4353 				if (label_offs) {
4354 					uint64_t dqlabeladdr = dqaddr + label_offs;
4355 					uint64_t actual_dqlabeladdr = 0;
4356 
4357 					copyin_ok = stackshot_copyin_word(task, dqlabeladdr, &actual_dqlabeladdr, FALSE, NULL);
4358 					if (copyin_ok && actual_dqlabeladdr != 0) {
4359 						char label_buf[STACKSHOT_QUEUE_LABEL_MAXSIZE];
4360 						int len;
4361 
4362 						bzero(label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE * sizeof(char));
4363 						len = stackshot_copyin_string(task, actual_dqlabeladdr, label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE, FALSE, NULL);
4364 						if (len > 0) {
4365 							mach_vm_address_t label_addr = 0;
4366 							kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL, len, &label_addr));
4367 							kdp_strlcpy((char*)label_addr, &label_buf[0], len);
4368 						}
4369 					}
4370 				}
4371 #if STACKSHOT_COLLECTS_LATENCY_INFO
4372 				latency_info.dispatch_label_latency = mach_absolute_time() - latency_info.dispatch_label_latency;
4373 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4374 			}
4375 		}
4376 	}
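	/*
	 * The dispatch-queue block above chases three user-space pointers: the
	 * per-thread dispatch queue address (thread_dispatchqaddr), then
	 * dqaddr + the task's serial-number offset for ths_dqserialnum, and
	 * optionally dqaddr + the label offset, which yields a pointer to the
	 * queue label string copied in with stackshot_copyin_string().  Any
	 * failed copyin simply leaves the corresponding field unset.
	 */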
4377 
4378 #if STACKSHOT_COLLECTS_LATENCY_INFO
4379 	if ((cur_thread_snap->ths_ss_flags & kHasDispatchSerial) == 0) {
4380 		latency_info.dispatch_serial_latency = 0;
4381 	}
4382 	latency_info.cur_thsnap2_latency = mach_absolute_time();
4383 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4384 
4385 	struct recount_times_mach times = recount_thread_times(thread);
4386 	cur_thread_snap->ths_user_time = times.rtm_user;
4387 	cur_thread_snap->ths_sys_time = times.rtm_system;
4388 
4389 	if (thread->thread_tag & THREAD_TAG_MAINTHREAD) {
4390 		cur_thread_snap->ths_ss_flags |= kThreadMain;
4391 	}
4392 	if (thread->effective_policy.thep_darwinbg) {
4393 		cur_thread_snap->ths_ss_flags |= kThreadDarwinBG;
4394 	}
4395 	if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) {
4396 		cur_thread_snap->ths_ss_flags |= kThreadIOPassive;
4397 	}
4398 	if (thread->suspend_count > 0) {
4399 		cur_thread_snap->ths_ss_flags |= kThreadSuspended;
4400 	}
4401 	if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
4402 		cur_thread_snap->ths_ss_flags |= kGlobalForcedIdle;
4403 	}
4404 #if CONFIG_EXCLAVES
4405 	/* save exclave thread for later collection */
4406 	if ((thread->th_exclaves_state & TH_EXCLAVES_RPC) && stackshot_exclave_inspect_ctids && !stackshot_ctx.sc_panic_stackshot) {
4407 		/* certain threads, like the collector, must never be inspected */
4408 		if ((os_atomic_load(&thread->th_exclaves_inspection_state, relaxed) & TH_EXCLAVES_INSPECTION_NOINSPECT) == 0) {
4409 			uint32_t ctid_index = os_atomic_inc_orig(&stackshot_exclave_inspect_ctid_count, acq_rel);
4410 			if (ctid_index < stackshot_exclave_inspect_ctid_capacity) {
4411 				stackshot_exclave_inspect_ctids[ctid_index] = thread_get_ctid(thread);
4412 			} else {
4413 				os_atomic_store(&stackshot_exclave_inspect_ctid_count, stackshot_exclave_inspect_ctid_capacity, release);
4414 			}
4415 			if ((os_atomic_load(&thread->th_exclaves_inspection_state, relaxed) & TH_EXCLAVES_INSPECTION_STACKSHOT) != 0) {
4416 				panic("stackshot: trying to inspect already-queued thread");
4417 			}
4418 		}
4419 	}
4420 #endif /* CONFIG_EXCLAVES */
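	/*
	 * Slot reservation above: os_atomic_inc_orig() returns the pre-increment
	 * index, so each eligible exclave thread claims a unique slot in
	 * stackshot_exclave_inspect_ctids.  If the capacity is exceeded, the
	 * count is clamped back to the capacity rather than recording the thread.
	 */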
4421 	if (thread_on_core) {
4422 		cur_thread_snap->ths_ss_flags |= kThreadOnCore;
4423 	}
4424 	if (stackshot_thread_is_idle_worker_unsafe(thread)) {
4425 		cur_thread_snap->ths_ss_flags |= kThreadIdleWorker;
4426 	}
4427 
4428 	/* make sure state flags defined in kcdata.h still match internal flags */
4429 	static_assert(SS_TH_WAIT == TH_WAIT);
4430 	static_assert(SS_TH_SUSP == TH_SUSP);
4431 	static_assert(SS_TH_RUN == TH_RUN);
4432 	static_assert(SS_TH_UNINT == TH_UNINT);
4433 	static_assert(SS_TH_TERMINATE == TH_TERMINATE);
4434 	static_assert(SS_TH_TERMINATE2 == TH_TERMINATE2);
4435 	static_assert(SS_TH_IDLE == TH_IDLE);
4436 
4437 	cur_thread_snap->ths_last_run_time           = thread->last_run_time;
4438 	cur_thread_snap->ths_last_made_runnable_time = thread->last_made_runnable_time;
4439 	cur_thread_snap->ths_state                   = thread->state;
4440 	cur_thread_snap->ths_sched_flags             = thread->sched_flags;
4441 	cur_thread_snap->ths_base_priority = thread->base_pri;
4442 	cur_thread_snap->ths_sched_priority = thread->sched_pri;
4443 	cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos;
4444 	cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos;
4445 	cur_thread_snap->ths_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
4446 	    thread->requested_policy.thrp_qos_workq_override);
4447 	cur_thread_snap->ths_io_tier = (uint8_t) proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
4448 	cur_thread_snap->ths_thread_t = VM_KERNEL_UNSLIDE_OR_PERM(thread);
4449 
4450 	static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t));
4451 	static_assert(sizeof(thread->requested_policy) == sizeof(uint64_t));
4452 	cur_thread_snap->ths_requested_policy = *(unaligned_u64 *) &thread->requested_policy;
4453 	cur_thread_snap->ths_effective_policy = *(unaligned_u64 *) &thread->effective_policy;
4454 
4455 #if STACKSHOT_COLLECTS_LATENCY_INFO
4456 	latency_info.cur_thsnap2_latency = mach_absolute_time()  - latency_info.cur_thsnap2_latency;
4457 	latency_info.thread_name_latency = mach_absolute_time();
4458 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4459 
4460 	/* if there is a thread name, add it to the buffer */
4461 	cur_thread_name[0] = '\0';
4462 	proc_threadname_kdp(get_bsdthread_info(thread), cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE);
4463 	if (strnlen(cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE) > 0) {
4464 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_NAME, sizeof(cur_thread_name), &out_addr));
4465 		kdp_memcpy((void *)out_addr, (void *)cur_thread_name, sizeof(cur_thread_name));
4466 	}
4467 
4468 #if STACKSHOT_COLLECTS_LATENCY_INFO
4469 	latency_info.thread_name_latency = mach_absolute_time()  - latency_info.thread_name_latency;
4470 	latency_info.sur_times_latency = mach_absolute_time();
4471 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4472 
4473 	/* record system, user, and runnable times */
4474 	time_value_t runnable_time;
4475 	thread_read_times(thread, NULL, NULL, &runnable_time);
4476 	clock_sec_t user_sec = 0, system_sec = 0;
4477 	clock_usec_t user_usec = 0, system_usec = 0;
4478 	absolutetime_to_microtime(times.rtm_user, &user_sec, &user_usec);
4479 	absolutetime_to_microtime(times.rtm_system, &system_sec, &system_usec);
4480 
4481 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times_v2), &out_addr));
4482 	struct stackshot_cpu_times_v2 *stackshot_cpu_times = (struct stackshot_cpu_times_v2 *)out_addr;
4483 	*stackshot_cpu_times = (struct stackshot_cpu_times_v2){
4484 		.user_usec = user_sec * USEC_PER_SEC + user_usec,
4485 		.system_usec = system_sec * USEC_PER_SEC + system_usec,
4486 		.runnable_usec = (uint64_t)runnable_time.seconds * USEC_PER_SEC + runnable_time.microseconds,
4487 	};
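	/*
	 * absolutetime_to_microtime() splits Mach absolute time into whole seconds
	 * plus residual microseconds; the snapshot recombines them into a single
	 * microsecond count, e.g. 3 s and 250000 us become
	 * 3 * USEC_PER_SEC + 250000 = 3250000 us.
	 */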
4488 
4489 #if STACKSHOT_COLLECTS_LATENCY_INFO
4490 	latency_info.sur_times_latency = mach_absolute_time()  - latency_info.sur_times_latency;
4491 	latency_info.user_stack_latency = mach_absolute_time();
4492 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4493 
4494 	/* Trace user stack, if any */
4495 	if (!active_kthreads_only_p && task->active && task->map != kernel_map) {
4496 		uint32_t user_ths_ss_flags = 0;
4497 
4498 		/*
4499 		 * We don't know how big the stacktrace will be, so read it into our
4500 		 * per-cpu buffer, then copy it to the kcdata.
4501 		 */
4502 		struct _stackshot_backtrace_context ctx = {
4503 			.sbc_map = task->map,
4504 			.sbc_allow_faulting = stackshot_ctx.sc_enable_faulting,
4505 			.sbc_prev_page = -1,
4506 			.sbc_prev_kva = -1,
4507 		};
4508 		struct backtrace_control ctl = {
4509 			.btc_user_thread = thread,
4510 			.btc_user_copy = _stackshot_backtrace_copy,
4511 			.btc_user_copy_context = &ctx,
4512 		};
4513 		struct backtrace_user_info info = BTUINFO_INIT;
4514 
4515 		saved_count = backtrace_user(stackshot_cpu_ctx.scc_stack_buffer, MAX_FRAMES, &ctl,
4516 		    &info);
4517 		if (saved_count > 0) {
4518 #if __LP64__
4519 #define STACKLR_WORDS STACKSHOT_KCTYPE_USER_STACKLR64
4520 #else // __LP64__
4521 #define STACKLR_WORDS STACKSHOT_KCTYPE_USER_STACKLR
4522 #endif // !__LP64__
4523 			/* Now, copy the stacktrace into kcdata. */
4524 			kcd_exit_on_error(kcdata_push_array(kcd, STACKLR_WORDS, sizeof(uintptr_t),
4525 			    saved_count, stackshot_cpu_ctx.scc_stack_buffer));
4526 			if (info.btui_info & BTI_64_BIT) {
4527 				user_ths_ss_flags |= kUser64_p;
4528 			}
4529 			if ((info.btui_info & BTI_TRUNCATED) ||
4530 			    (ctx.sbc_flags & kThreadTruncatedBT)) {
4531 				user_ths_ss_flags |= kThreadTruncatedBT;
4532 				user_ths_ss_flags |= kThreadTruncUserBT;
4533 			}
4534 			user_ths_ss_flags |= ctx.sbc_flags;
4535 			ctx.sbc_flags = 0;
4536 #if __LP64__
4537 			/* We only support async stacks on 64-bit kernels */
4538 			if (info.btui_async_frame_addr != 0) {
4539 				uint32_t async_start_offset = info.btui_async_start_index;
4540 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_USER_ASYNC_START_INDEX,
4541 				    sizeof(async_start_offset), &async_start_offset));
4542 				ctl.btc_frame_addr = info.btui_async_frame_addr;
4543 				ctl.btc_addr_offset = BTCTL_ASYNC_ADDR_OFFSET;
4544 				info = BTUINFO_INIT;
4545 				unsigned int async_count = backtrace_user(stackshot_cpu_ctx.scc_stack_buffer, MAX_FRAMES, &ctl,
4546 				    &info);
4547 				if (async_count > 0) {
4548 					kcd_exit_on_error(kcdata_push_array(kcd, STACKSHOT_KCTYPE_USER_ASYNC_STACKLR64,
4549 					    sizeof(uintptr_t), async_count, stackshot_cpu_ctx.scc_stack_buffer));
4550 					if ((info.btui_info & BTI_TRUNCATED) ||
4551 					    (ctx.sbc_flags & kThreadTruncatedBT)) {
4552 						user_ths_ss_flags |= kThreadTruncatedBT;
4553 						user_ths_ss_flags |= kThreadTruncUserAsyncBT;
4554 					}
4555 					user_ths_ss_flags |= ctx.sbc_flags;
4556 				}
4557 			}
4558 #endif /* __LP64__ */
4559 		}
4560 		if (user_ths_ss_flags != 0) {
4561 			cur_thread_snap->ths_ss_flags |= user_ths_ss_flags;
4562 		}
4563 	}
4564 
4565 #if STACKSHOT_COLLECTS_LATENCY_INFO
4566 	latency_info.user_stack_latency = mach_absolute_time()  - latency_info.user_stack_latency;
4567 	latency_info.kernel_stack_latency = mach_absolute_time();
4568 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4569 
4570 	/* Call through to the machine-specific trace routines.
4571 	 * Frames are added past the snapshot header.
4572 	 */
4573 	if (thread->kernel_stack != 0) {
4574 		uint32_t kern_ths_ss_flags = 0;
4575 #if defined(__LP64__)
4576 		uint32_t stack_kcdata_type = STACKSHOT_KCTYPE_KERN_STACKLR64;
4577 		extern int machine_trace_thread64(thread_t thread, char *tracepos,
4578 		    char *tracebound, int nframes, uint32_t *thread_trace_flags);
4579 		saved_count = machine_trace_thread64(
4580 #else
4581 		uint32_t stack_kcdata_type = STACKSHOT_KCTYPE_KERN_STACKLR;
4582 		extern int machine_trace_thread(thread_t thread, char *tracepos,
4583 		    char *tracebound, int nframes, uint32_t *thread_trace_flags);
4584 		saved_count = machine_trace_thread(
4585 #endif
4586 			thread, (char*) stackshot_cpu_ctx.scc_stack_buffer,
4587 			(char *) (stackshot_cpu_ctx.scc_stack_buffer + MAX_FRAMES), MAX_FRAMES,
4588 			&kern_ths_ss_flags);
4589 		if (saved_count > 0) {
4590 			int frame_size = sizeof(uintptr_t);
4591 #if defined(__LP64__)
4592 			cur_thread_snap->ths_ss_flags |= kKernel64_p;
4593 #endif
4594 #if CONFIG_EXCLAVES
4595 			if (thread->th_exclaves_state & TH_EXCLAVES_RPC) {
4596 				struct thread_exclaves_info info = { 0 };
4597 
4598 				info.tei_flags = kExclaveRPCActive;
4599 				if (thread->th_exclaves_state & TH_EXCLAVES_SCHEDULER_REQUEST) {
4600 					info.tei_flags |= kExclaveSchedulerRequest;
4601 				}
4602 				if (thread->th_exclaves_state & TH_EXCLAVES_UPCALL) {
4603 					info.tei_flags |= kExclaveUpcallActive;
4604 				}
4605 				info.tei_scid = thread->th_exclaves_ipc_ctx.scid;
4606 				info.tei_thread_offset = exclaves_stack_offset(stackshot_cpu_ctx.scc_stack_buffer, saved_count / frame_size, false);
4607 
4608 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_KERN_EXCLAVES_THREADINFO, sizeof(struct thread_exclaves_info), &info));
4609 			}
4610 #endif /* CONFIG_EXCLAVES */
4611 			kcd_exit_on_error(kcdata_push_array(kcd, stack_kcdata_type,
4612 			    frame_size, saved_count / frame_size, stackshot_cpu_ctx.scc_stack_buffer));
4613 		}
4614 		if (kern_ths_ss_flags & kThreadTruncatedBT) {
4615 			kern_ths_ss_flags |= kThreadTruncKernBT;
4616 		}
4617 		if (kern_ths_ss_flags != 0) {
4618 			cur_thread_snap->ths_ss_flags |= kern_ths_ss_flags;
4619 		}
4620 	}
4621 
4622 #if STACKSHOT_COLLECTS_LATENCY_INFO
4623 	latency_info.kernel_stack_latency = mach_absolute_time()  - latency_info.kernel_stack_latency;
4624 	latency_info.misc_latency = mach_absolute_time();
4625 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4626 
4627 #if CONFIG_THREAD_GROUPS
4628 	if (trace_flags & STACKSHOT_THREAD_GROUP) {
4629 		uint64_t thread_group_id = thread->thread_group ? thread_group_get_id(thread->thread_group) : 0;
4630 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_GROUP, sizeof(thread_group_id), &out_addr));
4631 		kdp_memcpy((void*)out_addr, &thread_group_id, sizeof(uint64_t));
4632 	}
4633 #endif /* CONFIG_THREAD_GROUPS */
4634 
4635 	if (collect_iostats) {
4636 		kcd_exit_on_error(kcdata_record_thread_iostats(kcd, thread));
4637 	}
4638 
4639 #if CONFIG_PERVASIVE_CPI
4640 	if (collect_instrs_cycles) {
4641 		struct recount_usage usage = { 0 };
4642 		recount_sum_unsafe(&recount_thread_plan, thread->th_recount.rth_lifetime,
4643 		    &usage);
4644 
4645 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_INSTRS_CYCLES, sizeof(struct instrs_cycles_snapshot), &out_addr));
4646 		struct instrs_cycles_snapshot *instrs_cycles = (struct instrs_cycles_snapshot *)out_addr;
4647 		instrs_cycles->ics_instructions = recount_usage_instructions(&usage);
4648 		instrs_cycles->ics_cycles = recount_usage_cycles(&usage);
4649 	}
4650 #endif /* CONFIG_PERVASIVE_CPI */
4651 
4652 #if STACKSHOT_COLLECTS_LATENCY_INFO
4653 	latency_info.misc_latency = mach_absolute_time() - latency_info.misc_latency;
4654 	if (collect_latency_info) {
4655 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_LATENCY_INFO_THREAD, sizeof(latency_info), &latency_info));
4656 	}
4657 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4658 
4659 error_exit:
4660 	return error;
4661 }
4662 
4663 static int
4664 kcdata_record_thread_delta_snapshot(struct thread_delta_snapshot_v3 * cur_thread_snap, thread_t thread, boolean_t thread_on_core)
4665 {
4666 	cur_thread_snap->tds_thread_id = thread_tid(thread);
4667 	if (IPC_VOUCHER_NULL != thread->ith_voucher) {
4668 		cur_thread_snap->tds_voucher_identifier  = VM_KERNEL_ADDRPERM(thread->ith_voucher);
4669 	} else {
4670 		cur_thread_snap->tds_voucher_identifier = 0;
4671 	}
4672 
4673 	cur_thread_snap->tds_ss_flags = 0;
4674 	if (thread->effective_policy.thep_darwinbg) {
4675 		cur_thread_snap->tds_ss_flags |= kThreadDarwinBG;
4676 	}
4677 	if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) {
4678 		cur_thread_snap->tds_ss_flags |= kThreadIOPassive;
4679 	}
4680 	if (thread->suspend_count > 0) {
4681 		cur_thread_snap->tds_ss_flags |= kThreadSuspended;
4682 	}
4683 	if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
4684 		cur_thread_snap->tds_ss_flags |= kGlobalForcedIdle;
4685 	}
4686 	if (thread_on_core) {
4687 		cur_thread_snap->tds_ss_flags |= kThreadOnCore;
4688 	}
4689 	if (stackshot_thread_is_idle_worker_unsafe(thread)) {
4690 		cur_thread_snap->tds_ss_flags |= kThreadIdleWorker;
4691 	}
4692 
4693 	cur_thread_snap->tds_last_made_runnable_time = thread->last_made_runnable_time;
4694 	cur_thread_snap->tds_state                   = thread->state;
4695 	cur_thread_snap->tds_sched_flags             = thread->sched_flags;
4696 	cur_thread_snap->tds_base_priority           = thread->base_pri;
4697 	cur_thread_snap->tds_sched_priority          = thread->sched_pri;
4698 	cur_thread_snap->tds_eqos                    = thread->effective_policy.thep_qos;
4699 	cur_thread_snap->tds_rqos                    = thread->requested_policy.thrp_qos;
4700 	cur_thread_snap->tds_rqos_override           = MAX(thread->requested_policy.thrp_qos_override,
4701 	    thread->requested_policy.thrp_qos_workq_override);
4702 	cur_thread_snap->tds_io_tier                 = (uint8_t) proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
4703 
4704 	static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t));
4705 	static_assert(sizeof(thread->requested_policy) == sizeof(uint64_t));
4706 	cur_thread_snap->tds_requested_policy = *(unaligned_u64 *) &thread->requested_policy;
4707 	cur_thread_snap->tds_effective_policy = *(unaligned_u64 *) &thread->effective_policy;
4708 
4709 	return 0;
4710 }
4711 
4712 /*
4713  * Why 12?  12 strikes a decent balance between allocating a large array on
4714  * the stack and having large kcdata item overheads for recording nonrunable
4715  * tasks.
4716  */
4717 #define UNIQUEIDSPERFLUSH 12
4718 
4719 struct saved_uniqueids {
4720 	uint64_t ids[UNIQUEIDSPERFLUSH];
4721 	unsigned count;
4722 };
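/*
 * With UNIQUEIDSPERFLUSH == 12, a saved_uniqueids batch is 12 * 8 == 96 bytes
 * of IDs plus the count, small enough to live on the stack while still
 * amortizing the kcdata item overhead across a dozen entries per flush.
 */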
4723 
4724 enum thread_classification {
4725 	tc_full_snapshot,  /* take a full snapshot */
4726 	tc_delta_snapshot, /* take a delta snapshot */
4727 };
4728 
4729 static enum thread_classification
4730 classify_thread(thread_t thread, boolean_t * thread_on_core_p, boolean_t collect_delta_stackshot)
4731 {
4732 	processor_t last_processor = thread->last_processor;
4733 
4734 	boolean_t thread_on_core = FALSE;
4735 	if (last_processor != PROCESSOR_NULL) {
4736 		/* Idle threads are always treated as on-core, since the processor state can change while they are running. */
4737 		thread_on_core = (thread == last_processor->idle_thread) ||
4738 		    (last_processor->state == PROCESSOR_RUNNING &&
4739 		    last_processor->active_thread == thread);
4740 	}
4741 
4742 	*thread_on_core_p = thread_on_core;
4743 
4744 	/* Capture the full thread snapshot if this is not a delta stackshot or if the thread has run subsequent to the
4745 	 * previous full stackshot */
4746 	if (!collect_delta_stackshot || thread_on_core || (thread->last_run_time > stackshot_args.since_timestamp)) {
4747 		return tc_full_snapshot;
4748 	} else {
4749 		return tc_delta_snapshot;
4750 	}
4751 }
4752 
4753 
4754 static kern_return_t
4755 kdp_stackshot_record_task(task_t task)
4756 {
4757 	boolean_t active_kthreads_only_p  = ((stackshot_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
4758 	boolean_t collect_delta_stackshot = ((stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
4759 	boolean_t save_owner_info         = ((stackshot_flags & STACKSHOT_THREAD_WAITINFO) != 0);
4760 	boolean_t include_drivers         = ((stackshot_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) != 0);
4761 
4762 	kern_return_t error = KERN_SUCCESS;
4763 	mach_vm_address_t out_addr = 0;
4764 	int saved_count = 0;
4765 
4766 	int task_pid                   = 0;
4767 	uint64_t task_uniqueid         = 0;
4768 	int num_delta_thread_snapshots = 0;
4769 	int num_waitinfo_threads       = 0;
4770 	int num_turnstileinfo_threads  = 0;
4771 
4772 	uint64_t task_start_abstime    = 0;
4773 	boolean_t have_map = FALSE, have_pmap = FALSE;
4774 	boolean_t some_thread_ran = FALSE;
4775 	unaligned_u64 task_snap_ss_flags = 0;
4776 #if STACKSHOT_COLLECTS_LATENCY_INFO
4777 	struct stackshot_latency_task latency_info;
4778 	latency_info.setup_latency = mach_absolute_time();
4779 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4780 
4781 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
4782 	uint64_t task_begin_cpu_cycle_count = 0;
4783 	if (!stackshot_ctx.sc_panic_stackshot) {
4784 		task_begin_cpu_cycle_count = mt_cur_cpu_cycles();
4785 	}
4786 #endif
4787 
4788 	if ((task == NULL) || !_stackshot_validate_kva((vm_offset_t)task, sizeof(struct task))) {
4789 		error = KERN_FAILURE;
4790 		goto error_exit;
4791 	}
4792 
4793 	void *bsd_info = get_bsdtask_info(task);
4794 	boolean_t task_in_teardown        = (bsd_info == NULL) || proc_in_teardown(bsd_info); // has P_LPEXIT set during proc_exit()
4795 	boolean_t task_in_transition      = task_in_teardown;         // here we can add other types of transition.
4796 	uint32_t  container_type          = (task_in_transition) ? STACKSHOT_KCCONTAINER_TRANSITIONING_TASK : STACKSHOT_KCCONTAINER_TASK;
4797 	uint32_t  transition_type         = (task_in_teardown) ? kTaskIsTerminated : 0;
4798 	/* Task just exec'd and this is the old task */
4799 	bool      task_is_exec_transit    = task_did_exec_internal(task) || task_is_exec_copy_internal(task);
4800 
4801 	if (task_in_transition) {
4802 		collect_delta_stackshot = FALSE;
4803 	}
4804 
4805 	have_map = (task->map != NULL) && (_stackshot_validate_kva((vm_offset_t)(task->map), sizeof(struct _vm_map)));
4806 	have_pmap = have_map && (task->map->pmap != NULL) && (_stackshot_validate_kva((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
4807 
4808 	task_pid = pid_from_task(task);
4809 	/* Is returning -1 ok for a terminating task? */
4810 	task_uniqueid = get_task_uniqueid(task);
4811 
4812 	if (!task->active || task_is_a_corpse(task) || task_is_a_corpse_fork(task)) {
4813 		/*
4814 		 * Not interested in terminated tasks without threads.
4815 		 */
4816 		if (queue_empty(&task->threads) || task_pid == -1) {
4817 			return KERN_SUCCESS;
4818 		}
4819 	}
4820 
4821 	/* All PIDs should have the MSB unset */
4822 	assert((task_pid & (1ULL << 31)) == 0);
4823 
4824 #if STACKSHOT_COLLECTS_LATENCY_INFO
4825 	latency_info.setup_latency = mach_absolute_time() - latency_info.setup_latency;
4826 	latency_info.task_uniqueid = task_uniqueid;
4827 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4828 
4829 	/* Trace everything, unless a process was specified. Add in driver tasks if requested. */
4830 	if ((stackshot_args.pid == -1) ||
4831 	    ((stackshot_args.pid == task_pid) && !task_is_exec_transit) ||
4832 	    (include_drivers && task_is_driver(task))) {
4833 #if STACKSHOT_COLLECTS_LATENCY_INFO
4834 		stackshot_cpu_latency.tasks_processed++;
4835 #endif
4836 
4837 		/* add task snapshot marker */
4838 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
4839 		    container_type, task_uniqueid));
4840 
4841 		if (collect_delta_stackshot) {
4842 			/*
4843 			 * For delta stackshots we need to know if a thread from this task has run since the
4844 			 * previous timestamp to decide whether we're going to record a full snapshot and UUID info.
4845 			 */
4846 			thread_t thread = THREAD_NULL;
4847 			queue_iterate(&task->threads, thread, thread_t, task_threads)
4848 			{
4849 				if ((thread == NULL) || !_stackshot_validate_kva((vm_offset_t)thread, sizeof(struct thread))) {
4850 					error = KERN_FAILURE;
4851 					goto error_exit;
4852 				}
4853 
4854 				if (active_kthreads_only_p && thread->kernel_stack == 0) {
4855 					continue;
4856 				}
4857 
4858 				boolean_t thread_on_core;
4859 				enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, collect_delta_stackshot);
4860 
4861 				switch (thread_classification) {
4862 				case tc_full_snapshot:
4863 					some_thread_ran = TRUE;
4864 					break;
4865 				case tc_delta_snapshot:
4866 					num_delta_thread_snapshots++;
4867 					break;
4868 				}
4869 			}
4870 		}
4871 
4872 		if (collect_delta_stackshot) {
4873 			proc_starttime_kdp(get_bsdtask_info(task), NULL, NULL, &task_start_abstime);
4874 		}
4875 
4876 		/* Next record any relevant UUID info and store the task snapshot */
4877 		if (task_in_transition ||
4878 		    !collect_delta_stackshot ||
4879 		    (task_start_abstime == 0) ||
4880 		    (task_start_abstime > stackshot_args.since_timestamp) ||
4881 		    some_thread_ran) {
4882 			/*
4883 			 * Collect full task information in these scenarios:
4884 			 *
4885 			 * 1) a full stackshot or the task is in transition
4886 			 * 2) a delta stackshot where the task started after the previous full stackshot
4887 			 * 3) a delta stackshot where any thread from the task has run since the previous full stackshot
4888 			 *
4889 			 * because the task may have exec'ed, changing its name, architecture, load info, etc
4890 			 */
4891 
4892 			kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, &task_snap_ss_flags));
4893 			kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, stackshot_flags, have_pmap, &task_snap_ss_flags));
4894 			kcd_exit_on_error(kcdata_record_task_exec_meta(stackshot_kcdata_p, task));
4895 #if STACKSHOT_COLLECTS_LATENCY_INFO
4896 			if (!task_in_transition) {
4897 				kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags, &latency_info));
4898 			} else {
4899 				kcd_exit_on_error(kcdata_record_transitioning_task_snapshot(stackshot_kcdata_p, task, task_snap_ss_flags, transition_type));
4900 			}
4901 #else
4902 			if (!task_in_transition) {
4903 				kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags));
4904 			} else {
4905 				kcd_exit_on_error(kcdata_record_transitioning_task_snapshot(stackshot_kcdata_p, task, task_snap_ss_flags, transition_type));
4906 			}
4907 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4908 		} else {
4909 			kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags));
4910 		}
4911 
4912 #if STACKSHOT_COLLECTS_LATENCY_INFO
4913 		latency_info.misc_latency = mach_absolute_time();
4914 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4915 
4916 		struct thread_delta_snapshot_v3 * delta_snapshots = NULL;
4917 		int current_delta_snapshot_index                  = 0;
4918 		if (num_delta_thread_snapshots > 0) {
4919 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT,
4920 			    sizeof(struct thread_delta_snapshot_v3),
4921 			    num_delta_thread_snapshots, &out_addr));
4922 			delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr;
4923 		}
4924 
4925 
4926 #if STACKSHOT_COLLECTS_LATENCY_INFO
4927 		latency_info.task_thread_count_loop_latency = mach_absolute_time();
4928 #endif
4929 		/*
4930 		 * Iterate over the task threads to save thread snapshots and determine
4931 		 * how much space we need for waitinfo and turnstile info
4932 		 */
4933 		thread_t thread = THREAD_NULL;
4934 		queue_iterate(&task->threads, thread, thread_t, task_threads)
4935 		{
4936 			if ((thread == NULL) || !_stackshot_validate_kva((vm_offset_t)thread, sizeof(struct thread))) {
4937 				error = KERN_FAILURE;
4938 				goto error_exit;
4939 			}
4940 
4941 			uint64_t thread_uniqueid;
4942 			if (active_kthreads_only_p && thread->kernel_stack == 0) {
4943 				continue;
4944 			}
4945 			thread_uniqueid = thread_tid(thread);
4946 
4947 			boolean_t thread_on_core;
4948 			enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, collect_delta_stackshot);
4949 
4950 #if STACKSHOT_COLLECTS_LATENCY_INFO
4951 			stackshot_cpu_latency.threads_processed++;
4952 #endif
4953 
4954 			switch (thread_classification) {
4955 			case tc_full_snapshot:
4956 				/* add thread marker */
4957 				kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
4958 				    STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
4959 
4960 				/* thread snapshot can be large, including strings, avoid overflowing the stack. */
4961 				kcdata_compression_window_open(stackshot_kcdata_p);
4962 
4963 				kcd_exit_on_error(kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, stackshot_flags, have_pmap, thread_on_core));
4964 
4965 				kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
4966 
4967 				/* mark end of thread snapshot data */
4968 				kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
4969 				    STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
4970 				break;
4971 			case tc_delta_snapshot:
4972 				kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++], thread, thread_on_core));
4973 				break;
4974 			}
4975 
4976 			/*
4977 			 * We want to report owner information regardless of whether a thread
4978 			 * has changed since the last delta, whether it's a normal stackshot,
4979 			 * or whether it's nonrunnable
4980 			 */
4981 			if (save_owner_info) {
4982 				if (stackshot_thread_has_valid_waitinfo(thread)) {
4983 					num_waitinfo_threads++;
4984 				}
4985 
4986 				if (stackshot_thread_has_valid_turnstileinfo(thread)) {
4987 					num_turnstileinfo_threads++;
4988 				}
4989 			}
4990 		}
4991 #if STACKSHOT_COLLECTS_LATENCY_INFO
4992 		latency_info.task_thread_count_loop_latency = mach_absolute_time() - latency_info.task_thread_count_loop_latency;
4993 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4994 
4995 		thread_waitinfo_v2_t *thread_waitinfo           = NULL;
4996 		thread_turnstileinfo_v2_t *thread_turnstileinfo = NULL;
4997 		int current_waitinfo_index              = 0;
4998 		int current_turnstileinfo_index         = 0;
4999 		/* allocate space for the wait and turnstile info */
5000 		if (num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) {
5001 			/* thread waitinfo and turnstileinfo can be quite large, avoid overflowing the stack */
5002 			kcdata_compression_window_open(stackshot_kcdata_p);
5003 
5004 			if (num_waitinfo_threads > 0) {
5005 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO,
5006 				    sizeof(thread_waitinfo_v2_t), num_waitinfo_threads, &out_addr));
5007 				thread_waitinfo = (thread_waitinfo_v2_t *)out_addr;
5008 			}
5009 
5010 			if (num_turnstileinfo_threads > 0) {
5011 				/* get space for the turnstile info */
5012 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO,
5013 				    sizeof(thread_turnstileinfo_v2_t), num_turnstileinfo_threads, &out_addr));
5014 				thread_turnstileinfo = (thread_turnstileinfo_v2_t *)out_addr;
5015 			}
5016 
5017 			stackshot_plh_resetgen();  // so we know which portlabel_ids are referenced
5018 		}
5019 
5020 #if STACKSHOT_COLLECTS_LATENCY_INFO
5021 		latency_info.misc_latency = mach_absolute_time() - latency_info.misc_latency;
5022 		latency_info.task_thread_data_loop_latency = mach_absolute_time();
5023 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5024 
5025 		/* Iterate over the task's threads to save the wait and turnstile info */
5026 		queue_iterate(&task->threads, thread, thread_t, task_threads)
5027 		{
5028 			uint64_t thread_uniqueid;
5029 			#pragma unused(thread_uniqueid)
5030 
5031 			if (active_kthreads_only_p && thread->kernel_stack == 0) {
5032 				continue;
5033 			}
5034 
5035 			thread_uniqueid = thread_tid(thread);
5036 
5037 			/* If we want owner info, we should capture it regardless of its classification */
5038 			if (save_owner_info) {
5039 				if (stackshot_thread_has_valid_waitinfo(thread)) {
5040 					stackshot_thread_wait_owner_info(
5041 						thread,
5042 						&thread_waitinfo[current_waitinfo_index++]);
5043 				}
5044 
5045 				if (stackshot_thread_has_valid_turnstileinfo(thread)) {
5046 					stackshot_thread_turnstileinfo(
5047 						thread,
5048 						&thread_turnstileinfo[current_turnstileinfo_index++]);
5049 				}
5050 			}
5051 		}
5052 
5053 #if STACKSHOT_COLLECTS_LATENCY_INFO
5054 		latency_info.task_thread_data_loop_latency = mach_absolute_time() - latency_info.task_thread_data_loop_latency;
5055 		latency_info.misc2_latency = mach_absolute_time();
5056 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5057 
5058 #if DEBUG || DEVELOPMENT
5059 		if (current_delta_snapshot_index != num_delta_thread_snapshots) {
5060 			panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
5061 			    num_delta_thread_snapshots, current_delta_snapshot_index);
5062 		}
5063 		if (current_waitinfo_index != num_waitinfo_threads) {
5064 			panic("thread wait info count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
5065 			    num_waitinfo_threads, current_waitinfo_index);
5066 		}
5067 #endif
5068 
5069 		if (num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) {
5070 			kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5071 			// now, record the portlabel hashes.
5072 			kcd_exit_on_error(kdp_stackshot_plh_record());
5073 		}
5074 
5075 #if IMPORTANCE_INHERITANCE
5076 		/* Ensure the buffer is big enough, since we're using the stack buffer for this. */
5077 		static_assert(TASK_IMP_WALK_LIMIT * sizeof(int32_t) <= MAX_FRAMES * sizeof(uintptr_t));
5078 		saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS,
5079 		    (char*) stackshot_cpu_ctx.scc_stack_buffer, TASK_IMP_WALK_LIMIT);
5080 		if (saved_count > 0) {
5081 			/* Variable size array - better not have it on the stack. */
5082 			kcdata_compression_window_open(stackshot_kcdata_p);
5083 			kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS,
5084 			    sizeof(int32_t), saved_count, stackshot_cpu_ctx.scc_stack_buffer));
5085 			kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5086 		}
5087 #endif
5088 
5089 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5090 		if (!stackshot_ctx.sc_panic_stackshot) {
5091 			kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - task_begin_cpu_cycle_count),
5092 			    "task_cpu_cycle_count"));
5093 		}
5094 #endif
5095 
5096 #if STACKSHOT_COLLECTS_LATENCY_INFO
5097 		latency_info.misc2_latency = mach_absolute_time() - latency_info.misc2_latency;
5098 		if (collect_latency_info) {
5099 			kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_LATENCY_INFO_TASK, sizeof(latency_info), &latency_info));
5100 		}
5101 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5102 
5103 		/* mark end of task snapshot data */
5104 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, container_type,
5105 		    task_uniqueid));
5106 	}
5107 
5108 
5109 error_exit:
5110 	return error;
5111 }
5112 
5113 /* Record global shared regions */
5114 static kern_return_t
5115 kdp_stackshot_shared_regions(uint64_t trace_flags)
5116 {
5117 	kern_return_t error        = KERN_SUCCESS;
5118 
5119 	boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
5120 	extern queue_head_t vm_shared_region_queue;
5121 	vm_shared_region_t sr;
5122 
5124 	queue_iterate(&vm_shared_region_queue,
5125 	    sr,
5126 	    vm_shared_region_t,
5127 	    sr_q) {
5128 		struct dyld_shared_cache_loadinfo_v2 scinfo = {0};
5129 		if (!_stackshot_validate_kva((vm_offset_t)sr, sizeof(*sr))) {
5130 			break;
5131 		}
5132 		if (collect_delta_stackshot && sr->sr_install_time < stackshot_args.since_timestamp) {
5133 			continue; // only include new shared caches in delta stackshots
5134 		}
5135 		uint32_t sharedCacheFlags = ((sr == primary_system_shared_region) ? kSharedCacheSystemPrimary : 0) |
5136 		    (sr->sr_driverkit ? kSharedCacheDriverkit : 0);
5137 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
5138 		    STACKSHOT_KCCONTAINER_SHAREDCACHE, sr->sr_id));
5139 		kdp_memcpy(scinfo.sharedCacheUUID, sr->sr_uuid, sizeof(sr->sr_uuid));
5140 		scinfo.sharedCacheSlide = sr->sr_slide;
5141 		scinfo.sharedCacheUnreliableSlidBaseAddress = sr->sr_base_address + sr->sr_first_mapping;
5142 		scinfo.sharedCacheSlidFirstMapping = sr->sr_base_address + sr->sr_first_mapping;
5143 		scinfo.sharedCacheID = sr->sr_id;
5144 		scinfo.sharedCacheFlags = sharedCacheFlags;
5145 
5146 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_INFO,
5147 		    sizeof(scinfo), &scinfo));
5148 
5149 		if ((trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) && sr->sr_images != NULL &&
5150 		    _stackshot_validate_kva((vm_offset_t)sr->sr_images, sr->sr_images_count * sizeof(struct dyld_uuid_info_64))) {
5151 			assert(sr->sr_images_count != 0);
5152 			kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), sr->sr_images_count, sr->sr_images));
5153 		}
5154 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
5155 		    STACKSHOT_KCCONTAINER_SHAREDCACHE, sr->sr_id));
5156 	}
5157 
5158 	/*
5159 	 * For backwards compatibility; this will eventually be removed.
5160 	 * Another copy of the Primary System Shared Region, for older readers.
5161 	 */
5162 	sr = primary_system_shared_region;
5163 	/* record system level shared cache load info (if available) */
5164 	if (!collect_delta_stackshot && sr &&
5165 	    _stackshot_validate_kva((vm_offset_t)sr, sizeof(struct vm_shared_region))) {
5166 		struct dyld_shared_cache_loadinfo scinfo = {0};
5167 
5168 		/*
5169 		 * Historically, this data was in a dyld_uuid_info_64 structure, but the
5170 		 * naming of both the structure and fields for this use isn't great.  The
5171 		 * dyld_shared_cache_loadinfo structure has better names, but the same
5172 		 * layout and content as the original.
5173 		 *
5174 		 * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
5175 		 * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
5176 		 * entries; here, it's the slid base address, and we leave it that way
5177 		 * for backwards compatibility.
5178 		 */
5179 		kdp_memcpy(scinfo.sharedCacheUUID, &sr->sr_uuid, sizeof(sr->sr_uuid));
5180 		scinfo.sharedCacheSlide = sr->sr_slide;
5181 		scinfo.sharedCacheUnreliableSlidBaseAddress = sr->sr_slide + sr->sr_base_address;
5182 		scinfo.sharedCacheSlidFirstMapping = sr->sr_base_address + sr->sr_first_mapping;
5183 
5184 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO,
5185 		    sizeof(scinfo), &scinfo));
5186 
5187 		if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) {
5188 			/*
5189 			 * Include a map of the system shared cache layout if it has been populated
5190 			 * (which is only when the system is using a custom shared cache).
5191 			 */
5192 			if (sr->sr_images && _stackshot_validate_kva((vm_offset_t)sr->sr_images,
5193 			    (sr->sr_images_count * sizeof(struct dyld_uuid_info_64)))) {
5194 				assert(sr->sr_images_count != 0);
5195 				kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), sr->sr_images_count, sr->sr_images));
5196 			}
5197 		}
5198 	}
5199 
5200 error_exit:
5201 	return error;
5202 }
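/*
 * kdp_stackshot_shared_regions() above emits each shared region as a
 * STACKSHOT_KCCONTAINER_SHAREDCACHE container holding a
 * dyld_shared_cache_loadinfo_v2 record (plus, optionally, the image layout
 * array), and then emits the primary system shared region once more in the
 * legacy dyld_shared_cache_loadinfo format for older readers.
 */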
5203 
5204 static kern_return_t
5205 kdp_stackshot_kcdata_format(void)
5206 {
5207 	kern_return_t error        = KERN_SUCCESS;
5208 	mach_vm_address_t out_addr = 0;
5209 	uint64_t abs_time = 0;
5210 	uint64_t system_state_flags = 0;
5211 	task_t task = TASK_NULL;
5212 	mach_timebase_info_data_t timebase = {0, 0};
5213 	uint32_t length_to_copy = 0, tmp32 = 0;
5214 	abs_time = mach_absolute_time();
5215 	uint64_t last_task_start_time = 0;
5216 	int cur_workitem_index = 0;
5217 	uint64_t tasks_in_stackshot = 0;
5218 	uint64_t threads_in_stackshot = 0;
5219 
5220 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5221 	uint64_t stackshot_begin_cpu_cycle_count = 0;
5222 
5223 	if (!stackshot_ctx.sc_panic_stackshot) {
5224 		stackshot_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5225 	}
5226 #endif
5227 
5228 	/* the CPU entering here is participating in the stackshot */
5229 	stackshot_cpu_ctx.scc_did_work = true;
5230 
5231 #if STACKSHOT_COLLECTS_LATENCY_INFO
5232 	collect_latency_info = stackshot_flags & STACKSHOT_DISABLE_LATENCY_INFO ? false : true;
5233 #endif
5234 	/* process the flags */
5235 	bool collect_delta_stackshot = ((stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
5236 	bool collect_exclaves        = !disable_exclave_stackshot && ((stackshot_flags & STACKSHOT_SKIP_EXCLAVES) == 0);
5237 	stackshot_ctx.sc_enable_faulting = (stackshot_flags & (STACKSHOT_ENABLE_BT_FAULTING));
5238 
5239 	/* Currently we only support returning explicit KEXT load info on fileset kernels */
5240 	kc_format_t primary_kc_type = KCFormatUnknown;
5241 	if (PE_get_primary_kc_format(&primary_kc_type) && (primary_kc_type != KCFormatFileset)) {
5242 		stackshot_flags &= ~(STACKSHOT_SAVE_KEXT_LOADINFO);
5243 	}
5244 
5245 	if (sizeof(void *) == 8) {
5246 		system_state_flags |= kKernel64_p;
5247 	}
5248 
5249 #if CONFIG_EXCLAVES
5250 	if (!stackshot_ctx.sc_panic_stackshot && collect_exclaves) {
5251 		kcd_exit_on_error(stackshot_setup_exclave_waitlist()); /* Allocate list of exclave threads */
5252 	}
5253 #else
5254 #pragma unused(collect_exclaves)
5255 #endif /* CONFIG_EXCLAVES */
5256 
5257 	/* set up mach_absolute_time and timebase info -- copied out in some cases, and needed to convert since_timestamp to seconds for proc start time */
5258 	clock_timebase_info(&timebase);
5259 
5260 	/* begin saving data into the buffer */
5261 	if (stackshot_ctx.sc_bytes_uncompressed) {
5262 		stackshot_ctx.sc_bytes_uncompressed = 0;
5263 	}
5264 
5265 	/*
5266 	 * Setup pre-task linked kcdata buffer.
5267 	 * The idea here is that we want the kcdata to be in (roughly) the same order as it was
5268 	 * before we made this multithreaded, so we have separate buffers for pre and post task-iteration,
5269 	 * since that's the parallelized part.
5270 	 */
5271 	if (!stackshot_ctx.sc_is_singlethreaded) {
5272 		kcd_exit_on_error(stackshot_new_linked_kcdata());
5273 		stackshot_ctx.sc_pretask_kcdata = stackshot_cpu_ctx.scc_kcdata_head;
5274 	}
5275 
5276 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, stackshot_flags, "stackshot_in_flags"));
5277 	kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, (uint32_t)stackshot_args.pid, "stackshot_in_pid"));
5278 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, system_state_flags, "system_state_flags"));
5279 	if (stackshot_flags & STACKSHOT_PAGE_TABLES) {
5280 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_args.pagetable_mask, "stackshot_pagetable_mask"));
5281 	}
5282 	if (stackshot_initial_estimate != 0) {
5283 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate, "stackshot_size_estimate"));
5284 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate_adj, "stackshot_size_estimate_adj"));
5285 	}
5286 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, stackshot_available_task_exec_flags(), "stackshot_te_flags_mask"));
5287 
5288 
5289 #if STACKSHOT_COLLECTS_LATENCY_INFO
5290 	stackshot_ctx.sc_latency.setup_latency_mt = mach_absolute_time();
5291 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5292 
5293 #if CONFIG_JETSAM
5294 	tmp32 = memorystatus_get_pressure_status_kdp();
5295 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_LEVEL, sizeof(uint32_t), &tmp32));
5296 #endif
5297 
5298 	if (!collect_delta_stackshot) {
5299 		tmp32 = THREAD_POLICY_INTERNAL_STRUCT_VERSION;
5300 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_POLICY_VERSION, sizeof(uint32_t), &tmp32));
5301 
5302 		tmp32 = PAGE_SIZE;
5303 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_KERN_PAGE_SIZE, sizeof(uint32_t), &tmp32));
5304 
5305 		/* save boot-args and osversion string */
5306 		length_to_copy =  MIN((uint32_t)(strlen(version) + 1), OSVERSIZE);
5307 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, (const void *)version));
5308 		length_to_copy = MIN((uint32_t)(strlen(osversion) + 1), OSVERSIZE);
5309 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_OS_BUILD_VERSION, length_to_copy, (void *)osversion));
5310 
5311 
5312 		length_to_copy =  MIN((uint32_t)(strlen(PE_boot_args()) + 1), BOOT_LINE_LENGTH);
5313 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, PE_boot_args()));
5314 
5315 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &timebase));
5316 	} else {
5317 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP, sizeof(uint64_t), &stackshot_args.since_timestamp));
5318 	}
5319 
5320 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &abs_time));
5321 
5322 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &stackshot_ctx.sc_microsecs));
5323 
5324 	kcd_exit_on_error(kdp_stackshot_shared_regions(stackshot_flags));
5325 
5326 	/* Add requested information first */
5327 	if (stackshot_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) {
5328 		struct mem_and_io_snapshot mais = {0};
5329 		kdp_mem_and_io_snapshot(&mais);
5330 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS, sizeof(mais), &mais));
5331 	}
5332 
5333 
5334 #if CONFIG_THREAD_GROUPS
5335 	struct thread_group_snapshot_v3 *thread_groups = NULL;
5336 	int num_thread_groups = 0;
5337 
5338 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5339 	uint64_t thread_group_begin_cpu_cycle_count = 0;
5340 
5341 	if (!stackshot_ctx.sc_panic_stackshot && (stackshot_flags & STACKSHOT_THREAD_GROUP)) {
5342 		thread_group_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5343 	}
5344 #endif
5345 
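	/*
	 * Note: thread-group data (and, below, jetsam-coalition data) is captured
	 * with a two-pass pattern -- count the elements first, then reserve a
	 * kcdata array of exactly that size inside a compression window and fill
	 * it on a second iteration.
	 */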
5346 	/* Iterate over thread group names */
5347 	if (stackshot_flags & STACKSHOT_THREAD_GROUP) {
5348 		/* Variable size array - better not have it on the stack. */
5349 		kcdata_compression_window_open(stackshot_kcdata_p);
5350 
5351 		if (thread_group_iterate_stackshot(stackshot_thread_group_count, &num_thread_groups) != KERN_SUCCESS) {
5352 			stackshot_flags &= ~(STACKSHOT_THREAD_GROUP);
5353 		}
5354 
5355 		if (num_thread_groups > 0) {
5356 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT, sizeof(struct thread_group_snapshot_v3), num_thread_groups, &out_addr));
5357 			thread_groups = (struct thread_group_snapshot_v3 *)out_addr;
5358 		}
5359 
5360 		if (thread_group_iterate_stackshot(stackshot_thread_group_snapshot, thread_groups) != KERN_SUCCESS) {
5361 			error = KERN_FAILURE;
5362 			goto error_exit;
5363 		}
5364 
5365 		kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5366 	}
5367 
5368 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5369 	if (!stackshot_ctx.sc_panic_stackshot && (thread_group_begin_cpu_cycle_count != 0)) {
5370 		kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - thread_group_begin_cpu_cycle_count),
5371 		    "thread_groups_cpu_cycle_count"));
5372 	}
5373 #endif
5374 #else
5375 	stackshot_flags &= ~(STACKSHOT_THREAD_GROUP);
5376 #endif /* CONFIG_THREAD_GROUPS */
5377 
5378 
5379 #if STACKSHOT_COLLECTS_LATENCY_INFO
5380 	stackshot_ctx.sc_latency.setup_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.setup_latency_mt;
5381 	if (stackshot_ctx.sc_is_singlethreaded) {
5382 		stackshot_ctx.sc_latency.total_task_iteration_latency_mt = mach_absolute_time();
5383 	} else {
5384 		stackshot_ctx.sc_latency.task_queue_building_latency_mt = mach_absolute_time();
5385 	}
5386 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5387 
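	/*
	 * A stackshot is "process scoped" when it targets a single pid and the
	 * caller did not also ask for kernel driver threads; in that case the
	 * task iteration below stops as soon as the matching task has been
	 * recorded or queued.
	 */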
5388 	bool const process_scoped = (stackshot_args.pid != -1) &&
5389 	    ((stackshot_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0);
5390 
5391 	/* Iterate over tasks */
5392 	queue_iterate(&tasks, task, task_t, tasks)
5393 	{
5394 		stackshot_panic_guard();
5395 
5396 		if (collect_delta_stackshot) {
5397 			uint64_t abstime;
5398 			proc_starttime_kdp(get_bsdtask_info(task), NULL, NULL, &abstime);
5399 
5400 			if (abstime > last_task_start_time) {
5401 				last_task_start_time = abstime;
5402 			}
5403 		}
5404 
5405 		pid_t task_pid = pid_from_task(task);
5406 
5407 		if (process_scoped && (task_pid != stackshot_args.pid)) {
5408 			continue;
5409 		}
5410 
5411 		if ((task->active && !task_is_a_corpse(task) && !task_is_a_corpse_fork(task)) ||
5412 		    (!queue_empty(&task->threads) && task_pid != -1)) {
5413 			tasks_in_stackshot++;
5414 			threads_in_stackshot += task->thread_count;
5415 		}
5416 
5417 		/* If this is a singlethreaded stackshot, don't use the work queues. */
5418 		if (stackshot_ctx.sc_is_singlethreaded) {
5419 			kcd_exit_on_error(kdp_stackshot_record_task(task));
5420 		} else {
5421 			kcd_exit_on_error(stackshot_put_workitem((struct stackshot_workitem) {
5422 				.sswi_task = task,
5423 				.sswi_data = NULL,
5424 				.sswi_idx = cur_workitem_index++
5425 			}));
5426 		}
5427 
5428 		if (process_scoped) {
5429 			/* Only targeting one process, we're done now. */
5430 			break;
5431 		}
5432 	}
5433 
5434 #if STACKSHOT_COLLECTS_LATENCY_INFO
5435 	if (stackshot_ctx.sc_is_singlethreaded) {
5436 		stackshot_ctx.sc_latency.total_task_iteration_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.total_task_iteration_latency_mt;
5437 	} else {
5438 		stackshot_ctx.sc_latency.task_queue_building_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.task_queue_building_latency_mt;
5439 	}
5440 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5441 
5442 	/* Setup post-task kcdata buffer */
5443 	if (!stackshot_ctx.sc_is_singlethreaded) {
5444 		stackshot_finalize_linked_kcdata();
5445 		kcd_exit_on_error(stackshot_new_linked_kcdata());
5446 		stackshot_ctx.sc_posttask_kcdata = stackshot_cpu_ctx.scc_kcdata_head;
5447 	}
5448 
5449 #if CONFIG_COALITIONS
5450 	/* Don't collect jetsam coalition snapshots in delta stackshots - these don't change */
5451 	if (!collect_delta_stackshot || (last_task_start_time > stackshot_args.since_timestamp)) {
5452 		int num_coalitions = 0;
5453 		struct jetsam_coalition_snapshot *coalitions = NULL;
5454 
5455 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5456 		uint64_t coalition_begin_cpu_cycle_count = 0;
5457 
5458 		if (!stackshot_ctx.sc_panic_stackshot && (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) {
5459 			coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5460 		}
5461 #endif /* SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI */
5462 
5463 		/* Iterate over coalitions */
5464 		if (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
5465 			if (coalition_iterate_stackshot(stackshot_coalition_jetsam_count, &num_coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) {
5466 				stackshot_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
5467 			}
5468 		}
5469 		if (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
5470 			if (num_coalitions > 0) {
5471 				/* Variable size array - better not have it on the stack. */
5472 				kcdata_compression_window_open(stackshot_kcdata_p);
5473 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT, sizeof(struct jetsam_coalition_snapshot), num_coalitions, &out_addr));
5474 				coalitions = (struct jetsam_coalition_snapshot*)out_addr;
5475 
5476 				if (coalition_iterate_stackshot(stackshot_coalition_jetsam_snapshot, coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) {
5477 					error = KERN_FAILURE;
5478 					goto error_exit;
5479 				}
5480 
5481 				kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5482 			}
5483 		}
5484 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5485 		if (!stackshot_ctx.sc_panic_stackshot && (coalition_begin_cpu_cycle_count != 0)) {
5486 			kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count),
5487 			    "coalitions_cpu_cycle_count"));
5488 		}
5489 #endif /* SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI */
5490 	}
5491 #else
5492 	stackshot_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
5493 #endif /* CONFIG_COALITIONS */
5494 
5495 	stackshot_panic_guard();
5496 
5497 #if STACKSHOT_COLLECTS_LATENCY_INFO
5498 	if (stackshot_ctx.sc_is_singlethreaded) {
5499 		stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt = mach_absolute_time();
5500 	} else {
5501 		stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt = mach_absolute_time();
5502 	}
5503 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5504 
5505 	/*
5506 	 * Iterate over the tasks in the terminated tasks list. We only inspect
5507 	 * tasks that have a valid bsd_info pointer. Task transitions (e.g. past
5508 	 * P_LPEXIT during proc_exit()) are now checked inside
5509 	 * kdp_stackshot_record_task(), and a safer, minimal
5510 	 * transitioning_task_snapshot struct is collected via
5511 	 * kcdata_record_transitioning_task_snapshot().
5512 	 */
5513 	queue_iterate(&terminated_tasks, task, task_t, tasks)
5514 	{
5515 		stackshot_panic_guard();
5516 
5517 		if ((task->active && !task_is_a_corpse(task) && !task_is_a_corpse_fork(task)) ||
5518 		    (!queue_empty(&task->threads) && pid_from_task(task) != -1)) {
5519 			tasks_in_stackshot++;
5520 			threads_in_stackshot += task->thread_count;
5521 		}
5522 
5523 		/* Only use workqueues on non-panic and non-scoped stackshots. */
5524 		if (stackshot_ctx.sc_is_singlethreaded) {
5525 			kcd_exit_on_error(kdp_stackshot_record_task(task));
5526 		} else {
5527 			kcd_exit_on_error(stackshot_put_workitem((struct stackshot_workitem) {
5528 				.sswi_task = task,
5529 				.sswi_data = NULL,
5530 				.sswi_idx = cur_workitem_index++
5531 			}));
5532 		}
5533 	}
5534 
5535 	/* Mark the queue(s) as populated. */
5536 	for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
5537 		os_atomic_store(&stackshot_ctx.sc_workqueues[i].sswq_populated, true, release);
5538 	}
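	/*
	 * The release store above pairs with the acquire load of sswq_populated
	 * in stackshot_cpu_work_on_queue(): once a worker CPU observes the flag,
	 * every work item queued above is visible to it.
	 */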
5539 
5540 #if DEVELOPMENT || DEBUG
5541 	kcd_exit_on_error(kdp_stackshot_plh_stats());
5542 #endif /* DEVELOPMENT || DEBUG */
5543 
5544 #if STACKSHOT_COLLECTS_LATENCY_INFO
5545 	if (stackshot_ctx.sc_is_singlethreaded) {
5546 		stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt;
5547 	} else {
5548 		stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt;
5549 	}
5550 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5551 
5552 #if STACKSHOT_COLLECTS_LATENCY_INFO
5553 	if (collect_latency_info) {
5554 		stackshot_ctx.sc_latency.latency_version = 2;
5555 		stackshot_ctx.sc_latency.main_cpu_number = stackshot_ctx.sc_main_cpuid;
5556 		stackshot_ctx.sc_latency.calling_cpu_number = stackshot_ctx.sc_calling_cpuid;
5557 	}
5558 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5559 
5560 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5561 	if (!stackshot_ctx.sc_panic_stackshot) {
5562 		kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - stackshot_begin_cpu_cycle_count),
5563 		    "stackshot_total_cpu_cycle_cnt"));
5564 	}
5565 #endif
5566 
5567 	kcdata_add_uint64_with_description(stackshot_kcdata_p, tasks_in_stackshot, "stackshot_tasks_count");
5568 	kcdata_add_uint64_with_description(stackshot_kcdata_p, threads_in_stackshot, "stackshot_threads_count");
5569 
5570 	stackshot_panic_guard();
5571 
5572 	if (!stackshot_ctx.sc_is_singlethreaded) {
5573 		/* Chip away at the queue. */
5574 		stackshot_finalize_linked_kcdata();
5575 		stackshot_cpu_do_work();
5576 		*stackshot_kcdata_p = stackshot_cpu_ctx.scc_kcdata_tail->kcdata;
5577 	}
5578 
5579 #if CONFIG_EXCLAVES
5580 	/* If this is the panic stackshot, check if Exclaves panic left its stackshot in the shared region */
5581 	if (stackshot_ctx.sc_panic_stackshot) {
5582 		struct exclaves_panic_stackshot excl_ss;
5583 		kdp_read_panic_exclaves_stackshot(&excl_ss);
5584 
5585 		if (excl_ss.stackshot_buffer != NULL && excl_ss.stackshot_buffer_size != 0) {
5586 			tb_error_t tberr = TB_ERROR_SUCCESS;
5587 			exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_FOUND;
5588 
5589 			/* this block does not escape, so this is okay... */
5590 			kern_return_t *error_in_block = &error;
5591 			kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
5592 			    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
5593 			tberr = stackshot_stackshotresult__unmarshal(excl_ss.stackshot_buffer, excl_ss.stackshot_buffer_size, ^(stackshot_stackshotresult_s result){
5594 				*error_in_block = stackshot_exclaves_process_stackshot(&result, stackshot_kcdata_p, false);
5595 			});
5596 			kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
5597 			    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
5598 			if (tberr != TB_ERROR_SUCCESS) {
5599 				exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_DECODE_FAILED;
5600 			}
5601 		} else {
5602 			exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_NOT_FOUND;
5603 		}
5604 
5605 		/* check error from the block */
5606 		kcd_exit_on_error(error);
5607 	}
5608 #endif
5609 
5610 	/*  === END of populating stackshot data === */
5611 error_exit:;
5612 	if (error != KERN_SUCCESS) {
5613 		stackshot_set_error(error);
5614 	}
5615 
5616 	stackshot_panic_guard();
5617 
5618 	return error;
5619 }
5620 
5621 static uint64_t
5622 proc_was_throttled_from_task(task_t task)
5623 {
5624 	uint64_t was_throttled = 0;
5625 	void *bsd_info = get_bsdtask_info(task);
5626 
5627 	if (bsd_info) {
5628 		was_throttled = proc_was_throttled(bsd_info);
5629 	}
5630 
5631 	return was_throttled;
5632 }
5633 
5634 static uint64_t
5635 proc_did_throttle_from_task(task_t task)
5636 {
5637 	uint64_t did_throttle = 0;
5638 	void *bsd_info = get_bsdtask_info(task);
5639 
5640 	if (bsd_info) {
5641 		did_throttle = proc_did_throttle(bsd_info);
5642 	}
5643 
5644 	return did_throttle;
5645 }
5646 
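/*
 * Descriptive note: fill in a mem_and_io_snapshot with the current VM page
 * counts, compressor statistics, and (when available) memory-pressure
 * figures. Runs in the debugger/stackshot context, so it reads live counters
 * directly without taking locks.
 */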
5647 static void
5648 kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap)
5649 {
5650 	unsigned int pages_reclaimed;
5651 	unsigned int pages_wanted;
5652 	kern_return_t kErr;
5653 
5654 	uint64_t compressions = 0;
5655 	uint64_t decompressions = 0;
5656 
5657 	compressions = counter_load(&vm_statistics_compressions);
5658 	decompressions = counter_load(&vm_statistics_decompressions);
5659 
5660 	memio_snap->snapshot_magic = STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC;
5661 	memio_snap->free_pages = vm_page_free_count;
5662 	memio_snap->active_pages = vm_page_active_count;
5663 	memio_snap->inactive_pages = vm_page_inactive_count;
5664 	memio_snap->purgeable_pages = vm_page_purgeable_count;
5665 	memio_snap->wired_pages = vm_page_wire_count;
5666 	memio_snap->speculative_pages = vm_page_speculative_count;
5667 	memio_snap->throttled_pages = vm_page_throttled_count;
5668 	memio_snap->busy_buffer_count = count_busy_buffers();
5669 	memio_snap->filebacked_pages = vm_page_pageable_external_count;
5670 	memio_snap->compressions = (uint32_t)compressions;
5671 	memio_snap->decompressions = (uint32_t)decompressions;
5672 	memio_snap->compressor_size = VM_PAGE_COMPRESSOR_COUNT;
5673 	kErr = mach_vm_pressure_monitor(FALSE, VM_PRESSURE_TIME_WINDOW, &pages_reclaimed, &pages_wanted);
5674 
5675 	if (!kErr) {
5676 		memio_snap->pages_wanted = (uint32_t)pages_wanted;
5677 		memio_snap->pages_reclaimed = (uint32_t)pages_reclaimed;
5678 		memio_snap->pages_wanted_reclaimed_valid = 1;
5679 	} else {
5680 		memio_snap->pages_wanted = 0;
5681 		memio_snap->pages_reclaimed = 0;
5682 		memio_snap->pages_wanted_reclaimed_valid = 0;
5683 	}
5684 }
5685 
5686 
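/*
 * Descriptive note: resolve a virtual address in the given map to a physical
 * address on behalf of stackshot, optionally faulting the page in. Per-CPU
 * fault statistics are updated, and once this CPU has spent more than
 * stackshot_max_fault_time faulting (outside of panic stackshots), further
 * faulting is disabled for the remainder of its work.
 */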
5687 static vm_offset_t
5688 stackshot_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_flags, uint32_t *kdp_fault_result_flags)
5689 {
5690 	vm_offset_t result;
5691 	struct kdp_fault_result fault_results = {0};
5692 	if (stackshot_cpu_ctx.scc_fault_stats.sfs_stopped_faulting) {
5693 		fault_flags &= ~KDP_FAULT_FLAGS_ENABLE_FAULTING;
5694 	}
5695 	if (!stackshot_ctx.sc_panic_stackshot) {
5696 		fault_flags |= KDP_FAULT_FLAGS_MULTICPU;
5697 	}
5698 
5699 	result = kdp_find_phys(map, target_addr, fault_flags, &fault_results);
5700 
5701 	if ((fault_results.flags & KDP_FAULT_RESULT_TRIED_FAULT) || (fault_results.flags & KDP_FAULT_RESULT_FAULTED_IN)) {
5702 		stackshot_cpu_ctx.scc_fault_stats.sfs_time_spent_faulting += fault_results.time_spent_faulting;
5703 
5704 #if STACKSHOT_COLLECTS_LATENCY_INFO
5705 		stackshot_cpu_latency.faulting_time_mt += fault_results.time_spent_faulting;
5706 #endif
5707 
5708 		if ((stackshot_cpu_ctx.scc_fault_stats.sfs_time_spent_faulting >= stackshot_max_fault_time) && !stackshot_ctx.sc_panic_stackshot) {
5709 			stackshot_cpu_ctx.scc_fault_stats.sfs_stopped_faulting = (uint8_t) TRUE;
5710 		}
5711 	}
5712 
5713 	if (fault_results.flags & KDP_FAULT_RESULT_FAULTED_IN) {
5714 		stackshot_cpu_ctx.scc_fault_stats.sfs_pages_faulted_in++;
5715 	}
5716 
5717 	if (kdp_fault_result_flags) {
5718 		*kdp_fault_result_flags = fault_results.flags;
5719 	}
5720 
5721 	return result;
5722 }
5723 
5724 /*
5725  * Wrappers around kdp_generic_copyin, kdp_generic_copyin_word, kdp_generic_copyin_string that use stackshot_find_phys
5726  * in order to:
5727  *   1. collect statistics on the number of pages faulted in
5728  *   2. stop faulting if the time spent faulting has exceeded the limit.
5729  */
5730 static boolean_t
5731 stackshot_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5732 {
5733 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5734 	if (try_fault) {
5735 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5736 	}
5737 	return kdp_generic_copyin(map, uaddr, dest, size, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags) == KERN_SUCCESS;
5738 }
5739 static boolean_t
5740 stackshot_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5741 {
5742 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5743 	if (try_fault) {
5744 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5745 	}
5746 	return kdp_generic_copyin_word(task, addr, result, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags) == KERN_SUCCESS;
5747 }
5748 static int
5749 stackshot_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5750 {
5751 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5752 	if (try_fault) {
5753 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5754 	}
5755 	return kdp_generic_copyin_string(task, addr, buf, buf_sz, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags);
5756 }
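
/*
 * Illustrative use of the wrappers above (local variable names are
 * hypothetical):
 *
 *	uint64_t value = 0;
 *	if (stackshot_copyin_word(task, uaddr, &value, try_fault, NULL)) {
 *		// value now holds the word read from the task's address space
 *	}
 */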
5757 
5758 kern_return_t
5759 do_stackshot(void *context)
5760 {
5761 #pragma unused(context)
5762 	kern_return_t error;
5763 	size_t queue_size;
5764 	uint64_t abs_time = mach_absolute_time(), abs_time_end = 0;
5765 	kdp_snapshot++;
5766 
5767 	if (!stackshot_ctx.sc_is_singlethreaded) {
5768 #if defined(__arm64__)
5769 		/*
5770 		 * Set up buffers. stackshot_cpu_preflight() used each buffer entry's
5771 		 * ssb_size field to count how many CPUs in that cluster are
5772 		 * participating in the stackshot, so we can divvy up buffer space accordingly.
5773 		 */
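		/*
		 * Worked example (numbers are hypothetical): with a 4 MB buffer and
		 * 8 participating CPUs, buf_per_cpu below is 512 KB; a cluster whose
		 * preflight count (ssb_size) is 3 then receives a 1.5 MB slice
		 * starting at cur_addr.
		 */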
5774 		size_t buf_per_cpu = stackshot_args.buffer_size / os_atomic_load(&stackshot_ctx.sc_cpus_working, relaxed);
5775 		buf_per_cpu -= buf_per_cpu % sizeof(uint64_t); /* align to uint64_t */
5776 		mach_vm_address_t cur_addr = (mach_vm_address_t) stackshot_args.buffer;
5777 		for (int buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) {
5778 			size_t bufsz = buf_per_cpu * stackshot_ctx.sc_buffers[buf_idx].ssb_size;
5779 			if (bufsz == 0) {
5780 				continue;
5781 			}
5782 			stackshot_ctx.sc_buffers[buf_idx] = (struct stackshot_buffer) {
5783 				.ssb_ptr = (void*) cur_addr,
5784 				.ssb_size = bufsz,
5785 				.ssb_used = 0,
5786 				.ssb_freelist = NULL,
5787 				.ssb_freelist_lock = 0,
5788 				.ssb_overhead = 0
5789 			};
5790 			cur_addr += bufsz;
5791 		}
5792 		assert(cur_addr <= ((mach_vm_address_t) stackshot_args.buffer + stackshot_args.buffer_size));
5793 #else /* __arm64__ */
5794 		/*
5795 		 * On Intel, we always just have one buffer
5796 		 */
5797 		stackshot_ctx.sc_buffers[0] = (struct stackshot_buffer) {
5798 			.ssb_ptr = stackshot_args.buffer,
5799 			.ssb_size = stackshot_args.buffer_size,
5800 			.ssb_used = 0,
5801 			.ssb_freelist = NULL,
5802 			.ssb_freelist_lock = 0,
5803 			.ssb_overhead = 0
5804 		};
5805 #endif /* !__arm64__ */
5806 
5807 		/* Set up queues. These numbers shouldn't change, but slightly fudge queue size just in case. */
5808 		queue_size = FUDGED_SIZE(tasks_count + terminated_tasks_count, 10);
5809 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
5810 			stackshot_ctx.sc_workqueues[i] = (struct stackshot_workqueue) {
5811 				.sswq_items     = stackshot_alloc_arr(struct stackshot_workitem, queue_size, &error),
5812 				.sswq_capacity  = queue_size,
5813 				.sswq_num_items = 0,
5814 				.sswq_cur_item  = 0,
5815 				.sswq_populated = false
5816 			};
5817 			if (error != KERN_SUCCESS) {
5818 				break;
5819 			}
5820 		}
5821 	}
5822 
5823 	_stackshot_validation_reset();
5824 	error = stackshot_plh_setup(); /* set up port label hash */
5825 
5826 	if (error != KERN_SUCCESS) {
5827 		stackshot_set_error(error);
5828 		return error;
5829 	}
5830 
5831 	/*
5832 	 * If no main CPU has been selected at this point (every CPU has called
5833 	 * stackshot_cpu_preflight by now), then there was no CLPC-recommended
5834 	 * P-core available. In that case, volunteer ourselves to be the main
5835 	 * CPU, because someone has to do it.
5836 	 */
5837 	if (stackshot_ctx.sc_main_cpuid == -1) {
5838 		os_atomic_cmpxchg(&stackshot_ctx.sc_main_cpuid, -1, cpu_number(), acquire);
5839 		stackshot_cpu_ctx.scc_can_work = true;
5840 	}
5841 
5842 	/* After this, auxiliary CPUs can begin work. */
5843 	os_atomic_store(&stackshot_ctx.sc_state, SS_RUNNING, release);
5844 
5845 	/* If we are the main CPU, populate the queues / do other main CPU work. */
5846 	if (stackshot_ctx.sc_panic_stackshot || (stackshot_ctx.sc_main_cpuid == cpu_number())) {
5847 		stackshot_ctx.sc_retval = kdp_stackshot_kcdata_format();
5848 	} else if (stackshot_cpu_ctx.scc_can_work) {
5849 		stackshot_cpu_do_work();
5850 	}
5851 
5852 	/* Wait for every CPU to finish. */
5853 #if STACKSHOT_COLLECTS_LATENCY_INFO
5854 	stackshot_ctx.sc_latency.cpu_wait_latency_mt = mach_absolute_time();
5855 #endif
5856 	if (stackshot_cpu_ctx.scc_can_work) {
5857 		os_atomic_dec(&stackshot_ctx.sc_cpus_working, seq_cst);
5858 		stackshot_cpu_ctx.scc_can_work = false;
5859 	}
5860 	while (os_atomic_load(&stackshot_ctx.sc_cpus_working, seq_cst) != 0) {
5861 		loop_wait();
5862 	}
5863 	stackshot_panic_guard();
5864 #if STACKSHOT_COLLECTS_LATENCY_INFO
5865 	stackshot_ctx.sc_latency.cpu_wait_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.cpu_wait_latency_mt;
5866 #endif
5867 
5868 	/* update timestamp of the stackshot */
5869 	abs_time_end = mach_absolute_time();
5870 	stackshot_ctx.sc_duration = (struct stackshot_duration_v2) {
5871 		.stackshot_duration       = (abs_time_end - abs_time),
5872 		.stackshot_duration_outer = 0,
5873 		.stackshot_duration_prior = stackshot_duration_prior_abs,
5874 	};
5875 
5876 	stackshot_plh_reset();
5877 
5878 	/* Check interrupts disabled time. */
5879 #if SCHED_HYGIENE_DEBUG
5880 	bool disable_interrupts_masked_check = kern_feature_override(
5881 		KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD) ||
5882 	    (stackshot_flags & STACKSHOT_DO_COMPRESS) != 0;
5883 
5884 #if STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED
5885 	disable_interrupts_masked_check = true;
5886 #endif /* STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED */
5887 
5888 	if (disable_interrupts_masked_check) {
5889 		ml_spin_debug_clear_self();
5890 	}
5891 
5892 	if (!stackshot_ctx.sc_panic_stackshot && interrupt_masked_debug_mode) {
5893 		/*
5894 		 * Try to catch instances where stackshot takes too long BEFORE returning from
5895 		 * the debugger
5896 		 */
5897 		ml_handle_stackshot_interrupt_disabled_duration(current_thread());
5898 	}
5899 #endif /* SCHED_HYGIENE_DEBUG */
5900 
5901 	kdp_snapshot--;
5902 
5903 	/* If any other CPU had an error, make sure we return it */
5904 	if (stackshot_ctx.sc_retval == KERN_SUCCESS) {
5905 		stackshot_ctx.sc_retval = stackshot_status_check();
5906 	}
5907 
5908 #if CONFIG_EXCLAVES
5909 	/* Avoid setting AST until as late as possible, in case the stackshot fails */
5910 	if (!stackshot_ctx.sc_panic_stackshot && stackshot_ctx.sc_retval == KERN_SUCCESS) {
5911 		commit_exclaves_ast();
5912 	}
5913 	if (stackshot_ctx.sc_retval != KERN_SUCCESS && stackshot_exclave_inspect_ctids) {
5914 		/* Clear inspection CTID list: no need to wait for these threads */
5915 		stackshot_cleanup_exclave_waitlist();
5916 	}
5917 #endif
5918 
5919 	/* If this is a singlethreaded stackshot, the "final" kcdata buffer is just our CPU's kcdata buffer */
5920 	if (stackshot_ctx.sc_is_singlethreaded) {
5921 		stackshot_ctx.sc_finalized_kcdata = stackshot_kcdata_p;
5922 	}
5923 
5924 	return stackshot_ctx.sc_retval;
5925 }
5926 
5927 kern_return_t
5928 do_panic_stackshot(void *context)
5929 {
5930 	kern_return_t ret = do_stackshot(context);
5931 	if (ret != KERN_SUCCESS) {
5932 		goto out;
5933 	}
5934 
5935 	ret = stackshot_finalize_singlethreaded_kcdata();
5936 
5937 out:
5938 	return ret;
5939 }
5940 
5941 /*
5942  * Set up needed state for this CPU before participating in a stackshot.
5943  * Namely, we want to signal that we're available to do work.
5944  * Called while interrupts are disabled & in the debugger trap.
5945  */
5946 void
5947 stackshot_cpu_preflight(void)
5948 {
5949 	bool is_recommended, is_calling_cpu;
5950 	int my_cpu_no = cpu_number();
5951 
5952 #if STACKSHOT_COLLECTS_LATENCY_INFO
5953 	stackshot_cpu_latency = (typeof(stackshot_cpu_latency)) {
5954 		.cpu_number            =  cpu_number(),
5955 #if defined(__AMP__)
5956 		.cluster_type          =  current_cpu_datap()->cpu_cluster_type,
5957 #else /* __AMP__ */
5958 		.cluster_type = CLUSTER_TYPE_SMP,
5959 #endif /* __AMP__ */
5960 		.faulting_time_mt      = 0,
5961 		.total_buf             = 0,
5962 		.intercluster_buf_used = 0
5963 	};
5964 #if CONFIG_PERVASIVE_CPI
5965 	mt_cur_cpu_cycles_instrs_speculative(&stackshot_cpu_latency.total_cycles, &stackshot_cpu_latency.total_instrs);
5966 #endif /* CONFIG_PERVASIVE_CPI */
5967 	stackshot_cpu_latency.init_latency_mt = stackshot_cpu_latency.total_latency_mt = mach_absolute_time();
5968 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5969 
5970 	is_recommended = current_processor()->is_recommended;
5971 
5972 	/* If this is a recommended P-core (or SMP), try making it the main CPU */
5973 	if (is_recommended
5974 #if defined(__AMP__)
5975 	    && current_cpu_datap()->cpu_cluster_type == CLUSTER_TYPE_P
5976 #endif /* __AMP__ */
5977 	    ) {
5978 		os_atomic_cmpxchg(&stackshot_ctx.sc_main_cpuid, -1, my_cpu_no, acquire);
5979 	}
5980 
5981 	is_calling_cpu = stackshot_ctx.sc_calling_cpuid == my_cpu_no;
5982 
5983 	stackshot_cpu_ctx.scc_did_work = false;
5984 	stackshot_cpu_ctx.scc_can_work = is_calling_cpu || (is_recommended && !stackshot_ctx.sc_is_singlethreaded);
5985 
5986 	if (stackshot_cpu_ctx.scc_can_work) {
5987 		/*
5988 		 * Increase size of our cluster's buffer to indicate how many CPUs in this
5989 		 * cluster are participating
5990 		 */
5991 #if defined(__arm64__)
5992 		os_atomic_inc(&stackshot_ctx.sc_buffers[cpu_cluster_id()].ssb_size, relaxed);
5993 #endif /* __arm64__ */
5994 		os_atomic_inc(&stackshot_ctx.sc_cpus_working, seq_cst);
5995 	}
5996 }
5997 
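/*
 * Descriptive note: drain one workqueue. Each work item gets its own freshly
 * linked kcdata buffer before the task is recorded; if the queue is empty but
 * still being populated by the main CPU, spin in loop_wait() until more work
 * (or the populated flag) shows up. Bail out early if any CPU has reported an
 * error or a panic is in progress.
 */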
5998 __result_use_check
5999 static kern_return_t
6000 stackshot_cpu_work_on_queue(struct stackshot_workqueue *queue)
6001 {
6002 	struct stackshot_workitem     *cur_workitemp;
6003 	kern_return_t                  error = KERN_SUCCESS;
6004 
6005 	while (((cur_workitemp = stackshot_get_workitem(queue)) != NULL || !os_atomic_load(&queue->sswq_populated, acquire))) {
6006 		/* Check to make sure someone hasn't errored out or panicked. */
6007 		if (__improbable(stackshot_status_check() != KERN_SUCCESS)) {
6008 			return KERN_ABORTED;
6009 		}
6010 
6011 		if (cur_workitemp) {
6012 			kcd_exit_on_error(stackshot_new_linked_kcdata());
6013 			cur_workitemp->sswi_data = stackshot_cpu_ctx.scc_kcdata_head;
6014 			kcd_exit_on_error(kdp_stackshot_record_task(cur_workitemp->sswi_task));
6015 			stackshot_finalize_linked_kcdata();
6016 		} else {
6017 #if STACKSHOT_COLLECTS_LATENCY_INFO
6018 			uint64_t time_begin = mach_absolute_time();
6019 #endif
6020 			loop_wait();
6021 #if STACKSHOT_COLLECTS_LATENCY_INFO
6022 			stackshot_cpu_latency.workqueue_latency_mt += mach_absolute_time() - time_begin;
6023 #endif
6024 		}
6025 	}
6026 
6027 error_exit:
6028 	return error;
6029 }
6030 
6031 static void
6032 stackshot_cpu_do_work(void)
6033 {
6034 	kern_return_t                  error;
6035 
6036 	stackshot_cpu_ctx.scc_stack_buffer = stackshot_alloc_arr(uintptr_t, MAX_FRAMES, &error);
6037 	if (error != KERN_SUCCESS) {
6038 		goto error_exit;
6039 	}
6040 
6041 #if STACKSHOT_COLLECTS_LATENCY_INFO
6042 	stackshot_cpu_latency.init_latency_mt = mach_absolute_time() - stackshot_cpu_latency.init_latency_mt;
6043 #endif
6044 
6045 	bool high_perf = true;
6046 
6047 #if defined(__AMP__)
6048 	if (current_cpu_datap()->cpu_cluster_type != CLUSTER_TYPE_P) {
6049 		high_perf = false;
6050 	}
6051 #endif /* __AMP__ */
6052 
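	/*
	 * The queues are ordered from least to most expensive work: performance
	 * cores start from the heavy end while efficiency cores work up from the
	 * light end, so the two meet somewhere in the middle.
	 */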
6053 	if (high_perf) {
6054 		/* High Perf: Work from most difficult to least difficult */
6055 		for (size_t i = STACKSHOT_NUM_WORKQUEUES; i > 0; i--) {
6056 			kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i - 1]));
6057 		}
6058 	} else {
6059 		/* Low Perf: Work from least difficult to most difficult */
6060 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
6061 			kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i]));
6062 		}
6063 	}
6064 #if STACKSHOT_COLLECTS_LATENCY_INFO
6065 	stackshot_cpu_latency.total_latency_mt = mach_absolute_time() - stackshot_cpu_latency.total_latency_mt;
6066 #if CONFIG_PERVASIVE_CPI
6067 	uint64_t cycles, instrs;
6068 	mt_cur_cpu_cycles_instrs_speculative(&cycles, &instrs);
6069 	stackshot_cpu_latency.total_cycles = cycles - stackshot_cpu_latency.total_cycles;
6070 	stackshot_cpu_latency.total_instrs = instrs - stackshot_cpu_latency.total_instrs;
6071 #endif /* CONFIG_PERVASIVE_CPI */
6072 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
6073 
6074 error_exit:
6075 	if (error != KERN_SUCCESS) {
6076 		stackshot_set_error(error);
6077 	}
6078 	stackshot_panic_guard();
6079 }
6080 
6081 /*
6082  * This is where the other CPUs will end up when we take a stackshot.
6083  * If they're available to do work, they'll do so here.
6084  * Called with interrupts disabled & from the debugger trap.
6085  */
6086 void
6087 stackshot_aux_cpu_entry(void)
6088 {
6089 	/*
6090 	 * This is where the other CPUs will end up when we take a stackshot.
6091 	 * Also, the main CPU will call this in the middle of its work to chip
6092 	 * away at the queue.
6093 	 */
6094 
6095 	/* Don't do work if we said we couldn't... */
6096 	if (!stackshot_cpu_ctx.scc_can_work) {
6097 		return;
6098 	}
6099 
6100 	/* Spin until we're ready to run. */
6101 	while (os_atomic_load(&stackshot_ctx.sc_state, acquire) == SS_SETUP) {
6102 		loop_wait();
6103 	}
6104 
6105 	/* Check to make sure the setup didn't error out or panic. */
6106 	if (stackshot_status_check() != KERN_SUCCESS) {
6107 		goto exit;
6108 	}
6109 
6110 	/* the CPU entering here is participating in the stackshot */
6111 	stackshot_cpu_ctx.scc_did_work = true;
6112 
6113 	if (stackshot_ctx.sc_main_cpuid == cpu_number()) {
6114 		stackshot_ctx.sc_retval = kdp_stackshot_kcdata_format();
6115 	} else {
6116 		stackshot_cpu_do_work();
6117 	}
6118 
6119 exit:
6120 	os_atomic_dec(&stackshot_ctx.sc_cpus_working, release);
6121 }
6122 
6123 boolean_t
6124 stackshot_thread_is_idle_worker_unsafe(thread_t thread)
6125 {
6126 	/* When the pthread kext puts a worker thread to sleep, it will
6127 	 * set kThreadWaitParkedWorkQueue in the block_hint of the thread
6128 	 * struct. See parkit() in kern/kern_support.c in libpthread.
6129 	 */
6130 	return (thread->state & TH_WAIT) &&
6131 	       (thread->block_hint == kThreadWaitParkedWorkQueue);
6132 }
6133 
6134 #if CONFIG_COALITIONS
6135 static void
6136 stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal)
6137 {
6138 #pragma unused(i, coal)
6139 	unsigned int *coalition_count = (unsigned int*)arg;
6140 	(*coalition_count)++;
6141 }
6142 
6143 static void
6144 stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t coal)
6145 {
6146 	if (coalition_type(coal) != COALITION_TYPE_JETSAM) {
6147 		return;
6148 	}
6149 
6150 	struct jetsam_coalition_snapshot *coalitions = (struct jetsam_coalition_snapshot*)arg;
6151 	struct jetsam_coalition_snapshot *jcs = &coalitions[i];
6152 	task_t leader = TASK_NULL;
6153 	jcs->jcs_id = coalition_id(coal);
6154 	jcs->jcs_flags = 0;
6155 	jcs->jcs_thread_group = 0;
6156 
6157 	if (coalition_term_requested(coal)) {
6158 		jcs->jcs_flags |= kCoalitionTermRequested;
6159 	}
6160 	if (coalition_is_terminated(coal)) {
6161 		jcs->jcs_flags |= kCoalitionTerminated;
6162 	}
6163 	if (coalition_is_reaped(coal)) {
6164 		jcs->jcs_flags |= kCoalitionReaped;
6165 	}
6166 	if (coalition_is_privileged(coal)) {
6167 		jcs->jcs_flags |= kCoalitionPrivileged;
6168 	}
6169 
6170 #if CONFIG_THREAD_GROUPS
6171 	struct thread_group *thread_group = kdp_coalition_get_thread_group(coal);
6172 	if (thread_group) {
6173 		jcs->jcs_thread_group = thread_group_get_id(thread_group);
6174 	}
6175 #endif /* CONFIG_THREAD_GROUPS */
6176 
6177 	leader = kdp_coalition_get_leader(coal);
6178 	if (leader) {
6179 		jcs->jcs_leader_task_uniqueid = get_task_uniqueid(leader);
6180 	} else {
6181 		jcs->jcs_leader_task_uniqueid = 0;
6182 	}
6183 }
6184 #endif /* CONFIG_COALITIONS */
6185 
6186 #if CONFIG_THREAD_GROUPS
6187 static void
6188 stackshot_thread_group_count(void *arg, int i, struct thread_group *tg)
6189 {
6190 #pragma unused(i, tg)
6191 	unsigned int *n = (unsigned int*)arg;
6192 	(*n)++;
6193 }
6194 
6195 static void
6196 stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg)
6197 {
6198 	struct thread_group_snapshot_v3 *thread_groups = arg;
6199 	struct thread_group_snapshot_v3 *tgs = &thread_groups[i];
6200 	const char *name = thread_group_get_name(tg);
6201 	uint32_t flags = thread_group_get_flags(tg);
6202 	tgs->tgs_id = thread_group_get_id(tg);
6203 	static_assert(THREAD_GROUP_MAXNAME > sizeof(tgs->tgs_name));
6204 	kdp_memcpy(tgs->tgs_name, name, sizeof(tgs->tgs_name));
6205 	kdp_memcpy(tgs->tgs_name_cont, name + sizeof(tgs->tgs_name),
6206 	    sizeof(tgs->tgs_name_cont));
6207 	tgs->tgs_flags =
6208 	    ((flags & THREAD_GROUP_FLAGS_EFFICIENT)     ? kThreadGroupEfficient     : 0) |
6209 	    ((flags & THREAD_GROUP_FLAGS_APPLICATION)   ? kThreadGroupApplication   : 0) |
6210 	    ((flags & THREAD_GROUP_FLAGS_CRITICAL)      ? kThreadGroupCritical      : 0) |
6211 	    ((flags & THREAD_GROUP_FLAGS_BEST_EFFORT)   ? kThreadGroupBestEffort    : 0) |
6212 	    ((flags & THREAD_GROUP_FLAGS_UI_APP)        ? kThreadGroupUIApplication : 0) |
6213 	    ((flags & THREAD_GROUP_FLAGS_MANAGED)       ? kThreadGroupManaged       : 0) |
6214 	    ((flags & THREAD_GROUP_FLAGS_STRICT_TIMERS) ? kThreadGroupStrictTimers  : 0) |
6215 	    0;
6216 }
6217 #endif /* CONFIG_THREAD_GROUPS */
6218 
6219 /* Determine if a thread has waitinfo that stackshot can provide */
6220 static int
6221 stackshot_thread_has_valid_waitinfo(thread_t thread)
6222 {
6223 	if (!(thread->state & TH_WAIT)) {
6224 		return 0;
6225 	}
6226 
6227 	switch (thread->block_hint) {
6228 	// If set to None or is a parked work queue, ignore it
6229 	case kThreadWaitParkedWorkQueue:
6230 	case kThreadWaitNone:
6231 		return 0;
6232 	// There is a short window where the pthread kext removes a thread
6233 	// from its ksyn wait queue before waking the thread up
6234 	case kThreadWaitPThreadMutex:
6235 	case kThreadWaitPThreadRWLockRead:
6236 	case kThreadWaitPThreadRWLockWrite:
6237 	case kThreadWaitPThreadCondVar:
6238 		return kdp_pthread_get_thread_kwq(thread) != NULL;
6239 	// All other cases are valid block hints if in a wait state
6240 	default:
6241 		return 1;
6242 	}
6243 }
6244 
6245 /* Determine if a thread has turnstileinfo that stackshot can provide */
6246 static int
6247 stackshot_thread_has_valid_turnstileinfo(thread_t thread)
6248 {
6249 	struct turnstile *ts = thread_get_waiting_turnstile(thread);
6250 
6251 	return stackshot_thread_has_valid_waitinfo(thread) &&
6252 	       ts != TURNSTILE_NULL;
6253 }
6254 
6255 static void
6256 stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_v2_t *tsinfo)
6257 {
6258 	struct turnstile *ts;
6259 	struct ipc_service_port_label *ispl = NULL;
6260 
6261 	/* acquire turnstile information and store it in the stackshot */
6262 	ts = thread_get_waiting_turnstile(thread);
6263 	tsinfo->waiter = thread_tid(thread);
6264 	kdp_turnstile_fill_tsinfo(ts, tsinfo, &ispl);
6265 	tsinfo->portlabel_id = stackshot_plh_lookup(ispl,
6266 	    (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_SENDPORT) ? STACKSHOT_PLH_LOOKUP_SEND :
6267 	    (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_RECEIVEPORT) ? STACKSHOT_PLH_LOOKUP_RECEIVE :
6268 	    STACKSHOT_PLH_LOOKUP_UNKNOWN);
6269 }
6270 
6271 static void
6272 stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_v2_t *waitinfo)
6273 {
6274 	thread_waitinfo_t *waitinfo_v1 = (thread_waitinfo_t *)waitinfo;
6275 	struct ipc_service_port_label *ispl = NULL;
6276 
6277 	waitinfo->waiter        = thread_tid(thread);
6278 	waitinfo->wait_type     = thread->block_hint;
6279 	waitinfo->wait_flags    = 0;
6280 
6281 	switch (waitinfo->wait_type) {
6282 	case kThreadWaitKernelMutex:
6283 		kdp_lck_mtx_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6284 		break;
6285 	case kThreadWaitPortReceive:
6286 		kdp_mqueue_recv_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo, &ispl);
6287 		waitinfo->portlabel_id  = stackshot_plh_lookup(ispl, STACKSHOT_PLH_LOOKUP_RECEIVE);
6288 		break;
6289 	case kThreadWaitPortSend:
6290 		kdp_mqueue_send_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo, &ispl);
6291 		waitinfo->portlabel_id  = stackshot_plh_lookup(ispl, STACKSHOT_PLH_LOOKUP_SEND);
6292 		break;
6293 	case kThreadWaitSemaphore:
6294 		kdp_sema_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6295 		break;
6296 	case kThreadWaitUserLock:
6297 		kdp_ulock_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6298 		break;
6299 	case kThreadWaitKernelRWLockRead:
6300 	case kThreadWaitKernelRWLockWrite:
6301 	case kThreadWaitKernelRWLockUpgrade:
6302 		kdp_rwlck_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6303 		break;
6304 	case kThreadWaitPThreadMutex:
6305 	case kThreadWaitPThreadRWLockRead:
6306 	case kThreadWaitPThreadRWLockWrite:
6307 	case kThreadWaitPThreadCondVar:
6308 		kdp_pthread_find_owner(thread, waitinfo_v1);
6309 		break;
6310 	case kThreadWaitWorkloopSyncWait:
6311 		kdp_workloop_sync_wait_find_owner(thread, thread->wait_event, waitinfo_v1);
6312 		break;
6313 	case kThreadWaitOnProcess:
6314 		kdp_wait4_find_process(thread, thread->wait_event, waitinfo_v1);
6315 		break;
6316 	case kThreadWaitSleepWithInheritor:
6317 		kdp_sleep_with_inheritor_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6318 		break;
6319 	case kThreadWaitEventlink:
6320 		kdp_eventlink_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6321 		break;
6322 	case kThreadWaitCompressor:
6323 		kdp_compressor_busy_find_owner(thread->wait_event, waitinfo_v1);
6324 		break;
6325 #ifdef CONFIG_EXCLAVES
6326 	case kThreadWaitExclaveCore:
6327 	case kThreadWaitExclaveKit:
6328 		kdp_esync_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6329 		break;
6330 #endif /* CONFIG_EXCLAVES */
6331 	case kThreadWaitPageBusy:
6332 		kdp_vm_page_sleep_find_owner(thread->wait_event, waitinfo_v1);
6333 		break;
6334 	case kThreadWaitPagingInProgress:
6335 	case kThreadWaitPagingActivity:
6336 	case kThreadWaitPagerInit:
6337 	case kThreadWaitPagerReady:
6338 	case kThreadWaitMemoryBlocked:
6339 	case kThreadWaitPageInThrottle:
6340 		kdp_vm_object_sleep_find_owner(thread->wait_event, waitinfo->wait_type, waitinfo_v1);
6341 		break;
6342 	default:
6343 		waitinfo->owner = 0;
6344 		waitinfo->context = 0;
6345 		break;
6346 	}
6347 }
6348