xref: /xnu-11417.121.6/osfmk/kern/kern_stackshot.c (revision a1e26a70f38d1d7daa7b49b258e2f8538ad81650)
1 /*
2  * Copyright (c) 2013-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 
30 #include <mach/mach_types.h>
31 #include <mach/vm_param.h>
32 #include <mach/mach_vm.h>
33 #include <mach/clock_types.h>
34 #include <sys/code_signing.h>
35 #include <sys/errno.h>
36 #include <sys/stackshot.h>
37 #if defined(__arm64__)
38 #include <arm/cpu_internal.h>
39 #endif /* __arm64__ */
40 #ifdef IMPORTANCE_INHERITANCE
41 #include <ipc/ipc_importance.h>
42 #endif
43 #include <sys/appleapiopts.h>
44 #include <kern/debug.h>
45 #include <kern/block_hint.h>
46 #include <uuid/uuid.h>
47 
48 #include <kdp/kdp_dyld.h>
49 #include <kdp/kdp_en_debugger.h>
50 #include <kdp/processor_core.h>
51 #include <kdp/kdp_common.h>
52 
53 #include <libsa/types.h>
54 #include <libkern/version.h>
55 #include <libkern/section_keywords.h>
56 
57 #include <string.h> /* bcopy */
58 
59 #include <kern/kern_stackshot.h>
60 #include <kern/kcdata_private.h>
61 #include <kern/backtrace.h>
62 #include <kern/coalition.h>
63 #include <kern/epoch_sync.h>
64 #include <kern/exclaves_stackshot.h>
65 #include <kern/exclaves_inspection.h>
66 #include <kern/processor.h>
67 #include <kern/host_statistics.h>
68 #include <kern/counter.h>
69 #include <kern/thread.h>
70 #include <kern/thread_group.h>
71 #include <kern/task.h>
72 #include <kern/telemetry.h>
73 #include <kern/clock.h>
74 #include <kern/policy_internal.h>
75 #include <kern/socd_client.h>
76 #include <kern/startup.h>
77 #include <vm/vm_map_xnu.h>
78 #include <vm/vm_kern_xnu.h>
79 #include <vm/vm_pageout.h>
80 #include <vm/vm_fault.h>
81 #include <vm/vm_shared_region_xnu.h>
82 #include <vm/vm_compressor_xnu.h>
83 #include <libkern/OSKextLibPrivate.h>
84 #include <os/log.h>
85 
86 #ifdef CONFIG_EXCLAVES
87 #include <kern/exclaves.tightbeam.h>
88 #endif /* CONFIG_EXCLAVES */
89 
90 #include <kern/exclaves_test_stackshot.h>
91 
92 #include <libkern/coreanalytics/coreanalytics.h>
93 
94 #if defined(__x86_64__)
95 #include <i386/mp.h>
96 #include <i386/cpu_threads.h>
97 #endif
98 
99 #include <pexpert/pexpert.h>
100 
101 #if CONFIG_PERVASIVE_CPI
102 #include <kern/monotonic.h>
103 #endif /* CONFIG_PERVASIVE_CPI */
104 
105 #include <san/kasan.h>
106 
107 #if DEBUG || DEVELOPMENT
108 #define STACKSHOT_COLLECTS_DIAGNOSTICS 1
109 #define STACKSHOT_COLLECTS_LATENCY_INFO 1
110 #else
111 #define STACKSHOT_COLLECTS_DIAGNOSTICS 0
112 #define STACKSHOT_COLLECTS_LATENCY_INFO 0
113 #endif /* DEBUG || DEVELOPMENT */
114 
115 #define STACKSHOT_COLLECTS_RDAR_126582377_DATA 0
116 
117 #if defined(__AMP__)
118 #define STACKSHOT_NUM_WORKQUEUES 2
119 #else /* __AMP__ */
120 #define STACKSHOT_NUM_WORKQUEUES 1
121 #endif
122 
123 #if defined(__arm64__)
124 #define STACKSHOT_NUM_BUFFERS MAX_CPU_CLUSTERS
125 #else /* __arm64__ */
126 #define STACKSHOT_NUM_BUFFERS 1
127 #endif /* __arm64__ */
128 
129 /* The number of threads which will land a task in the hardest workqueue. */
130 #define STACKSHOT_HARDEST_THREADCOUNT 10
131 
132 TUNABLE_DEV_WRITEABLE(unsigned int, stackshot_single_thread, "stackshot_single_thread", 0);
133 
134 extern unsigned int not_in_kdp;
135 
136 /* indicate to the compiler that some accesses are unaligned */
137 typedef uint64_t unaligned_u64 __attribute__((aligned(1)));
138 
139 int kdp_snapshot                            = 0;
140 
141 #pragma mark ---Stackshot Struct Definitions---
142 
143 typedef struct linked_kcdata_descriptor {
144 	struct kcdata_descriptor          kcdata;
145 	struct linked_kcdata_descriptor  *next;
146 } * linked_kcdata_descriptor_t;
147 
148 struct stackshot_workitem {
149 	task_t                        sswi_task;
150 	linked_kcdata_descriptor_t    sswi_data; /* The kcdata for this task. */
151 	int                           sswi_idx;  /* The index of this job, used for ordering kcdata across multiple queues. */
152 };
153 
154 struct stackshot_workqueue {
155 	uint32_t _Atomic              sswq_num_items; /* Only modified by main CPU */
156 	uint32_t _Atomic              sswq_cur_item; /* Modified by all CPUs */
157 	size_t                        sswq_capacity; /* Constant after preflight */
158 	bool _Atomic                  sswq_populated; /* Only modified by main CPU */
159 	struct stackshot_workitem    *__counted_by(sswq_capacity) sswq_items;
160 };
161 
162 struct freelist_entry {
163 	struct freelist_entry        *fl_next; /* Next entry in the freelist */
164 	size_t                        fl_size; /* Size of the entry (must be >= sizeof(struct freelist_entry)) */
165 };
166 
167 struct stackshot_buffer {
168 	void                         *ssb_ptr; /* Base of buffer */
169 	size_t                        ssb_size;
170 	size_t _Atomic                ssb_used;
171 	struct freelist_entry        *ssb_freelist; /* First freelist entry */
172 	int _Atomic                   ssb_freelist_lock;
173 	size_t _Atomic                ssb_overhead; /* Total amount ever freed (even if re-allocated from freelist) */
174 };
175 
176 struct kdp_snapshot_args {
177 	int                           pid;
178 	void                         *buffer;
179 	struct kcdata_descriptor     *descriptor;
180 	uint32_t                      buffer_size;
181 	uint64_t                      flags;
182 	uint64_t                      since_timestamp;
183 	uint32_t                      pagetable_mask;
184 };
185 
186 /*
187  * Keep a simple cache of the most recent validation done at a page granularity
188  * to avoid the expensive software KVA-to-phys translation in the VM.
189  */
190 
191 struct _stackshot_validation_state {
192 	vm_offset_t last_valid_page_kva;
193 	size_t last_valid_size;
194 };
195 
196 /* CPU-local generation counts for PLH */
197 struct _stackshot_plh_gen_state {
198 	uint8_t                *pgs_gen;       /* last 'gen #' seen for each entry */
199 	int16_t                 pgs_curgen_min; /* min idx seen for this gen */
200 	int16_t                 pgs_curgen_max; /* max idx seen for this gen */
201 	uint8_t                 pgs_curgen;     /* current gen */
202 };
203 
204 /*
205  * For port labels, we have a small hash table we use to track the
206  * struct ipc_service_port_label pointers we see along the way.
207  * This structure encapsulates the global state.
208  *
209  * The hash table is insert-only, similar to "intern"ing strings.  It's
210  * only used and manipulated during the stackshot collection.  We use
211  * separate chaining, with the hash elements and chains being int16_t
212  * indexes into the parallel arrays, with -1 ending the chain.  Array indices are
213  * allocated using a bump allocator.
214  *
215  * The parallel arrays contain:
216  *      - plh_array[idx]	the pointer entered
217  *      - plh_chains[idx]	the hash chain
218  *      - plh_gen[idx]		the last 'generation #' seen
219  *
220  * Generation IDs are used to track entries looked up in the current
221  * task; 0 is never used, and the plh_gen array is cleared to 0 on
222  * rollover.
223  *
224  * The portlabel_ids we report externally are just the index in the array,
225  * plus 1 to avoid 0 as a value.  0 is NONE, -1 is UNKNOWN (e.g. there is
226  * one, but we ran out of space)
227  */
228 struct port_label_hash {
229 	int _Atomic             plh_lock;       /* lock for concurrent modifications to this plh */
230 	uint16_t                plh_size;       /* size of allocations; 0 disables tracking */
231 	uint16_t                plh_count;      /* count of used entries in plh_array */
232 	struct ipc_service_port_label **plh_array; /* _size allocated, _count used */
233 	int16_t                *plh_chains;    /* _size allocated */
234 	int16_t                *plh_hash;      /* (1 << STACKSHOT_PLH_SHIFT) entry hash table: hash(ptr) -> array index */
235 #if DEVELOPMENT || DEBUG
236 	/* statistics */
237 	uint32_t _Atomic        plh_lookups;    /* # lookups or inserts */
238 	uint32_t _Atomic        plh_found;
239 	uint32_t _Atomic        plh_found_depth;
240 	uint32_t _Atomic        plh_insert;
241 	uint32_t _Atomic        plh_insert_depth;
242 	uint32_t _Atomic        plh_bad;
243 	uint32_t _Atomic        plh_bad_depth;
244 	uint32_t _Atomic        plh_lookup_send;
245 	uint32_t _Atomic        plh_lookup_receive;
246 #define PLH_STAT_OP(...)    (void)(__VA_ARGS__)
247 #else /* DEVELOPMENT || DEBUG */
248 #define PLH_STAT_OP(...)    (void)(0)
249 #endif /* DEVELOPMENT || DEBUG */
250 };
251 
252 #define plh_lock(plh) while(!os_atomic_cmpxchg(&(plh)->plh_lock, 0, 1, acquire)) { loop_wait(); }
253 #define plh_unlock(plh) os_atomic_store(&(plh)->plh_lock, 0, release);
254 
255 #define STACKSHOT_PLH_SHIFT    7
256 #define STACKSHOT_PLH_SIZE_MAX ((kdp_ipc_have_splabel)? 1024 : 0)
257 size_t stackshot_port_label_size = (2 * (1u << STACKSHOT_PLH_SHIFT));
258 #define STASKSHOT_PLH_SIZE(x) MIN((x), STACKSHOT_PLH_SIZE_MAX)
259 
260 struct stackshot_cpu_context {
261 	bool                               scc_can_work; /* Whether the CPU can do more stackshot work */
262 	bool                               scc_did_work; /* Whether the CPU actually did any stackshot work */
263 	linked_kcdata_descriptor_t         scc_kcdata_head; /* See `linked_kcdata_alloc_callback` */
264 	linked_kcdata_descriptor_t         scc_kcdata_tail; /* See `linked_kcdata_alloc_callback` */
265 	uintptr_t                         *scc_stack_buffer; /* A buffer for stacktraces. */
266 	struct stackshot_fault_stats       scc_fault_stats;
267 	struct _stackshot_validation_state scc_validation_state;
268 	struct _stackshot_plh_gen_state    scc_plh_gen;
269 };
270 
271 /*
272  * When directly modifying the stackshot state, always use the macros below to
273  * work with this enum - the higher order bits are used to store an error code
274  * in the case of SS_ERRORED.
275  *
276  *        +------------------------------------+-------------------+
277  *        |                                    |                   |
278  *        v                                    |                   |
279  * +-------------+     +----------+     +------------+     +------------+
280  * | SS_INACTIVE |---->| SS_SETUP |---->| SS_RUNNING |---->| SS_ERRORED |
281  * +-------------+     +----------+     +------------+     +------------+
282  *                         |  |                |                ^  |
283  *                         |  +----------------|----------------+  |
284  * +-------------+         |                   |                   |
285  * | SS_PANICKED |<--------+-------------------+                   |
286  * +-------------+                                                 |
287  *        ^                                                        |
288  *        |                                                        |
289  *        +--------------------------------------------------------+
290  */
291 __enum_closed_decl(stackshot_state_t, uint, {
292 	SS_INACTIVE = 0x0, /* -> SS_SETUP */
293 	SS_SETUP    = 0x1, /* -> SS_RUNNING, SS_ERRORED, SS_PANICKED */
294 	SS_RUNNING  = 0x2, /* -> SS_ERRORED, SS_PANICKED, SS_INACTIVE */
295 	SS_ERRORED  = 0x3, /* -> SS_INACTIVE, SS_PANICKED */
296 	SS_PANICKED = 0x4, /* -> N/A */
297 	_SS_COUNT
298 });
299 
300 static_assert(_SS_COUNT <= 0x5);
301 /* Get the stackshot state ID from a stackshot_state_t. */
302 #define SS_STATE(state) ((state) & 0x7u)
303 /* Get the error code from a stackshot_state_t. */
304 #define SS_ERRCODE(state) ((state) >> 3)
305 /* Make a stackshot error state with a given code. */
306 #define SS_MKERR(code) (((code) << 3) | SS_ERRORED)
307 
308 struct stackshot_context {
309 	/* Constants & Arguments */
310 	struct kdp_snapshot_args      sc_args;
311 	int                           sc_calling_cpuid;
312 	int                           sc_main_cpuid;
313 	bool                          sc_enable_faulting;
314 	uint64_t                      sc_microsecs; /* Timestamp */
315 	bool                          sc_panic_stackshot;
316 	size_t                        sc_min_kcdata_size;
317 	bool                          sc_is_singlethreaded;
318 
319 	/* State & Errors */
320 	stackshot_state_t _Atomic     sc_state; /* Only modified by calling CPU, main CPU, or panicking CPU. See comment above type definition for details. */
321 	kern_return_t                 sc_retval; /* The return value of the main thread */
322 	uint32_t _Atomic              sc_cpus_working;
323 
324 	/* KCData */
325 	linked_kcdata_descriptor_t    sc_pretask_kcdata;
326 	linked_kcdata_descriptor_t    sc_posttask_kcdata;
327 	kcdata_descriptor_t           sc_finalized_kcdata;
328 
329 	/* Buffers & Queues */
330 	struct stackshot_buffer       __counted_by(num_buffers) sc_buffers[STACKSHOT_NUM_BUFFERS];
331 	size_t                        sc_num_buffers;
332 	struct stackshot_workqueue    __counted_by(STACKSHOT_NUM_WORKQUEUES) sc_workqueues[STACKSHOT_NUM_WORKQUEUES];
333 	struct port_label_hash        sc_plh;
334 
335 	/* Statistics */
336 	struct stackshot_duration_v2  sc_duration;
337 	uint32_t                      sc_bytes_traced;
338 	uint32_t                      sc_bytes_uncompressed;
339 #if STACKSHOT_COLLECTS_LATENCY_INFO
340 	struct stackshot_latency_collection_v2 sc_latency;
341 #endif
342 };
343 
344 #define STACKSHOT_DEBUG_TRACEBUF_SIZE 16
345 
346 struct stackshot_trace_entry {
347 	int               sste_line_no;
348 	uint64_t          sste_timestamp;
349 	mach_vm_address_t sste_data;
350 };
351 
352 struct stackshot_trace_buffer {
353 	uint64_t                     sstb_last_trace_timestamp;
354 	size_t                       sstb_tail_idx;
355 	size_t                       sstb_size;
356 	struct stackshot_trace_entry __counted_by(STACKSHOT_DEBUG_TRACEBUF_SIZE) sstb_entries[STACKSHOT_DEBUG_TRACEBUF_SIZE];
357 };
358 
359 #pragma mark ---Stackshot State and Data---
360 
361 /*
362  * Two stackshot states, one for panic and one for normal.
363  * That way, we can take a stackshot during a panic without clobbering state.
364  */
365 #define STACKSHOT_CTX_IDX_NORMAL 0
366 #define STACKSHOT_CTX_IDX_PANIC  1
367 size_t cur_stackshot_ctx_idx   = STACKSHOT_CTX_IDX_NORMAL;
368 struct stackshot_context stackshot_contexts[2] = {{0}, {0}};
369 #define stackshot_ctx (stackshot_contexts[cur_stackshot_ctx_idx])
370 #define stackshot_args (stackshot_ctx.sc_args)
371 #define stackshot_flags (stackshot_args.flags)
372 
373 static struct {
374 	uint64_t last_abs_start;      /* start time of last stackshot */
375 	uint64_t last_abs_end;        /* end time of last stackshot */
376 	uint64_t stackshots_taken;    /* total stackshots taken since boot */
377 	uint64_t stackshots_duration; /* total abs time spent in stackshot_trap() since boot */
378 } stackshot_stats = { 0 };
379 
380 #if STACKSHOT_COLLECTS_LATENCY_INFO
381 static struct stackshot_latency_cpu PERCPU_DATA(stackshot_cpu_latency_percpu);
382 #define stackshot_cpu_latency (*PERCPU_GET(stackshot_cpu_latency_percpu))
383 #endif
384 
385 static struct stackshot_cpu_context PERCPU_DATA(stackshot_cpu_ctx_percpu);
386 #define stackshot_cpu_ctx (*PERCPU_GET(stackshot_cpu_ctx_percpu))
387 
388 static struct kcdata_descriptor PERCPU_DATA(stackshot_kcdata_percpu);
389 #define stackshot_kcdata_p (PERCPU_GET(stackshot_kcdata_percpu))
390 
391 #if STACKSHOT_COLLECTS_LATENCY_INFO
392 static bool collect_latency_info = true;
393 #endif
394 
395 static uint64_t stackshot_max_fault_time;
396 
397 #if STACKSHOT_COLLECTS_DIAGNOSTICS
398 static struct stackshot_trace_buffer PERCPU_DATA(stackshot_trace_buffer);
399 #endif
400 
401 #pragma mark ---Stackshot Global State---
402 
403 uint32_t stackshot_estimate_adj = 25; /* experiment factor: 0-100, adjust our estimate up by this amount */
404 
405 static uint32_t stackshot_initial_estimate;
406 static uint32_t stackshot_initial_estimate_adj;
407 static uint64_t stackshot_duration_prior_abs;   /* prior attempts, abs */
408 static unaligned_u64 * stackshot_duration_outer;
409 static uint64_t stackshot_tries;
410 
411 void * kernel_stackshot_buf   = NULL; /* Pointer to buffer for stackshots triggered from the kernel and retrieved later */
412 int kernel_stackshot_buf_size = 0;
413 
414 void * stackshot_snapbuf = NULL; /* Used by stack_snapshot2 (to be removed) */
415 
416 #if CONFIG_EXCLAVES
417 static ctid_t *stackshot_exclave_inspect_ctids = NULL;
418 static size_t stackshot_exclave_inspect_ctid_count = 0;
419 static size_t stackshot_exclave_inspect_ctid_capacity = 0;
420 
421 static kern_return_t stackshot_exclave_kr = KERN_SUCCESS;
422 #endif /* CONFIG_EXCLAVES */
423 
424 #if DEBUG || DEVELOPMENT
425 TUNABLE(bool, disable_exclave_stackshot, "-disable_exclave_stackshot", false);
426 #else
427 const bool disable_exclave_stackshot = false;
428 #endif
429 
430 #pragma mark ---Stackshot Static Function Declarations---
431 
432 __private_extern__ void stackshot_init( void );
433 static boolean_t        memory_iszero(void *addr, size_t size);
434 static void             stackshot_cpu_do_work(void);
435 static kern_return_t    stackshot_finalize_kcdata(void);
436 static kern_return_t    stackshot_finalize_singlethreaded_kcdata(void);
437 static kern_return_t    stackshot_collect_kcdata(void);
438 static int              kdp_stackshot_kcdata_format();
439 static void             kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap);
440 static vm_offset_t      stackshot_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_flags, uint32_t *kdp_fault_result_flags);
441 static boolean_t        stackshot_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, uint32_t *kdp_fault_result);
442 static int              stackshot_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, uint32_t *kdp_fault_results);
443 static boolean_t        stackshot_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, uint32_t *kdp_fault_results);
444 static uint64_t         proc_was_throttled_from_task(task_t task);
445 static void             stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_v2_t * waitinfo);
446 static int              stackshot_thread_has_valid_waitinfo(thread_t thread);
447 static void             stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_v2_t *tsinfo);
448 static int              stackshot_thread_has_valid_turnstileinfo(thread_t thread);
449 static uint32_t         get_stackshot_estsize(uint32_t prev_size_hint, uint32_t adj, uint64_t trace_flags, pid_t target_pid);
450 static kern_return_t    kdp_snapshot_preflight_internal(struct kdp_snapshot_args args);
451 
452 #if CONFIG_COALITIONS
453 static void             stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal);
454 static void             stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t coal);
455 #endif /* CONFIG_COALITIONS */
456 
457 #if CONFIG_THREAD_GROUPS
458 static void             stackshot_thread_group_count(void *arg, int i, struct thread_group *tg);
459 static void             stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg);
460 #endif /* CONFIG_THREAD_GROUPS */
461 
462 extern uint64_t         workqueue_get_task_ss_flags_from_pwq_state_kdp(void *proc);
463 
464 static kcdata_descriptor_t linked_kcdata_alloc_callback(kcdata_descriptor_t descriptor, size_t min_size);
465 
466 #pragma mark ---Stackshot Externs---
467 
468 struct proc;
469 extern int              proc_pid(struct proc *p);
470 extern uint64_t         proc_uniqueid(void *p);
471 extern uint64_t         proc_was_throttled(void *p);
472 extern uint64_t         proc_did_throttle(void *p);
473 extern int              proc_exiting(void *p);
474 extern int              proc_in_teardown(void *p);
475 static uint64_t         proc_did_throttle_from_task(task_t task);
476 extern void             proc_name_kdp(struct proc *p, char * buf, int size);
477 extern int              proc_threadname_kdp(void * uth, char * buf, size_t size);
478 extern void             proc_starttime_kdp(void * p, uint64_t * tv_sec, uint64_t * tv_usec, uint64_t * abstime);
479 extern void             proc_archinfo_kdp(void* p, cpu_type_t* cputype, cpu_subtype_t* cpusubtype);
480 extern uint64_t         proc_getcsflags_kdp(void * p);
481 extern boolean_t        proc_binary_uuid_kdp(task_t task, uuid_t uuid);
482 extern int              memorystatus_get_pressure_status_kdp(void);
483 extern void             memorystatus_proc_flags_unsafe(void * v, boolean_t *is_dirty, boolean_t *is_dirty_tracked, boolean_t *allow_idle_exit);
484 extern void             panic_stackshot_release_lock(void);
485 
486 extern int count_busy_buffers(void); /* must track with declaration in bsd/sys/buf_internal.h */
487 
488 #if CONFIG_TELEMETRY
489 extern kern_return_t stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval);
490 #endif /* CONFIG_TELEMETRY */
491 
492 extern kern_return_t kern_stack_snapshot_with_reason(char* reason);
493 extern kern_return_t kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user);
494 
495 static size_t stackshot_plh_est_size(void);
496 
497 #if CONFIG_EXCLAVES
498 static kern_return_t collect_exclave_threads(uint64_t);
499 static kern_return_t stackshot_setup_exclave_waitlist(void);
500 #endif
501 
502 /*
503  * Validates that the given address for a word is both a valid page and has
504  * default caching attributes for the current map.
505  */
506 bool machine_trace_thread_validate_kva(vm_offset_t);
507 /*
508  * Validates a region that stackshot will potentially inspect.
509  */
510 static bool _stackshot_validate_kva(vm_offset_t, size_t);
511 /*
512  * Must be called whenever stackshot is re-driven.
513  */
514 static void _stackshot_validation_reset(void);
515 /*
516  * A kdp-safe strlen() call.  Returns:
517  *      -1 if we reach maxlen or a bad address before the end of the string, or
518  *      strlen(s)
519  */
520 static long _stackshot_strlen(const char *s, size_t maxlen);
521 
522 #define MAX_FRAMES 1000
523 #define STACKSHOT_PAGETABLE_BUFSZ 4000
524 #define MAX_LOADINFOS 500
525 #define MAX_DYLD_COMPACTINFO (20 * 1024)  // max bytes of compactinfo to include per proc/shared region
526 #define TASK_IMP_WALK_LIMIT 20
527 
528 typedef struct thread_snapshot *thread_snapshot_t;
529 typedef struct task_snapshot *task_snapshot_t;
530 
531 #if CONFIG_KDP_INTERACTIVE_DEBUGGING
532 extern kdp_send_t    kdp_en_send_pkt;
533 #endif
534 
535 /*
536  * Stackshot locking and other defines.
537  */
538 LCK_GRP_DECLARE(stackshot_subsys_lck_grp, "stackshot_subsys_lock");
539 LCK_MTX_DECLARE(stackshot_subsys_mutex, &stackshot_subsys_lck_grp);
540 
541 #define STACKSHOT_SUBSYS_LOCK() lck_mtx_lock(&stackshot_subsys_mutex)
542 #define STACKSHOT_SUBSYS_TRY_LOCK() lck_mtx_try_lock(&stackshot_subsys_mutex)
543 #define STACKSHOT_SUBSYS_UNLOCK() lck_mtx_unlock(&stackshot_subsys_mutex)
544 #define STACKSHOT_SUBSYS_ASSERT_LOCKED() lck_mtx_assert(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
545 
546 #define SANE_BOOTPROFILE_TRACEBUF_SIZE (64ULL * 1024ULL * 1024ULL)
547 #define SANE_TRACEBUF_SIZE (8ULL * 1024ULL * 1024ULL)
548 
549 #define TRACEBUF_SIZE_PER_GB (1024ULL * 1024ULL)
550 #define GIGABYTES (1024ULL * 1024ULL * 1024ULL)
551 
552 SECURITY_READ_ONLY_LATE(static uint32_t) max_tracebuf_size = SANE_TRACEBUF_SIZE;
553 
554 /*
555  * We currently set a ceiling of 3 milliseconds spent in the kdp fault path
556  * for non-panic stackshots where faulting is requested.
557  */
558 #define KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS (3 * NSEC_PER_MSEC)
559 
560 
561 #ifndef ROUNDUP
562 #define ROUNDUP(x, y)            ((((x)+(y)-1)/(y))*(y))
563 #endif
564 
565 #define STACKSHOT_QUEUE_LABEL_MAXSIZE  64
566 
567 #pragma mark ---Stackshot Useful Macros---
568 
569 #define kcd_end_address(kcd) ((void *)((uint64_t)((kcd)->kcd_addr_begin) + kcdata_memory_get_used_bytes((kcd))))
570 #define kcd_max_address(kcd) ((void *)((kcd)->kcd_addr_begin + (kcd)->kcd_length))
571 /*
572  * Use of the kcd_exit_on_error(action) macro requires a local
573  * 'kern_return_t error' variable and 'error_exit' label.
574  */
575 #define kcd_exit_on_error(action)                      \
576 	do {                                               \
577 	    if (KERN_SUCCESS != (error = (action))) {      \
578 	        STACKSHOT_TRACE(error);                    \
579 	        if (error == KERN_RESOURCE_SHORTAGE) {     \
580 	            error = KERN_INSUFFICIENT_BUFFER_SIZE; \
581 	        }                                          \
582 	        goto error_exit;                           \
583 	    }                                              \
584 	} while (0); /* end kcd_exit_on_error */
585 
586 #if defined(__arm64__)
587 #define loop_wait_noguard() __builtin_arm_wfe()
588 #elif defined(__x86_64__)
589 #define loop_wait_noguard() __builtin_ia32_pause()
590 #else
591 #define loop_wait_noguard()
592 #endif /* __x86_64__ */
593 
594 #define loop_wait() { loop_wait_noguard(); stackshot_panic_guard(); }
595 
596 static inline void stackshot_panic_guard(void);
597 
598 static __attribute__((noreturn, noinline)) void
599 stackshot_panic_spin(void)
600 {
601 	if (stackshot_cpu_ctx.scc_can_work) {
602 		stackshot_cpu_ctx.scc_can_work = false;
603 		os_atomic_dec(&stackshot_ctx.sc_cpus_working, acquire);
604 	}
605 	if (stackshot_ctx.sc_calling_cpuid == cpu_number()) {
606 		while (os_atomic_load(&stackshot_ctx.sc_cpus_working, acquire) != 0) {
607 			loop_wait_noguard();
608 		}
609 		panic_stackshot_release_lock();
610 	}
611 	while (1) {
612 		loop_wait_noguard();
613 	}
614 }
615 
616 /**
617  * Immediately aborts if another CPU panicked during the stackshot.
618  */
619 static inline void
620 stackshot_panic_guard(void)
621 {
622 	if (__improbable(os_atomic_load(&stackshot_ctx.sc_state, relaxed) == SS_PANICKED)) {
623 		stackshot_panic_spin();
624 	}
625 }
626 
627 /*
628  * Signal that we panicked during a stackshot by setting an atomic flag and
629  * waiting for others to coalesce before continuing the panic. Other CPUs will
630  * spin on this as soon as they see it set in order to prevent multiple
631  * concurrent panics. The calling CPU (i.e. the one holding the debugger lock)
632  * will release it for us in `stackshot_panic_spin` so we can continue
633  * panicking.
634  *
635  * This is called from panic_trap_to_debugger.
636  */
637 void
638 stackshot_cpu_signal_panic(void)
639 {
640 	stackshot_state_t o_state;
641 	if (stackshot_active()) {
642 		/* Check if someone else panicked before we did. */
643 		o_state = os_atomic_xchg(&stackshot_ctx.sc_state, SS_PANICKED, seq_cst);
644 		if (o_state == SS_PANICKED) {
645 			stackshot_panic_spin();
646 		}
647 
648 		/* We're the first CPU to panic - wait for everyone to coalesce. */
649 		if (stackshot_cpu_ctx.scc_can_work) {
650 			stackshot_cpu_ctx.scc_can_work = false;
651 			os_atomic_dec(&stackshot_ctx.sc_cpus_working, acquire);
652 		}
653 		while (os_atomic_load(&stackshot_ctx.sc_cpus_working, seq_cst) != 0) {
654 			loop_wait_noguard();
655 		}
656 	}
657 }
658 
659 /*
660  * Sets the stackshot state to SS_ERRORED along with the error code.
661  * Only works if the current state is SS_RUNNING or SS_SETUP.
662  */
663 static inline void
664 stackshot_set_error(kern_return_t error)
665 {
666 	stackshot_state_t cur_state;
667 	stackshot_state_t err_state = SS_MKERR(error);
668 	if (__improbable(!os_atomic_cmpxchgv(&stackshot_ctx.sc_state, SS_RUNNING, err_state, &cur_state, seq_cst))) {
669 		if (cur_state == SS_SETUP) {
670 			os_atomic_cmpxchg(&stackshot_ctx.sc_state, SS_SETUP, err_state, seq_cst);
671 		} else {
672 			/* Our state is something other than SS_RUNNING or SS_SETUP... Check for panic. */
673 			stackshot_panic_guard();
674 		}
675 	}
676 }
677 
678 /* Returns an error code if the current stackshot context has errored out.
679  * Also functions as a panic guard.
680  */
681 __result_use_check
682 static inline kern_return_t
683 stackshot_status_check(void)
684 {
685 	stackshot_state_t state = os_atomic_load(&stackshot_ctx.sc_state, relaxed);
686 
687 	/* Check for panic */
688 	if (__improbable(SS_STATE(state) == SS_PANICKED)) {
689 		stackshot_panic_spin();
690 	}
691 
692 	/* Check for error */
693 	if (__improbable(SS_STATE(state) == SS_ERRORED)) {
694 		kern_return_t err = SS_ERRCODE(state);
695 		assert(err != KERN_SUCCESS); /* SS_ERRORED should always store an associated error code. */
696 		return err;
697 	}
698 
699 	return KERN_SUCCESS;
700 }
701 
702 #pragma mark ---Stackshot Tracing---
703 
704 #if STACKSHOT_COLLECTS_DIAGNOSTICS
705 static void
706 stackshot_trace(int line_no, mach_vm_address_t data)
707 {
708 	struct stackshot_trace_buffer *buffer = PERCPU_GET(stackshot_trace_buffer);
709 	buffer->sstb_entries[buffer->sstb_tail_idx] = (struct stackshot_trace_entry) {
710 		.sste_line_no = line_no,
711 		.sste_timestamp = mach_continuous_time(),
712 		.sste_data = data
713 	};
714 	buffer->sstb_tail_idx = (buffer->sstb_tail_idx + 1) % STACKSHOT_DEBUG_TRACEBUF_SIZE;
715 	buffer->sstb_size = MIN(buffer->sstb_size + 1, STACKSHOT_DEBUG_TRACEBUF_SIZE);
716 }
717 #define STACKSHOT_TRACE(data) stackshot_trace(__LINE__, (mach_vm_address_t) (data))
718 
719 #else /* STACKSHOT_COLLECTS_DIAGNOSTICS */
720 #define STACKSHOT_TRACE(data) ((void) data)
721 #endif /* !STACKSHOT_COLLECTS_DIAGNOSTICS */
722 
723 #pragma mark ---Stackshot Buffer Management---
724 
725 #define freelist_lock(buffer) while(!os_atomic_cmpxchg(&buffer->ssb_freelist_lock, 0, 1, acquire)) { loop_wait(); }
726 #define freelist_unlock(buffer) os_atomic_store(&buffer->ssb_freelist_lock, 0, release);
727 
728 /**
729  * Allocates some data from the shared stackshot buffer freelist.
730  * This should not be used directly, it is a last resort if we run out of space.
731  */
732 static void *
733 stackshot_freelist_alloc(
734 	size_t size,
735 	struct stackshot_buffer *buffer,
736 	kern_return_t *error)
737 {
738 	struct freelist_entry **cur_freelist, **best_freelist = NULL, *ret = NULL;
739 
740 	freelist_lock(buffer);
741 
742 	cur_freelist = &buffer->ssb_freelist;
743 
744 	while (*cur_freelist != NULL) {
745 		if (((*cur_freelist)->fl_size >= size) && ((best_freelist == NULL) || ((*best_freelist)->fl_size > (*cur_freelist)->fl_size))) {
746 			best_freelist = cur_freelist;
747 			if ((*best_freelist)->fl_size == size) {
748 				break;
749 			}
750 		}
751 		cur_freelist = &((*cur_freelist)->fl_next);
752 	}
753 
754 	/* If we found a freelist entry, update the freelist */
755 	if (best_freelist != NULL) {
756 		os_atomic_sub(&buffer->ssb_overhead, size, relaxed);
757 		ret = *best_freelist;
758 
759 		/* If there's enough unused space at the end of this entry, we should make a new one */
760 		if (((*best_freelist)->fl_size - size) > sizeof(struct freelist_entry)) {
761 			struct freelist_entry *new_freelist = (struct freelist_entry*) ((mach_vm_address_t) *best_freelist + size);
762 			*new_freelist = (struct freelist_entry) {
763 				.fl_next = (*best_freelist)->fl_next,
764 				.fl_size = (*best_freelist)->fl_size - size
765 			};
766 			(*best_freelist)->fl_next = new_freelist;
767 		}
768 
769 		/* Update previous entry with next or new entry */
770 		*best_freelist = (*best_freelist)->fl_next;
771 	}
772 
773 	freelist_unlock(buffer);
774 
775 	if (error != NULL) {
776 		if (ret == NULL) {
777 			*error = KERN_INSUFFICIENT_BUFFER_SIZE;
778 		} else {
779 			*error = KERN_SUCCESS;
780 		}
781 	}
782 
783 	return ret;
784 }
785 
786 /**
787  * Allocates some data from the shared stackshot buffer.
788  * Should not be used directly - see the `stackshot_alloc` and
789  * `stackshot_alloc_arr` macros.
790  */
791 static void *
792 stackshot_buffer_alloc(
793 	size_t size,
794 	struct stackshot_buffer *buffer,
795 	kern_return_t *error)
796 {
797 	size_t o_used, new_used;
798 
799 	stackshot_panic_guard();
800 	assert(!stackshot_ctx.sc_is_singlethreaded);
801 
802 	os_atomic_rmw_loop(&buffer->ssb_used, o_used, new_used, relaxed, {
803 		new_used = o_used + size;
804 		if (new_used > buffer->ssb_size) {
805 		        os_atomic_rmw_loop_give_up(return stackshot_freelist_alloc(size, buffer, error));
806 		}
807 	});
808 
809 	if (error != NULL) {
810 		*error = KERN_SUCCESS;
811 	}
812 
813 	return (void*) ((mach_vm_address_t) buffer->ssb_ptr + o_used);
814 }
815 
816 /**
817  * Finds the best stackshot buffer to use (prefer our cluster's buffer)
818  * and allocates from it.
819  * Should not be used directly - see the `stackshot_alloc` and
820  * `stackshot_alloc_arr` macros.
821  */
822 __result_use_check
823 static void *
824 stackshot_best_buffer_alloc(size_t size, kern_return_t *error)
825 {
826 #if defined(__AMP__)
827 	kern_return_t err;
828 	int           my_cluster;
829 	void         *ret = NULL;
830 #endif /* __AMP__ */
831 
832 #if STACKSHOT_COLLECTS_LATENCY_INFO
833 	stackshot_cpu_latency.total_buf += size;
834 #endif
835 
836 #if defined(__AMP__)
837 	/* First, try our cluster's buffer */
838 	my_cluster = cpu_cluster_id();
839 	ret = stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[my_cluster], &err);
840 
841 	/* Try other buffers now. */
842 	if (err != KERN_SUCCESS) {
843 		for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) {
844 			if (buf_idx == my_cluster) {
845 				continue;
846 			}
847 
848 			ret = stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[buf_idx], &err);
849 			if (err == KERN_SUCCESS) {
850 #if STACKSHOT_COLLECTS_LATENCY_INFO
851 				stackshot_cpu_latency.intercluster_buf_used += size;
852 #endif
853 				break;
854 			}
855 		}
856 	}
857 
858 	if (error != NULL) {
859 		*error = err;
860 	}
861 
862 	return ret;
863 #else /* __AMP__ */
864 	return stackshot_buffer_alloc(size, &stackshot_ctx.sc_buffers[0], error);
865 #endif /* !__AMP__ */
866 }
867 
868 /**
869  * Frees some data from the shared stackshot buffer and adds it to the freelist.
870  */
871 static void
872 stackshot_buffer_free(
873 	void *ptr,
874 	struct stackshot_buffer *buffer,
875 	size_t size)
876 {
877 	stackshot_panic_guard();
878 
879 	/* This should never be called during a singlethreaded stackshot. */
880 	assert(!stackshot_ctx.sc_is_singlethreaded);
881 
882 	os_atomic_add(&buffer->ssb_overhead, size, relaxed);
883 
884 	/* Make sure we have enough space for the freelist entry */
885 	if (size < sizeof(struct freelist_entry)) {
886 		return;
887 	}
888 
889 	freelist_lock(buffer);
890 
891 	/* Create new freelist entry and push it to the front of the list */
892 	*((struct freelist_entry*) ptr) = (struct freelist_entry) {
893 		.fl_size = size,
894 		.fl_next = buffer->ssb_freelist
895 	};
896 	buffer->ssb_freelist = ptr;
897 
898 	freelist_unlock(buffer);
899 }
900 
901 /**
902  * Allocates some data from the stackshot buffer. Uses the bump allocator in
903  * multithreaded mode and endalloc in singlethreaded.
904  * err must ALWAYS be nonnull.
905  * Should not be used directly - see the macros in kern_stackshot.h.
906  */
907 void *
908 stackshot_alloc_with_size(size_t size, kern_return_t *err)
909 {
910 	void *ptr;
911 	assert(err != NULL);
912 	assert(stackshot_active());
913 
914 	stackshot_panic_guard();
915 
916 	if (stackshot_ctx.sc_is_singlethreaded) {
917 		ptr = kcdata_endalloc(stackshot_kcdata_p, size);
918 		if (ptr == NULL) {
919 			*err = KERN_INSUFFICIENT_BUFFER_SIZE;
920 		}
921 	} else {
922 		ptr = stackshot_best_buffer_alloc(size, err);
923 		if (ptr == NULL) {
924 			/* We should always return an error if we return a null ptr */
925 			assert3u(*err, !=, KERN_SUCCESS);
926 		}
927 	}
928 
929 	return ptr;
930 }
931 
932 /**
933  * Initializes a new kcdata buffer somewhere in a linked kcdata list.
934  * Allocates a buffer for the kcdata from the shared stackshot buffer.
935  *
936  * See `linked_kcdata_alloc_callback` for the implementation details of
937  * linked kcdata for stackshot.
938  */
939 __result_use_check
940 static kern_return_t
941 linked_kcdata_init(
942 	linked_kcdata_descriptor_t descriptor,
943 	size_t min_size,
944 	unsigned int data_type,
945 	unsigned int flags)
946 {
947 	void              *buf_ptr;
948 	kern_return_t      error;
949 	size_t             buf_size = MAX(min_size, stackshot_ctx.sc_min_kcdata_size);
950 
951 	buf_ptr = stackshot_alloc_arr(uint8_t, buf_size, &error);
952 	if (error != KERN_SUCCESS) {
953 		return error;
954 	}
955 
956 	error = kcdata_memory_static_init(&descriptor->kcdata, (mach_vm_address_t) buf_ptr, data_type, buf_size, flags);
957 	if (error != KERN_SUCCESS) {
958 		return error;
959 	}
960 
961 	descriptor->kcdata.kcd_alloc_callback = linked_kcdata_alloc_callback;
962 
963 	return KERN_SUCCESS;
964 }
965 
966 static void
967 stackshot_kcdata_free_unused(kcdata_descriptor_t descriptor)
968 {
969 	/*
970 	 * If we have free space at the end of the kcdata, we can add it to the
971 	 * freelist. We always add to *our* cluster's freelist, no matter where
972 	 * the data was originally allocated.
973 	 *
974 	 * Important Note: We do not use kcdata_memory_get_used_bytes here because
975 	 * that includes extra space for the end tag (which we do not care about).
976 	 */
977 	int    buffer;
978 	size_t used_size = descriptor->kcd_addr_end - descriptor->kcd_addr_begin;
979 	size_t free_size = (descriptor->kcd_length - used_size);
980 	if (free_size > 0) {
981 #if defined(__arm64__)
982 		buffer = cpu_cluster_id();
983 #else /* __arm64__ */
984 		buffer = 0;
985 #endif /* !__arm64__ */
986 		stackshot_buffer_free((void*) descriptor->kcd_addr_end, &stackshot_ctx.sc_buffers[buffer], free_size);
987 		descriptor->kcd_length = used_size;
988 	}
989 }
990 
991 /**
992  * The callback for linked kcdata, which is called when one of the kcdata
993  * buffers runs out of space. This allocates a new kcdata descriptor &
994  * buffer in the linked list and sets it up.
995  *
996  * When kcdata calls this callback, it takes the returned descriptor
997  * and copies it to its own descriptor (which will be the per-cpu kcdata
998  * descriptor, in the case of stackshot).
999  *
1000  * --- Stackshot linked kcdata details ---
1001  * The way stackshot allocates kcdata buffers (in a non-panic context) is via
1002  * a basic bump allocator (see `stackshot_buffer_alloc`) and a linked list of
1003  * kcdata structures. The kcdata are allocated with a reasonable size based on
1004  * some system heuristics (or more if whatever is being pushed into the buffer
1005  * is larger). When the current kcdata buffer runs out of space, it calls this
1006  * callback, which allocates a new linked kcdata object at the tail of the
1007  * current list.
1008  *
1009  * The per-cpu `stackshot_kcdata_p` descriptor is the "tail" of the list, but
1010  * is not actually part of the linked list (this simplifies the implementation,
1011  * since it avoids changing every kcdata call and a good deal of kcdata code;
1012  * the current in-use descriptor is always in the same place this way).
1013  * When it is filled up and this callback is called, the
1014  * `stackshot_kcdata_p` descriptor is copied to the *actual* tail of the list
1015  * (in stackshot_cpu_ctx.scc_kcdata_tail), and a new linked kcdata struct is
1016  * allocated at the tail.
1017  */
1018 static kcdata_descriptor_t
1019 linked_kcdata_alloc_callback(kcdata_descriptor_t descriptor, size_t min_size)
1020 {
1021 	kern_return_t error;
1022 	linked_kcdata_descriptor_t new_kcdata = NULL;
1023 
1024 	/* This callback should ALWAYS be coming from our per-cpu kcdata. If not, something has gone horribly wrong.*/
1025 	stackshot_panic_guard();
1026 	assert(descriptor == stackshot_kcdata_p);
1027 
1028 	/* Free the unused space in the buffer and copy it to the tail of the linked kcdata list. */
1029 	stackshot_kcdata_free_unused(descriptor);
1030 	stackshot_cpu_ctx.scc_kcdata_tail->kcdata = *descriptor;
1031 
1032 	/* Allocate another linked_kcdata and initialize it. */
1033 	new_kcdata = stackshot_alloc(struct linked_kcdata_descriptor, &error);
1034 	if (error != KERN_SUCCESS) {
1035 		return NULL;
1036 	}
1037 
1038 	/* It doesn't matter what we mark the data type as - we're throwing it away when we weave the data together anyway. */
1039 	error = linked_kcdata_init(new_kcdata, min_size, KCDATA_BUFFER_BEGIN_STACKSHOT, descriptor->kcd_flags);
1040 	if (error != KERN_SUCCESS) {
1041 		return NULL;
1042 	}
1043 
1044 	bzero(descriptor, sizeof(struct kcdata_descriptor));
1045 	stackshot_cpu_ctx.scc_kcdata_tail->next = new_kcdata;
1046 	stackshot_cpu_ctx.scc_kcdata_tail = new_kcdata;
1047 
1048 	return &new_kcdata->kcdata;
1049 }
1050 
1051 /**
1052  * Allocates a new linked kcdata list for the current CPU and sets it up.
1053  * If there was a previous linked kcdata descriptor, you should call
1054  * `stackshot_finalize_linked_kcdata` first, or otherwise save it somewhere.
1055  */
1056 __result_use_check
1057 static kern_return_t
1058 stackshot_new_linked_kcdata(void)
1059 {
1060 	kern_return_t error;
1061 
1062 	stackshot_panic_guard();
1063 	assert(!stackshot_ctx.sc_panic_stackshot);
1064 
1065 	stackshot_cpu_ctx.scc_kcdata_head = stackshot_alloc(struct linked_kcdata_descriptor, &error);
1066 	if (error != KERN_SUCCESS) {
1067 		return error;
1068 	}
1069 
1070 	kcd_exit_on_error(linked_kcdata_init(stackshot_cpu_ctx.scc_kcdata_head, 0,
1071 	    KCDATA_BUFFER_BEGIN_STACKSHOT,
1072 	    KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER | KCFLAG_ALLOC_CALLBACK));
1073 
1074 	stackshot_cpu_ctx.scc_kcdata_tail = stackshot_cpu_ctx.scc_kcdata_head;
1075 	*stackshot_kcdata_p = stackshot_cpu_ctx.scc_kcdata_head->kcdata;
1076 
1077 error_exit:
1078 	return error;
1079 }
1080 
1081 /**
1082  * Finalizes the current linked kcdata structure for the CPU by updating the
1083  * tail of the list with the per-cpu kcdata descriptor.
1084  */
1085 static void
1086 stackshot_finalize_linked_kcdata(void)
1087 {
1088 	stackshot_panic_guard();
1089 	assert(!stackshot_ctx.sc_panic_stackshot);
1090 	stackshot_kcdata_free_unused(stackshot_kcdata_p);
1091 	if (stackshot_cpu_ctx.scc_kcdata_tail != NULL) {
1092 		stackshot_cpu_ctx.scc_kcdata_tail->kcdata = *stackshot_kcdata_p;
1093 	}
1094 	*stackshot_kcdata_p = (struct kcdata_descriptor){};
1095 }
1096 
1097 /*
1098  * Initialize the mutex governing access to the stack snapshot subsystem
1099  * and other stackshot related bits.
1100  */
1101 __private_extern__ void
1102 stackshot_init(void)
1103 {
1104 	mach_timebase_info_data_t timebase;
1105 
1106 	clock_timebase_info(&timebase);
1107 	stackshot_max_fault_time = ((KDP_FAULT_PATH_MAX_TIME_PER_STACKSHOT_NSECS * timebase.denom) / timebase.numer);
1108 
1109 	max_tracebuf_size = MAX(max_tracebuf_size, ((ROUNDUP(max_mem, GIGABYTES) / GIGABYTES) * TRACEBUF_SIZE_PER_GB));
1110 
1111 	PE_parse_boot_argn("stackshot_maxsz", &max_tracebuf_size, sizeof(max_tracebuf_size));
1112 }
1113 
1114 /*
1115  * Called with interrupts disabled after stackshot context has been
1116  * initialized.
1117  */
1118 static kern_return_t
1119 stackshot_trap(void)
1120 {
1121 	kern_return_t   rv;
1122 
1123 #if defined(__x86_64__)
1124 	/*
1125 	 * Since mp_rendezvous and stackshot both attempt to capture cpus then perform an
1126 	 * operation, it's essential to apply mutual exclusion to the other when one
1127 	 * mechanism is in operation, lest there be a deadlock as the mechanisms race to
1128 	 * capture CPUs.
1129 	 *
1130 	 * Further, we assert that invoking stackshot from mp_rendezvous*() is not
1131  * allowed, so we check to ensure there is no rendezvous in progress before
1132 	 * trying to grab the lock (if there is, a deadlock will occur when we try to
1133 	 * grab the lock).  This is accomplished by setting cpu_rendezvous_in_progress to
1134 	 * TRUE in the mp rendezvous action function.  If stackshot_trap() is called by
1135 	 * a subordinate of the call chain within the mp rendezvous action, this flag will
1136 	 * be set and can be used to detect the inevitable deadlock that would occur
1137 	 * if this thread tried to grab the rendezvous lock.
1138 	 */
1139 
1140 	if (current_cpu_datap()->cpu_rendezvous_in_progress == TRUE) {
1141 		panic("Calling stackshot from a rendezvous is not allowed!");
1142 	}
1143 
1144 	mp_rendezvous_lock();
1145 #endif
1146 
1147 	stackshot_stats.last_abs_start = mach_absolute_time();
1148 	stackshot_stats.last_abs_end = 0;
1149 
1150 	rv = DebuggerTrapWithState(DBOP_STACKSHOT, NULL, NULL, NULL, 0, NULL, FALSE, 0, NULL);
1151 
1152 	stackshot_stats.last_abs_end = mach_absolute_time();
1153 	stackshot_stats.stackshots_taken++;
1154 	stackshot_stats.stackshots_duration += (stackshot_stats.last_abs_end - stackshot_stats.last_abs_start);
1155 
1156 #if defined(__x86_64__)
1157 	mp_rendezvous_unlock();
1158 #endif
1159 	return rv;
1160 }
1161 
1162 extern void stackshot_get_timing(uint64_t *last_abs_start, uint64_t *last_abs_end, uint64_t *count, uint64_t *total_duration);
1163 void
1164 stackshot_get_timing(uint64_t *last_abs_start, uint64_t *last_abs_end, uint64_t *count, uint64_t *total_duration)
1165 {
1166 	STACKSHOT_SUBSYS_LOCK();
1167 	*last_abs_start = stackshot_stats.last_abs_start;
1168 	*last_abs_end = stackshot_stats.last_abs_end;
1169 	*count = stackshot_stats.stackshots_taken;
1170 	*total_duration = stackshot_stats.stackshots_duration;
1171 	STACKSHOT_SUBSYS_UNLOCK();
1172 }
1173 
1174 kern_return_t
1175 stack_snapshot_from_kernel(int pid, void *buf, uint32_t size, uint64_t flags, uint64_t delta_since_timestamp, uint32_t pagetable_mask, unsigned *bytes_traced)
1176 {
1177 	kern_return_t error = KERN_SUCCESS;
1178 	boolean_t istate;
1179 	struct kdp_snapshot_args args;
1180 
1181 	args = (struct kdp_snapshot_args) {
1182 		.pid =               pid,
1183 		.buffer =            buf,
1184 		.buffer_size =       size,
1185 		.flags =             flags,
1186 		.since_timestamp =   delta_since_timestamp,
1187 		.pagetable_mask =    pagetable_mask
1188 	};
1189 
1190 #if DEVELOPMENT || DEBUG
1191 	if (kern_feature_override(KF_STACKSHOT_OVRD) == TRUE) {
1192 		return KERN_NOT_SUPPORTED;
1193 	}
1194 #endif
1195 	if ((buf == NULL) || (size <= 0) || (bytes_traced == NULL)) {
1196 		return KERN_INVALID_ARGUMENT;
1197 	}
1198 
1199 	/* zero caller's buffer to match KMA_ZERO in other path */
1200 	bzero(buf, size);
1201 
1202 	/* cap an individual stackshot to max_tracebuf_size */
1203 	if (size > max_tracebuf_size) {
1204 		size = max_tracebuf_size;
1205 	}
1206 
1207 	/* Serialize tracing */
1208 	if (flags & STACKSHOT_TRYLOCK) {
1209 		if (!STACKSHOT_SUBSYS_TRY_LOCK()) {
1210 			return KERN_LOCK_OWNED;
1211 		}
1212 	} else {
1213 		STACKSHOT_SUBSYS_LOCK();
1214 	}
1215 
1216 #if CONFIG_EXCLAVES
1217 	assert(!stackshot_exclave_inspect_ctids);
1218 #endif
1219 
1220 	stackshot_initial_estimate = 0;
1221 	stackshot_duration_prior_abs = 0;
1222 	stackshot_duration_outer = NULL;
1223 
1224 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_KERN_RECORD) | DBG_FUNC_START,
1225 	    flags, size, pid, delta_since_timestamp);
1226 
1227 	/* Prepare the compressor for a stackshot */
1228 	error = vm_compressor_kdp_init();
1229 	if (error != KERN_SUCCESS) {
1230 		STACKSHOT_SUBSYS_UNLOCK();
		return error;
1231 	}
1232 
1233 #if STACKSHOT_COLLECTS_RDAR_126582377_DATA
1234 	// Opportunistically collect reports of the rdar://126582377 failure.
1235 	// If the allocation doesn't succeed, or if another CPU "steals" the
1236 	// allocated event first, that is acceptable.
1237 	ca_event_t new_event = CA_EVENT_ALLOCATE_FLAGS(bad_stackshot_upper16, Z_NOWAIT);
1238 	if (new_event) {
1239 		if (os_atomic_cmpxchg(&rdar_126582377_event, NULL, new_event, relaxed) == 0) {
1240 			// Already set up, so free it
1241 			CA_EVENT_DEALLOCATE(new_event);
1242 		}
1243 	}
1244 #endif
1245 
1246 	istate = ml_set_interrupts_enabled(FALSE);
1247 	uint64_t time_start      = mach_absolute_time();
1248 
1249 	/* Emit a SOCD tracepoint that we are initiating a stackshot */
1250 	SOCD_TRACE_XNU_START(STACKSHOT);
1251 
1252 	/* Preload trace parameters */
1253 	error = kdp_snapshot_preflight_internal(args);
1254 
1255 	/*
1256 	 * Trap to the debugger to obtain a coherent stack snapshot; this populates
1257 	 * the trace buffer
1258 	 */
1259 	if (error == KERN_SUCCESS) {
1260 		error = stackshot_trap();
1261 	}
1262 
1263 	uint64_t time_end = mach_absolute_time();
1264 
1265 	/* Emit a SOCD tracepoint that we have completed the stackshot */
1266 	SOCD_TRACE_XNU_END(STACKSHOT);
1267 
1268 	ml_set_interrupts_enabled(istate);
1269 
1270 #if CONFIG_EXCLAVES
1271 	/* stackshot trap should only finish successfully or with no pending Exclave threads */
1272 	assert(error == KERN_SUCCESS || stackshot_exclave_inspect_ctids == NULL);
1273 #endif
1274 
1275 	/*
1276 	 * Stackshot is no longer active.
1277 	 * (We have to do this here for the special interrupt disable timeout case to work)
1278 	 */
1279 	os_atomic_store(&stackshot_ctx.sc_state, SS_INACTIVE, release);
1280 
1281 	/* Release kdp compressor buffers */
1282 	vm_compressor_kdp_teardown();
1283 
1284 	/* Collect multithreaded kcdata into one finalized buffer */
1285 	if (error == KERN_SUCCESS && !stackshot_ctx.sc_is_singlethreaded) {
1286 		error = stackshot_collect_kcdata();
1287 	}
1288 
1289 #if CONFIG_EXCLAVES
1290 	if (error == KERN_SUCCESS && stackshot_exclave_inspect_ctids) {
1291 		error = collect_exclave_threads(flags);
1292 	}
1293 #endif /* CONFIG_EXCLAVES */
1294 
1295 	if (error == KERN_SUCCESS) {
1296 		if (!stackshot_ctx.sc_is_singlethreaded) {
1297 			error = stackshot_finalize_kcdata();
1298 		} else {
1299 			error = stackshot_finalize_singlethreaded_kcdata();
1300 		}
1301 	}
1302 
1303 	if (stackshot_duration_outer) {
1304 		*stackshot_duration_outer = time_end - time_start;
1305 	}
1306 	*bytes_traced = kdp_stack_snapshot_bytes_traced();
1307 
1308 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_KERN_RECORD) | DBG_FUNC_END,
1309 	    error, (time_end - time_start), size, *bytes_traced);
1310 
1311 	STACKSHOT_SUBSYS_UNLOCK();
1312 	return error;
1313 }
1314 
1315 #if CONFIG_TELEMETRY
1316 kern_return_t
1317 stack_microstackshot(user_addr_t tracebuf, uint32_t tracebuf_size, uint32_t flags, int32_t *retval)
1318 {
1319 	int error = KERN_FAILURE;
1320 	uint32_t bytes_traced = 0;
1321 
1322 	/*
1323 	 * "Flags" is actually treated as an enumeration, make sure only one value
1324 	 * is passed at a time.
1325 	 */
1326 	bool set_mark = flags & STACKSHOT_SET_MICROSTACKSHOT_MARK;
1327 	flags &= ~STACKSHOT_SET_MICROSTACKSHOT_MARK;
1328 	if (__builtin_popcount(flags) != 1) {
1329 		return KERN_INVALID_ARGUMENT;
1330 	}
1331 
1332 	/*
1333 	 * Ensure that there's space to copyout to.
1334 	 */
1335 	if (tracebuf == USER_ADDR_NULL || tracebuf_size == 0) {
1336 		return KERN_INVALID_ARGUMENT;
1337 	}
1338 
1339 	STACKSHOT_SUBSYS_LOCK();
1340 
1341 	switch (flags) {
1342 	case STACKSHOT_GET_KERNEL_MICROSTACKSHOT:
1343 		/*
1344 		 * Kernel samples consume from their buffer, so using a mark is the only
1345 		 * allowed option.
1346 		 */
1347 		if (!set_mark) {
1348 			error = KERN_INVALID_ARGUMENT;
1349 			break;
1350 		}
1351 		bytes_traced = tracebuf_size;
1352 		error = telemetry_kernel_gather(tracebuf, &bytes_traced);
1353 		*retval = (int)bytes_traced;
1354 		break;
1355 	case STACKSHOT_GET_MICROSTACKSHOT: {
1356 		if (tracebuf_size > max_tracebuf_size) {
1357 			error = KERN_INVALID_ARGUMENT;
1358 			break;
1359 		}
1360 
1361 		bytes_traced = tracebuf_size;
1362 		error = telemetry_gather(tracebuf, &bytes_traced, set_mark);
1363 		*retval = (int)bytes_traced;
1364 		break;
1365 	}
1366 	default:
1367 		error = KERN_NOT_SUPPORTED;
1368 		break;
1369 	}
1370 
1371 	STACKSHOT_SUBSYS_UNLOCK();
1372 	return error;
1373 }
1374 #endif /* CONFIG_TELEMETRY */
1375 
1376 /**
1377  * Grabs the next work item from the stackshot work queue.
1378  */
1379 static struct stackshot_workitem *
1380 stackshot_get_workitem(struct stackshot_workqueue *queue)
1381 {
1382 	uint32_t old_count, new_count;
1383 
1384 	/* note: this relies on give_up not performing the write, just bailing out immediately */
1385 	os_atomic_rmw_loop(&queue->sswq_cur_item, old_count, new_count, acq_rel, {
1386 		if (old_count >= os_atomic_load(&queue->sswq_num_items, relaxed)) {
1387 		        os_atomic_rmw_loop_give_up(return NULL);
1388 		}
1389 		new_count = old_count + 1;
1390 	});
1391 
1392 	return &queue->sswq_items[old_count];
1393 };
1394 
1395 /**
1396  * Puts an item on the appropriate stackshot work queue.
1397  * We don't need the lock for this, but only because it's
1398  * only called by one writer.
1399  *
1400  * @returns
1401  * KERN_SUCCESS if the item fit in the queue, KERN_INSUFFICIENT_BUFFER_SIZE if not.
1402  */
1403 static kern_return_t
1404 stackshot_put_workitem(struct stackshot_workitem item)
1405 {
1406 	struct stackshot_workqueue *queue;
1407 
1408 	/* Put in higher queue if task has more threads, with highest queue having >= STACKSHOT_HARDEST_THREADCOUNT threads */
1409 	size_t queue_idx = ((item.sswi_task->thread_count * (STACKSHOT_NUM_WORKQUEUES - 1)) / STACKSHOT_HARDEST_THREADCOUNT);
1410 	queue_idx = MIN(queue_idx, STACKSHOT_NUM_WORKQUEUES - 1);
1411 
1412 	queue = &stackshot_ctx.sc_workqueues[queue_idx];
1413 
1414 	size_t num_items = os_atomic_load(&queue->sswq_num_items, relaxed);
1415 
1416 	if (num_items >= queue->sswq_capacity) {
1417 		return KERN_INSUFFICIENT_BUFFER_SIZE;
1418 	}
1419 
1420 	queue->sswq_items[num_items] = item;
1421 	os_atomic_inc(&queue->sswq_num_items, release);
1422 
1423 	return KERN_SUCCESS;
1424 }
1425 
1426 #define calc_num_linked_kcdata_frames(size, kcdata_size) (1 + ((size) - 1) / (kcdata_size))
1427 #define calc_linked_kcdata_size(size, kcdata_size) (calc_num_linked_kcdata_frames((size), (kcdata_size)) * ((kcdata_size) + sizeof(struct linked_kcdata_descriptor)))
1428 
1429 #define TASK_UUID_AVG_SIZE (16 * sizeof(uuid_t)) /* Average space consumed by UUIDs/task */
1430 #define TASK_SHARED_CACHE_AVG_SIZE (128) /* Average space consumed by task shared cache info */
1431 #define sizeof_if_traceflag(a, flag) (((trace_flags & (flag)) != 0) ? sizeof(a) : 0)
1432 
1433 #define FUDGED_SIZE(size, adj) (((size) * ((adj) + 100)) / 100)
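/*
 * For example, FUDGED_SIZE(1000, 25) == (1000 * 125) / 100 == 1250, i.e. the
 * size padded by 25%. Note that sizeof_if_traceflag() expands in place and
 * reads the local trace_flags variable of the function it is used in.
 */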
1434 
1435 /*
1436  * Return the estimated size of a single task (including threads)
1437  * in a stackshot with the given flags.
1438  */
1439 static uint32_t
1440 get_stackshot_est_tasksize(uint64_t trace_flags)
1441 {
1442 	size_t total_size;
1443 	size_t threads_per_task = (((threads_count + terminated_threads_count) - 1) / (tasks_count + terminated_tasks_count)) + 1;
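	/* threads_per_task is effectively ceil(total threads / total tasks), counting terminated threads and tasks. */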
1444 	size_t est_thread_size = sizeof(struct thread_snapshot_v4) + 42 * sizeof(uintptr_t);
1445 	size_t est_task_size = sizeof(struct task_snapshot_v2) +
1446 	    TASK_UUID_AVG_SIZE +
1447 	    TASK_SHARED_CACHE_AVG_SIZE +
1448 	    sizeof_if_traceflag(struct io_stats_snapshot, STACKSHOT_INSTRS_CYCLES) +
1449 	    sizeof_if_traceflag(uint32_t, STACKSHOT_ASID) +
1450 	    sizeof_if_traceflag(sizeof(uintptr_t) * STACKSHOT_PAGETABLE_BUFSZ, STACKSHOT_PAGE_TABLES) +
1451 	    sizeof_if_traceflag(struct instrs_cycles_snapshot_v2, STACKSHOT_INSTRS_CYCLES) +
1452 	    sizeof(struct stackshot_cpu_architecture) +
1453 	    sizeof(struct stackshot_task_codesigning_info);
1454 
1455 #if STACKSHOT_COLLECTS_LATENCY_INFO
1456 	if (collect_latency_info) {
1457 		est_thread_size += sizeof(struct stackshot_latency_thread);
1458 		est_task_size += sizeof(struct stackshot_latency_task);
1459 	}
1460 #endif
1461 
1462 	total_size = est_task_size + threads_per_task * est_thread_size;
1463 
1464 	return total_size;
1465 }
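
/*
 * This per-task estimate feeds the overall buffer estimate in
 * get_stackshot_estsize() and is also recorded as sc_min_kcdata_size in
 * kdp_snapshot_preflight_internal().
 */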
1466 
1467 /*
1468  * Return the estimated size of a stackshot based on the
1469  * number of currently running threads and tasks.
1470  *
1471  * adj is an adjustment in units of percentage
1472  */
1473 static uint32_t
1474 get_stackshot_estsize(
1475 	uint32_t prev_size_hint,
1476 	uint32_t adj,
1477 	uint64_t trace_flags,
1478 	pid_t target_pid)
1479 {
1480 	vm_size_t thread_and_task_total;
1481 	uint64_t  size;
1482 	uint32_t  estimated_size;
1483 	bool      process_scoped = ((target_pid != -1) && ((trace_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0));
1484 
1485 	/*
1486 	 * We use the estimated task size (with a fudge factor) as the default
1487 	 * linked kcdata buffer size in an effort to reduce overhead (ideally, we want
1488 	 * each task to only need a single kcdata buffer.)
1489 	 */
1490 	uint32_t est_task_size = get_stackshot_est_tasksize(trace_flags);
1491 	uint32_t est_kcdata_size = FUDGED_SIZE(est_task_size, adj);
1492 	uint64_t est_preamble_size = calc_linked_kcdata_size(8192 * 4, est_kcdata_size);
1493 	uint64_t est_postamble_size = calc_linked_kcdata_size(8192 * 2, est_kcdata_size);
1494 	uint64_t est_extra_size = 0;
1495 
1496 	adj = MIN(adj, 100u);   /* no more than double our estimate */
1497 
1498 #if STACKSHOT_COLLECTS_LATENCY_INFO
1499 	est_extra_size += real_ncpus * sizeof(struct stackshot_latency_cpu);
1500 	est_extra_size += sizeof(struct stackshot_latency_collection_v2);
1501 #endif
1502 
1503 	est_extra_size += real_ncpus * MAX_FRAMES * sizeof(uintptr_t); /* Stacktrace buffers */
1504 	est_extra_size += FUDGED_SIZE(tasks_count, 10) * sizeof(uintptr_t) * STACKSHOT_NUM_WORKQUEUES; /* Work queues */
1505 	est_extra_size += sizeof_if_traceflag(sizeof(uintptr_t) * STACKSHOT_PAGETABLE_BUFSZ * real_ncpus, STACKSHOT_PAGE_TABLES);
1506 
1507 	thread_and_task_total = calc_linked_kcdata_size(est_task_size, est_kcdata_size);
1508 	if (!process_scoped) {
1509 		thread_and_task_total *= tasks_count;
1510 	}
1511 	size = thread_and_task_total + est_preamble_size + est_postamble_size + est_extra_size; /* estimate */
1512 	size = FUDGED_SIZE(size, adj); /* add adj */
1513 	size = MAX(size, prev_size_hint); /* allow hint to increase */
1514 	size += stackshot_plh_est_size(); /* add space for the port label hash */
1515 	size = MIN(size, VM_MAP_TRUNC_PAGE(UINT32_MAX, PAGE_MASK)); /* avoid overflow */
1516 	estimated_size = (uint32_t) VM_MAP_ROUND_PAGE(size, PAGE_MASK); /* round to pagesize */
1517 
1518 	return estimated_size;
1519 }
1520 
1521 /**
1522  * Copies a linked list of kcdata structures into a final kcdata structure.
1523  * Only used from stackshot_finalize_kcdata.
1524  */
1525 __result_use_check
1526 static kern_return_t
1527 stackshot_copy_linked_kcdata(kcdata_descriptor_t final_kcdata, linked_kcdata_descriptor_t linked_kcdata)
1528 {
1529 	kern_return_t error = KERN_SUCCESS;
1530 
1531 	while (linked_kcdata) {
1532 		/* Walk linked kcdata list */
1533 		kcdata_descriptor_t cur_kcdata = &linked_kcdata->kcdata;
1534 		if ((cur_kcdata->kcd_addr_end - cur_kcdata->kcd_addr_begin) == 0) {
1535 			linked_kcdata = linked_kcdata->next;
1536 			continue;
1537 		}
1538 
1539 		/* Every item in the linked kcdata should have a header tag of type KCDATA_BUFFER_BEGIN_STACKSHOT. */
1540 		assert(((struct kcdata_item*) cur_kcdata->kcd_addr_begin)->type == KCDATA_BUFFER_BEGIN_STACKSHOT);
1541 		assert((final_kcdata->kcd_addr_begin + final_kcdata->kcd_length) > final_kcdata->kcd_addr_end);
1542 		size_t header_size = sizeof(kcdata_item_t) + kcdata_calc_padding(sizeof(kcdata_item_t));
1543 		size_t size = cur_kcdata->kcd_addr_end - cur_kcdata->kcd_addr_begin - header_size;
1544 		size_t free = (final_kcdata->kcd_length + final_kcdata->kcd_addr_begin) - final_kcdata->kcd_addr_end;
1545 		if (free < size) {
1546 			error = KERN_INSUFFICIENT_BUFFER_SIZE;
1547 			goto error_exit;
1548 		}
1549 
1550 		/* Just memcpy the data over (and compress if we need to.) */
1551 		kcdata_compression_window_open(final_kcdata);
1552 		error = kcdata_memcpy(final_kcdata, final_kcdata->kcd_addr_end, (void*) (cur_kcdata->kcd_addr_begin + header_size), size);
1553 		if (error != KERN_SUCCESS) {
1554 			goto error_exit;
1555 		}
1556 		final_kcdata->kcd_addr_end += size;
1557 		kcdata_compression_window_close(final_kcdata);
1558 
1559 		linked_kcdata = linked_kcdata->next;
1560 	}
1561 
1562 error_exit:
1563 	return error;
1564 }
1565 
1566 /**
1567  * Copies the duration, latency, and diagnostic info into a final kcdata buffer.
1568  * Only used by stackshot_finalize_kcdata and stackshot_finalize_singlethreaded_kcdata.
1569  */
1570 __result_use_check
1571 static kern_return_t
1572 stackshot_push_duration_and_latency(kcdata_descriptor_t kcdata)
1573 {
1574 	kern_return_t error;
1575 	mach_vm_address_t out_addr;
1576 	bool use_fault_path = ((stackshot_flags & (STACKSHOT_ENABLE_UUID_FAULTING | STACKSHOT_ENABLE_BT_FAULTING)) != 0);
1577 #if STACKSHOT_COLLECTS_LATENCY_INFO
1578 	size_t            buffer_used = 0;
1579 	size_t            buffer_overhead = 0;
1580 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
1581 
1582 	if (use_fault_path) {
1583 		struct stackshot_fault_stats stats = (struct stackshot_fault_stats) {
1584 			.sfs_pages_faulted_in = 0,
1585 			.sfs_time_spent_faulting = 0,
1586 			.sfs_system_max_fault_time = stackshot_max_fault_time,
1587 			.sfs_stopped_faulting = false
1588 		};
1589 		percpu_foreach_base(base) {
1590 			struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
1591 			if (!cpu_ctx->scc_did_work) {
1592 				continue;
1593 			}
1594 			stats.sfs_pages_faulted_in += cpu_ctx->scc_fault_stats.sfs_pages_faulted_in;
1595 			stats.sfs_time_spent_faulting += cpu_ctx->scc_fault_stats.sfs_time_spent_faulting;
1596 			stats.sfs_stopped_faulting = stats.sfs_stopped_faulting || cpu_ctx->scc_fault_stats.sfs_stopped_faulting;
1597 		}
1598 		kcdata_push_data(kcdata, STACKSHOT_KCTYPE_STACKSHOT_FAULT_STATS,
1599 		    sizeof(struct stackshot_fault_stats), &stats);
1600 	}
1601 
1602 #if STACKSHOT_COLLECTS_LATENCY_INFO
1603 	int num_working_cpus = 0;
1604 	if (collect_latency_info) {
1605 		/* Add per-CPU latency info */
1606 		percpu_foreach(cpu_ctx, stackshot_cpu_ctx_percpu) {
1607 			if (cpu_ctx->scc_did_work) {
1608 				num_working_cpus++;
1609 			}
1610 		}
1611 		kcdata_compression_window_open(kcdata);
1612 		kcd_exit_on_error(kcdata_get_memory_addr_for_array(
1613 			    kcdata, STACKSHOT_KCTYPE_LATENCY_INFO_CPU, sizeof(struct stackshot_latency_cpu), num_working_cpus, &out_addr));
1614 		percpu_foreach_base(base) {
1615 			if (PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu)->scc_did_work) {
1616 				kcdata_memcpy(kcdata, out_addr, PERCPU_GET_WITH_BASE(base, stackshot_cpu_latency_percpu),
1617 				    sizeof(struct stackshot_latency_cpu));
1618 				out_addr += sizeof(struct stackshot_latency_cpu);
1619 			}
1620 		}
1621 		kcd_exit_on_error(kcdata_compression_window_close(kcdata));
1622 
1623 		/* Add up buffer info */
1624 		for (size_t buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) {
1625 			struct stackshot_buffer *buf = &stackshot_ctx.sc_buffers[buf_idx];
1626 			buffer_used += os_atomic_load(&buf->ssb_used, relaxed);
1627 			buffer_overhead += os_atomic_load(&buf->ssb_overhead, relaxed);
1628 		}
1629 		stackshot_ctx.sc_latency.buffer_size = stackshot_ctx.sc_args.buffer_size;
1630 		stackshot_ctx.sc_latency.buffer_overhead = buffer_overhead;
1631 		stackshot_ctx.sc_latency.buffer_used = buffer_used;
1632 		stackshot_ctx.sc_latency.buffer_count = stackshot_ctx.sc_num_buffers;
1633 
1634 		/* Add overall latency info */
1635 		kcd_exit_on_error(kcdata_push_data(
1636 			    kcdata, STACKSHOT_KCTYPE_LATENCY_INFO,
1637 			    sizeof(stackshot_ctx.sc_latency), &stackshot_ctx.sc_latency));
1638 	}
1639 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
1640 
1641 	if ((stackshot_flags & STACKSHOT_DO_COMPRESS) == 0) {
1642 		assert(!stackshot_ctx.sc_panic_stackshot);
1643 		kcd_exit_on_error(kcdata_get_memory_addr(kcdata, STACKSHOT_KCTYPE_STACKSHOT_DURATION,
1644 		    sizeof(struct stackshot_duration_v2), &out_addr));
1645 		struct stackshot_duration_v2 *duration_p = (void *) out_addr;
1646 		memcpy(duration_p, &stackshot_ctx.sc_duration, sizeof(*duration_p));
1647 		stackshot_duration_outer = (unaligned_u64 *) &duration_p->stackshot_duration_outer;
1648 		kcd_exit_on_error(kcdata_add_uint64_with_description(kcdata, stackshot_tries, "stackshot_tries"));
1649 	} else {
1650 		kcd_exit_on_error(kcdata_push_data(kcdata, STACKSHOT_KCTYPE_STACKSHOT_DURATION, sizeof(stackshot_ctx.sc_duration), &stackshot_ctx.sc_duration));
1651 		stackshot_duration_outer = NULL;
1652 	}
1653 
1654 error_exit:
1655 	return error;
1656 }
1657 
1658 /**
1659  * Allocates the final kcdata buffer for a multithreaded stackshot,
1660  * where all of the per-task kcdata (and exclave kcdata) will end up.
1661  */
1662 __result_use_check
1663 static kern_return_t
1664 stackshot_alloc_final_kcdata(void)
1665 {
1666 	vm_offset_t   final_kcdata_buffer = 0;
1667 	kern_return_t error = KERN_SUCCESS;
1668 	uint32_t hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
1669 	    : (stackshot_flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
1670 	    : KCDATA_BUFFER_BEGIN_STACKSHOT;
1671 
1672 	if (stackshot_ctx.sc_is_singlethreaded) {
1673 		return KERN_SUCCESS;
1674 	}
1675 
1676 	if ((error = kmem_alloc(kernel_map, &final_kcdata_buffer, stackshot_args.buffer_size,
1677 	    KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) {
1678 		os_log_error(OS_LOG_DEFAULT, "stackshot: final allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, stackshot_args.buffer_size, max_tracebuf_size, stackshot_tries);
1679 		return KERN_RESOURCE_SHORTAGE;
1680 	}
1681 
1682 	stackshot_ctx.sc_finalized_kcdata = kcdata_memory_alloc_init(final_kcdata_buffer, hdr_tag,
1683 	    stackshot_args.buffer_size, KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER);
1684 
1685 	if (stackshot_ctx.sc_finalized_kcdata == NULL) {
1686 		kmem_free(kernel_map, final_kcdata_buffer, stackshot_args.buffer_size);
1687 		return KERN_FAILURE;
1688 	}
1689 
1690 	return KERN_SUCCESS;
1691 }
1692 
1693 /**
1694  * Frees the final kcdata buffer.
1695  */
1696 static void
1697 stackshot_free_final_kcdata(void)
1698 {
1699 	if (stackshot_ctx.sc_is_singlethreaded || (stackshot_ctx.sc_finalized_kcdata == NULL)) {
1700 		return;
1701 	}
1702 
1703 	kmem_free(kernel_map, stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin, stackshot_args.buffer_size);
1704 	kcdata_memory_destroy(stackshot_ctx.sc_finalized_kcdata);
1705 	stackshot_ctx.sc_finalized_kcdata = NULL;
1706 }
1707 
1708 /**
1709  * Called once we exit the debugger trap to collate all of the separate linked
1710  * kcdata lists into one kcdata buffer. The calling thread will run this, and
1711  * it is guaranteed that nobody else is touching any stackshot state at this
1712  * point. In the case of a panic stackshot, this is never called since we only
1713  * use one thread.
1714  *
1715  * Called with interrupts enabled, stackshot subsys lock held.
1716  */
1717 __result_use_check
1718 static kern_return_t
1719 stackshot_collect_kcdata(void)
1720 {
1721 	kern_return_t error = 0;
1722 	uint32_t      hdr_tag;
1723 
1724 	assert(!stackshot_ctx.sc_panic_stackshot && !stackshot_ctx.sc_is_singlethreaded);
1725 	LCK_MTX_ASSERT(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1726 
1727 	/* Allocate our final kcdata buffer. */
1728 	kcd_exit_on_error(stackshot_alloc_final_kcdata());
1729 	assert(stackshot_ctx.sc_finalized_kcdata != NULL);
1730 
1731 	/* Setup compression if we need it. */
1732 	if (stackshot_flags & STACKSHOT_DO_COMPRESS) {
1733 		hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
1734 		    : KCDATA_BUFFER_BEGIN_STACKSHOT;
1735 		kcd_exit_on_error(kcdata_init_compress(stackshot_ctx.sc_finalized_kcdata, hdr_tag, kdp_memcpy, KCDCT_ZLIB));
1736 	}
1737 
1738 	/* Copy over all of the pre task-iteration kcdata (to preserve order as if it were single-threaded) */
1739 	kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, stackshot_ctx.sc_pretask_kcdata));
1740 
1741 	/* Set each queue's cur_item to 0. */
1742 	for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
1743 		os_atomic_store(&stackshot_ctx.sc_workqueues[i].sswq_cur_item, 0, relaxed);
1744 	}
1745 
1746 	/*
1747 	 * Iterate over work queue(s) and copy the kcdata in.
1748 	 */
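	/*
	 * This is effectively an N-way merge across the work queues: each pass
	 * picks the pending item with the smallest sswi_idx, so items are copied
	 * into the final kcdata in ascending sswi_idx order regardless of which
	 * queue they landed in.
	 */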
1749 	while (true) {
1750 		struct stackshot_workitem  *next_item = NULL;
1751 		struct stackshot_workqueue *next_queue = NULL;
1752 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
1753 			struct stackshot_workqueue *queue = &stackshot_ctx.sc_workqueues[i];
1754 			size_t cur_item = os_atomic_load(&queue->sswq_cur_item, relaxed);
1755 
1756 			/* Check if we're done with this queue */
1757 			if (cur_item >= os_atomic_load(&queue->sswq_num_items, relaxed)) {
1758 				continue;
1759 			}
1760 
1761 			/* Check if this workitem should come next */
1762 			struct stackshot_workitem *item = &queue->sswq_items[cur_item];
1763 			if ((next_item == NULL) || (next_item->sswi_idx > item->sswi_idx)) {
1764 				next_item = item;
1765 				next_queue = queue;
1766 			}
1767 		}
1768 
1769 		/* Queues are empty. */
1770 		if (next_item == NULL) {
1771 			break;
1772 		}
1773 
1774 		assert(next_queue);
1775 		assert(next_item->sswi_data != NULL);
1776 
1777 		os_atomic_inc(&next_queue->sswq_cur_item, relaxed);
1778 		kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, next_item->sswi_data));
1779 	}
1780 
1781 	/* Write post-task kcdata */
1782 	kcd_exit_on_error(stackshot_copy_linked_kcdata(stackshot_ctx.sc_finalized_kcdata, stackshot_ctx.sc_posttask_kcdata));
1783 error_exit:
1784 	if (error != KERN_SUCCESS) {
1785 		stackshot_free_final_kcdata();
1786 	}
1787 	return error;
1788 }
1789 
1790 
1791 /**
1792  * Called at the very end of stackshot data generation, to write final timing
1793  * data to the kcdata structure and close compression. Only called for
1794  * multi-threaded stackshots; see stackshot_finalize_singlethreaded_kcata for
1795  * single-threaded variant.
1796  *
1797  * Called with interrupts enabled, stackshot subsys lock held.
1798  */
1799 __result_use_check
1800 static kern_return_t
1801 stackshot_finalize_kcdata(void)
1802 {
1803 	kern_return_t error = 0;
1804 
1805 	assert(!stackshot_ctx.sc_panic_stackshot && !stackshot_ctx.sc_is_singlethreaded);
1806 	LCK_MTX_ASSERT(&stackshot_subsys_mutex, LCK_MTX_ASSERT_OWNED);
1807 
1808 	assert(stackshot_ctx.sc_finalized_kcdata != NULL);
1809 
1810 	/* Write stackshot timing info */
1811 	kcd_exit_on_error(stackshot_push_duration_and_latency(stackshot_ctx.sc_finalized_kcdata));
1812 
1813 	/* Note: at most one call that pushes more data may be made after kcd_finalize_compression */
1814 	kcd_finalize_compression(stackshot_ctx.sc_finalized_kcdata);
1815 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_ctx.sc_finalized_kcdata, stackshot_flags, "stackshot_out_flags"));
1816 	kcd_exit_on_error(kcdata_write_buffer_end(stackshot_ctx.sc_finalized_kcdata));
1817 
1818 	stackshot_ctx.sc_bytes_traced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_ctx.sc_finalized_kcdata);
1819 	stackshot_ctx.sc_bytes_uncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_ctx.sc_finalized_kcdata);
1820 
1821 	if (os_atomic_load(&stackshot_ctx.sc_retval, relaxed) == KERN_SUCCESS) {
1822 		/* releases and zeros done */
1823 		kcd_exit_on_error(kcdata_finish(stackshot_ctx.sc_finalized_kcdata));
1824 	}
1825 
1826 	memcpy(stackshot_args.buffer, (void*) stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin, stackshot_args.buffer_size);
1827 
1828 	/* Fix duration_outer offset */
1829 	if (stackshot_duration_outer != NULL) {
1830 		stackshot_duration_outer = (unaligned_u64*) ((mach_vm_address_t) stackshot_args.buffer + ((mach_vm_address_t) stackshot_duration_outer - stackshot_ctx.sc_finalized_kcdata->kcd_addr_begin));
1831 	}
1832 
1833 error_exit:
1834 	stackshot_free_final_kcdata();
1835 	return error;
1836 }
1837 
1838 /**
1839  * Finalizes the kcdata for a singlethreaded stackshot.
1840  *
1841  * May be called from interrupt/panic context.
1842  */
1843 __result_use_check
1844 static kern_return_t
1845 stackshot_finalize_singlethreaded_kcdata(void)
1846 {
1847 	kern_return_t error;
1848 
1849 	assert(stackshot_ctx.sc_is_singlethreaded);
1850 
1851 	kcd_exit_on_error(stackshot_push_duration_and_latency(stackshot_ctx.sc_finalized_kcdata));
1852 	/* Note: at most one call that pushes more data may be made after kcd_finalize_compression */
1853 	kcd_finalize_compression(stackshot_ctx.sc_finalized_kcdata);
1854 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_ctx.sc_finalized_kcdata, stackshot_flags, "stackshot_out_flags"));
1855 	kcd_exit_on_error(kcdata_write_buffer_end(stackshot_ctx.sc_finalized_kcdata));
1856 
1857 	stackshot_ctx.sc_bytes_traced = (uint32_t) kcdata_memory_get_used_bytes(stackshot_ctx.sc_finalized_kcdata);
1858 	stackshot_ctx.sc_bytes_uncompressed = (uint32_t) kcdata_memory_get_uncompressed_bytes(stackshot_ctx.sc_finalized_kcdata);
1859 
1860 	kcd_exit_on_error(kcdata_finish(stackshot_ctx.sc_finalized_kcdata));
1861 
1862 	if (stackshot_ctx.sc_panic_stackshot) {
1863 		*stackshot_args.descriptor = *stackshot_ctx.sc_finalized_kcdata;
1864 	}
1865 
1866 error_exit:
1867 	return error;
1868 }
1869 
1870 /*
1871  * stackshot_remap_buffer:	Utility function to remap bytes_traced bytes starting at stackshotbuf
1872  *				into the current task's user space and subsequently copy out the address
1873  *				at which the buffer has been mapped in user space to out_buffer_addr.
1874  *
1875  * Inputs:			stackshotbuf - pointer to the original buffer in the kernel's address space
1876  *				bytes_traced - length of the buffer to remap starting from stackshotbuf
1877  *				out_buffer_addr - user address to which the address of the newly mapped buffer will be copied out
1878  *				out_size_addr - user address to which the size of the mapped buffer will be copied out
1879  *
1880  * Outputs:			ENOSPC if there is not enough free space in the task's address space to remap the buffer
1881  *				EINVAL for all other errors returned by task_remap_buffer/mach_vm_remap
1882  *				an error from copyout
1883  */
1884 static kern_return_t
1885 stackshot_remap_buffer(void *stackshotbuf, uint32_t bytes_traced, uint64_t out_buffer_addr, uint64_t out_size_addr)
1886 {
1887 	int                     error = 0;
1888 	mach_vm_offset_t        stackshotbuf_user_addr = (mach_vm_offset_t)NULL;
1889 	vm_prot_t               cur_prot = VM_PROT_NONE, max_prot = VM_PROT_NONE;
1890 
1891 	error = mach_vm_remap(current_map(), &stackshotbuf_user_addr, bytes_traced, 0,
1892 	    VM_FLAGS_ANYWHERE, kernel_map, (mach_vm_offset_t)stackshotbuf, FALSE,
1893 	    &cur_prot, &max_prot, VM_INHERIT_DEFAULT);
1894 	/*
1895 	 * If the call to mach_vm_remap fails, we return the appropriate converted error
1896 	 */
1897 	if (error == KERN_SUCCESS) {
1898 		/* If the user addr somehow didn't get set, we should make sure that we fail, and (eventually)
1899 		 * panic on development kernels to find out why
1900 		 */
1901 		if (stackshotbuf_user_addr == (mach_vm_offset_t)NULL) {
1902 #if DEVELOPMENT || DEBUG
1903 			os_log_error(OS_LOG_DEFAULT, "stackshot: mach_vm_remap succeeded with NULL\n");
1904 #endif // DEVELOPMENT || DEBUG
1905 			return KERN_FAILURE;
1906 		}
1907 
1908 		/*
1909 		 * If we fail to copy out the address or size of the new buffer, we remove the buffer mapping that
1910 		 * we just made in the task's user space.
1911 		 */
1912 		error = copyout(CAST_DOWN(void *, &stackshotbuf_user_addr), (user_addr_t)out_buffer_addr, sizeof(stackshotbuf_user_addr));
1913 		if (error != KERN_SUCCESS) {
1914 			mach_vm_deallocate(get_task_map(current_task()), stackshotbuf_user_addr, (mach_vm_size_t)bytes_traced);
1915 			return error;
1916 		}
1917 		error = copyout(&bytes_traced, (user_addr_t)out_size_addr, sizeof(bytes_traced));
1918 		if (error != KERN_SUCCESS) {
1919 			mach_vm_deallocate(get_task_map(current_task()), stackshotbuf_user_addr, (mach_vm_size_t)bytes_traced);
1920 			return error;
1921 		}
1922 	}
1923 	return error;
1924 }
1925 
1926 #if CONFIG_EXCLAVES
1927 
1928 static kern_return_t
1929 stackshot_setup_exclave_waitlist(void)
1930 {
1931 	kern_return_t error = KERN_SUCCESS;
1932 	size_t exclave_threads_max = exclaves_ipc_buffer_count();
1933 	size_t waitlist_size = 0;
1934 
1935 	assert(!stackshot_exclave_inspect_ctids);
1936 
1937 	if (exclaves_inspection_is_initialized() && exclave_threads_max) {
1938 		if (os_mul_overflow(exclave_threads_max, sizeof(ctid_t), &waitlist_size)) {
1939 			error = KERN_INVALID_ARGUMENT;
1940 			goto error;
1941 		}
1942 		stackshot_exclave_inspect_ctids = stackshot_alloc_with_size(waitlist_size, &error);
1943 		if (!stackshot_exclave_inspect_ctids) {
1944 			goto error;
1945 		}
1946 		stackshot_exclave_inspect_ctid_count = 0;
1947 		stackshot_exclave_inspect_ctid_capacity = exclave_threads_max;
1948 	}
1949 
1950 error:
1951 	return error;
1952 }
1953 
1954 static kern_return_t
1955 collect_exclave_threads(uint64_t ss_flags)
1956 {
1957 	size_t i;
1958 	ctid_t ctid;
1959 	thread_t thread;
1960 	kern_return_t kr = KERN_SUCCESS;
1961 	STACKSHOT_SUBSYS_ASSERT_LOCKED();
1962 
1963 	lck_mtx_lock(&exclaves_collect_mtx);
1964 
1965 	if (stackshot_exclave_inspect_ctid_count == 0) {
1966 		/* Nothing to do */
1967 		goto out;
1968 	}
1969 
1970 	// When asking for ASIDs, make sure we get all exclave ASIDs and mappings as well
1971 	exclaves_stackshot_raw_addresses = (ss_flags & STACKSHOT_ASID);
1972 	exclaves_stackshot_all_address_spaces = (ss_flags & (STACKSHOT_ASID | STACKSHOT_EXCLAVES));
1973 
1974 	/* This error is intentionally ignored: we are now committed to collecting
1975 	 * these threads, or at least properly waking them. If this fails, the first
1976 	 * collected thread should also fail to append to the kcdata, and will abort
1977 	 * further collection, properly clearing the AST and waking these threads.
1978 	 */
1979 	kcdata_add_container_marker(stackshot_ctx.sc_finalized_kcdata, KCDATA_TYPE_CONTAINER_BEGIN,
1980 	    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
1981 
1982 	for (i = 0; i < stackshot_exclave_inspect_ctid_count; ++i) {
1983 		ctid = stackshot_exclave_inspect_ctids[i];
1984 		thread = ctid_get_thread(ctid);
1985 		assert(thread);
1986 		exclaves_inspection_queue_add(&exclaves_inspection_queue_stackshot, &thread->th_exclaves_inspection_queue_stackshot);
1987 	}
1988 	exclaves_inspection_begin_collecting();
1989 	exclaves_inspection_wait_complete(&exclaves_inspection_queue_stackshot);
1990 	kr = stackshot_exclave_kr; /* Read the result of work done on our behalf, by collection thread */
1991 	if (kr != KERN_SUCCESS) {
1992 		goto out;
1993 	}
1994 
1995 	kr = kcdata_add_container_marker(stackshot_ctx.sc_finalized_kcdata, KCDATA_TYPE_CONTAINER_END,
1996 	    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
1997 	if (kr != KERN_SUCCESS) {
1998 		goto out;
1999 	}
2000 out:
2001 	/* clear Exclave buffer now that it's been used */
2002 	stackshot_exclave_inspect_ctids = NULL;
2003 	stackshot_exclave_inspect_ctid_capacity = 0;
2004 	stackshot_exclave_inspect_ctid_count = 0;
2005 
2006 	lck_mtx_unlock(&exclaves_collect_mtx);
2007 	return kr;
2008 }
2009 
2010 static kern_return_t
2011 stackshot_exclaves_process_stacktrace(const address_v__opt_s *_Nonnull st, void *kcdata_ptr)
2012 {
2013 	kern_return_t error = KERN_SUCCESS;
2014 	exclave_ecstackentry_addr_t * addr = NULL;
2015 	__block size_t count = 0;
2016 
2017 	if (!st->has_value) {
2018 		goto error_exit;
2019 	}
2020 
2021 	address__v_visit(&st->value, ^(size_t __unused i, const stackshottypes_address_s __unused item) {
2022 		count++;
2023 	});
2024 
2025 	kcdata_compression_window_open(kcdata_ptr);
2026 	kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_IPCSTACKENTRY_ECSTACK,
2027 	    sizeof(exclave_ecstackentry_addr_t), count, (mach_vm_address_t*)&addr));
2028 
2029 	address__v_visit(&st->value, ^(size_t i, const stackshottypes_address_s item) {
2030 		addr[i] = (exclave_ecstackentry_addr_t)item;
2031 	});
2032 
2033 	kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2034 
2035 error_exit:
2036 	return error;
2037 }
2038 
2039 static kern_return_t
2040 stackshot_exclaves_process_ipcstackentry(uint64_t index, const stackshottypes_ipcstackentry_s *_Nonnull ise, void *kcdata_ptr)
2041 {
2042 	kern_return_t error = KERN_SUCCESS;
2043 
2044 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2045 	    STACKSHOT_KCCONTAINER_EXCLAVE_IPCSTACKENTRY, index));
2046 
2047 	struct exclave_ipcstackentry_info info = { 0 };
2048 	info.eise_asid = ise->asid;
2049 
2050 	info.eise_tnid = ise->tnid;
2051 
2052 	if (ise->invocationid.has_value) {
2053 		info.eise_flags |= kExclaveIpcStackEntryHaveInvocationID;
2054 		info.eise_invocationid = ise->invocationid.value;
2055 	} else {
2056 		info.eise_invocationid = 0;
2057 	}
2058 
2059 	info.eise_flags |= (ise->stacktrace.has_value ? kExclaveIpcStackEntryHaveStack : 0);
2060 
2061 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_IPCSTACKENTRY_INFO, sizeof(struct exclave_ipcstackentry_info), &info));
2062 
2063 	if (ise->stacktrace.has_value) {
2064 		kcd_exit_on_error(stackshot_exclaves_process_stacktrace(&ise->stacktrace, kcdata_ptr));
2065 	}
2066 
2067 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2068 	    STACKSHOT_KCCONTAINER_EXCLAVE_IPCSTACKENTRY, index));
2069 
2070 error_exit:
2071 	return error;
2072 }
2073 
2074 static kern_return_t
2075 stackshot_exclaves_process_ipcstack(const stackshottypes_ipcstackentry_v__opt_s *_Nonnull ipcstack, void *kcdata_ptr)
2076 {
2077 	__block kern_return_t kr = KERN_SUCCESS;
2078 
2079 	if (!ipcstack->has_value) {
2080 		goto error_exit;
2081 	}
2082 
2083 	stackshottypes_ipcstackentry__v_visit(&ipcstack->value, ^(size_t i, const stackshottypes_ipcstackentry_s *_Nonnull item) {
2084 		if (kr == KERN_SUCCESS) {
2085 		        kr = stackshot_exclaves_process_ipcstackentry(i, item, kcdata_ptr);
2086 		}
2087 	});
2088 
2089 error_exit:
2090 	return kr;
2091 }
2092 
2093 static kern_return_t
2094 stackshot_exclaves_process_stackshotentry(const stackshot_stackshotentry_s *_Nonnull se, void *kcdata_ptr)
2095 {
2096 	kern_return_t error = KERN_SUCCESS;
2097 
2098 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2099 	    STACKSHOT_KCCONTAINER_EXCLAVE_SCRESULT, se->scid));
2100 
2101 	struct exclave_scresult_info info = { 0 };
2102 	info.esc_id = se->scid;
2103 	info.esc_flags = se->ipcstack.has_value ? kExclaveScresultHaveIPCStack : 0;
2104 
2105 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_SCRESULT_INFO, sizeof(struct exclave_scresult_info), &info));
2106 
2107 	if (se->ipcstack.has_value) {
2108 		kcd_exit_on_error(stackshot_exclaves_process_ipcstack(&se->ipcstack, kcdata_ptr));
2109 	}
2110 
2111 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2112 	    STACKSHOT_KCCONTAINER_EXCLAVE_SCRESULT, se->scid));
2113 
2114 error_exit:
2115 	return error;
2116 }
2117 
2118 static kern_return_t
2119 stackshot_exclaves_process_textlayout_segments(const stackshottypes_textlayout_s *_Nonnull tl, void *kcdata_ptr, bool want_raw_addresses)
2120 {
2121 	kern_return_t error = KERN_SUCCESS;
2122 	__block struct exclave_textlayout_segment_v2 * info = NULL;
2123 
2124 	__block size_t count = 0;
2125 	stackshottypes_textsegment__v_visit(&tl->textsegments, ^(size_t __unused i, const stackshottypes_textsegment_s __unused *_Nonnull item) {
2126 		count++;
2127 	});
2128 
2129 	if (!count) {
2130 		goto error_exit;
2131 	}
2132 
2133 	kcdata_compression_window_open(kcdata_ptr);
2134 	kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_SEGMENTS,
2135 	    sizeof(struct exclave_textlayout_segment_v2), count, (mach_vm_address_t*)&info));
2136 
2137 	stackshottypes_textsegment__v_visit(&tl->textsegments, ^(size_t __unused i, const stackshottypes_textsegment_s *_Nonnull item) {
2138 		memcpy(&info->layoutSegment_uuid, item->uuid, sizeof(uuid_t));
2139 		info->layoutSegment_loadAddress = item->loadaddress;
2140 		if (want_raw_addresses) {
2141 		        info->layoutSegment_rawLoadAddress = item->rawloadaddress.has_value ? item->rawloadaddress.value: 0;
2142 		} else {
2143 		        info->layoutSegment_rawLoadAddress = 0;
2144 		}
2145 		info++;
2146 	});
2147 
2148 	kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2149 
2150 error_exit:
2151 	return error;
2152 }
2153 
2154 static kern_return_t
2155 stackshot_exclaves_process_textlayout(const stackshottypes_textlayout_s *_Nonnull tl, void *kcdata_ptr, bool want_raw_addresses)
2156 {
2157 	kern_return_t error = KERN_SUCCESS;
2158 	__block struct exclave_textlayout_info info = { 0 };
2159 
2160 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2161 	    STACKSHOT_KCCONTAINER_EXCLAVE_TEXTLAYOUT, tl->textlayoutid));
2162 
2163 	// tightbeam optional interfaces don't have enough const.
2164 	u32__opt_s sharedcacheindex_opt = tl->sharedcacheindex;
2165 	const uint32_t *sharedcache_index = u32__opt_get(&sharedcacheindex_opt);
2166 
2167 	info.layout_id = tl->textlayoutid;
2168 
2169 	info.etl_flags =
2170 	    (want_raw_addresses ? 0 : kExclaveTextLayoutLoadAddressesUnslid) |
2171 	    (sharedcache_index == NULL ? 0 : kExclaveTextLayoutHasSharedCache);
2172 	info.sharedcache_index = (sharedcache_index == NULL) ? UINT32_MAX : *sharedcache_index;
2173 
2174 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_TEXTLAYOUT_INFO, sizeof(struct exclave_textlayout_info), &info));
2175 	kcd_exit_on_error(stackshot_exclaves_process_textlayout_segments(tl, kcdata_ptr, want_raw_addresses));
2176 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2177 	    STACKSHOT_KCCONTAINER_EXCLAVE_TEXTLAYOUT, tl->textlayoutid));
2178 error_exit:
2179 	return error;
2180 }
2181 
2182 static kern_return_t
2183 stackshot_exclaves_process_addressspace(const stackshottypes_addressspace_s *_Nonnull as, void *kcdata_ptr, bool want_raw_addresses)
2184 {
2185 	kern_return_t error = KERN_SUCCESS;
2186 	struct exclave_addressspace_info info = { 0 };
2187 	__block size_t name_len = 0;
2188 	uint8_t * name = NULL;
2189 
2190 	u8__v_visit(&as->name, ^(size_t __unused i, const uint8_t __unused item) {
2191 		name_len++;
2192 	});
2193 
2194 	info.eas_id = as->asid;
2195 
2196 	if (want_raw_addresses && as->rawaddressslide.has_value) {
2197 		info.eas_flags = kExclaveAddressSpaceHaveSlide;
2198 		info.eas_slide = as->rawaddressslide.value;
2199 	} else {
2200 		info.eas_flags = 0;
2201 		info.eas_slide = UINT64_MAX;
2202 	}
2203 
2204 	info.eas_layoutid = as->textlayoutid; // text layout for this address space
2205 	info.eas_asroot = as->asroot.has_value ? as->asroot.value : 0;
2206 
2207 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_BEGIN,
2208 	    STACKSHOT_KCCONTAINER_EXCLAVE_ADDRESSSPACE, as->asid));
2209 	kcd_exit_on_error(kcdata_push_data(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_ADDRESSSPACE_INFO, sizeof(struct exclave_addressspace_info), &info));
2210 
2211 	if (name_len > 0) {
2212 		kcdata_compression_window_open(kcdata_ptr);
2213 		kcd_exit_on_error(kcdata_get_memory_addr(kcdata_ptr, STACKSHOT_KCTYPE_EXCLAVE_ADDRESSSPACE_NAME, name_len + 1, (mach_vm_address_t*)&name));
2214 
2215 		u8__v_visit(&as->name, ^(size_t i, const uint8_t item) {
2216 			name[i] = item;
2217 		});
2218 		name[name_len] = 0;
2219 
2220 		kcd_exit_on_error(kcdata_compression_window_close(kcdata_ptr));
2221 	}
2222 
2223 	kcd_exit_on_error(kcdata_add_container_marker(kcdata_ptr, KCDATA_TYPE_CONTAINER_END,
2224 	    STACKSHOT_KCCONTAINER_EXCLAVE_ADDRESSSPACE, as->asid));
2225 error_exit:
2226 	return error;
2227 }
2228 
2229 kern_return_t
2230 stackshot_exclaves_process_stackshot(const stackshot_stackshotresult_s *result, void *kcdata_ptr, bool want_raw_addresses);
2231 
2232 kern_return_t
2233 stackshot_exclaves_process_stackshot(const stackshot_stackshotresult_s *result, void *kcdata_ptr, bool want_raw_addresses)
2234 {
2235 	__block kern_return_t kr = KERN_SUCCESS;
2236 
2237 	stackshot_stackshotentry__v_visit(&result->stackshotentries, ^(size_t __unused i, const stackshot_stackshotentry_s *_Nonnull item) {
2238 		if (kr == KERN_SUCCESS) {
2239 		        kr = stackshot_exclaves_process_stackshotentry(item, kcdata_ptr);
2240 		}
2241 	});
2242 
2243 	stackshottypes_addressspace__v_visit(&result->addressspaces, ^(size_t __unused i, const stackshottypes_addressspace_s *_Nonnull item) {
2244 		if (kr == KERN_SUCCESS) {
2245 		        kr = stackshot_exclaves_process_addressspace(item, kcdata_ptr, want_raw_addresses);
2246 		}
2247 	});
2248 
2249 	stackshottypes_textlayout__v_visit(&result->textlayouts, ^(size_t __unused i, const stackshottypes_textlayout_s *_Nonnull item) {
2250 		if (kr == KERN_SUCCESS) {
2251 		        kr = stackshot_exclaves_process_textlayout(item, kcdata_ptr, want_raw_addresses);
2252 		}
2253 	});
2254 
2255 	return kr;
2256 }
2257 
2258 kern_return_t
2259 stackshot_exclaves_process_result(kern_return_t collect_kr, const stackshot_stackshotresult_s *result, bool want_raw_addresses);
2260 
2261 kern_return_t
2262 stackshot_exclaves_process_result(kern_return_t collect_kr, const stackshot_stackshotresult_s *result, bool want_raw_addresses)
2263 {
2264 	kern_return_t kr = KERN_SUCCESS;
2265 	if (result == NULL) {
2266 		return collect_kr;
2267 	}
2268 
2269 	kr = stackshot_exclaves_process_stackshot(result, stackshot_ctx.sc_finalized_kcdata, want_raw_addresses);
2270 
2271 	stackshot_exclave_kr = kr;
2272 
2273 	return kr;
2274 }
2275 
2276 
2277 static void
2278 commit_exclaves_ast(void)
2279 {
2280 	size_t i = 0;
2281 	thread_t thread = NULL;
2282 	size_t count;
2283 
2284 	assert(debug_mode_active());
2285 
2286 	count = os_atomic_load(&stackshot_exclave_inspect_ctid_count, acquire);
2287 
2288 	if (stackshot_exclave_inspect_ctids) {
2289 		for (i = 0; i < count; ++i) {
2290 			thread = ctid_get_thread(stackshot_exclave_inspect_ctids[i]);
2291 			assert(thread);
2292 			thread_reference(thread);
2293 			os_atomic_or(&thread->th_exclaves_inspection_state, TH_EXCLAVES_INSPECTION_STACKSHOT, relaxed);
2294 		}
2295 	}
2296 }
2297 
2298 #endif /* CONFIG_EXCLAVES */
2299 
2300 kern_return_t
2301 kern_stack_snapshot_internal(int stackshot_config_version, void *stackshot_config, size_t stackshot_config_size, boolean_t stackshot_from_user)
2302 {
2303 	int error = 0;
2304 	boolean_t prev_interrupt_state;
2305 	bool did_copyout = false;
2306 	uint32_t bytes_traced = 0;
2307 	uint32_t stackshot_estimate = 0;
2308 	struct kdp_snapshot_args snapshot_args;
2309 
2310 	void * buf_to_free = NULL;
2311 	int size_to_free = 0;
2312 	bool is_traced = false;    /* has FUNC_START tracepoint fired? */
2313 	uint64_t tot_interrupts_off_abs = 0; /* sum(time with interrupts off) */
2314 
2315 	/* Parsed arguments */
2316 	uint64_t                out_buffer_addr;
2317 	uint64_t                out_size_addr;
2318 	uint32_t                size_hint = 0;
2319 
2320 	snapshot_args.pagetable_mask = STACKSHOT_PAGETABLES_MASK_ALL;
2321 
2322 	if (stackshot_config == NULL) {
2323 		return KERN_INVALID_ARGUMENT;
2324 	}
2325 #if DEVELOPMENT || DEBUG
2326 	/* TBD: ask stackshot clients to avoid issuing stackshots in this
2327 	 * configuration in lieu of the kernel feature override.
2328 	 */
2329 	if (kern_feature_override(KF_STACKSHOT_OVRD) == TRUE) {
2330 		return KERN_NOT_SUPPORTED;
2331 	}
2332 #endif
2333 
2334 	switch (stackshot_config_version) {
2335 	case STACKSHOT_CONFIG_TYPE:
2336 		if (stackshot_config_size != sizeof(stackshot_config_t)) {
2337 			return KERN_INVALID_ARGUMENT;
2338 		}
2339 		stackshot_config_t *config = (stackshot_config_t *) stackshot_config;
2340 		out_buffer_addr = config->sc_out_buffer_addr;
2341 		out_size_addr = config->sc_out_size_addr;
2342 		snapshot_args.pid = config->sc_pid;
2343 		snapshot_args.flags = config->sc_flags;
2344 		snapshot_args.since_timestamp = config->sc_delta_timestamp;
2345 		if (config->sc_size <= max_tracebuf_size) {
2346 			size_hint = config->sc_size;
2347 		}
2348 		/*
2349 		 * Retain the pre-sc_pagetable_mask behavior of STACKSHOT_PAGE_TABLES:
2350 		 * dump every level if the pagetable_mask is not set
2351 		 */
2352 		if (snapshot_args.flags & STACKSHOT_PAGE_TABLES && config->sc_pagetable_mask) {
2353 			snapshot_args.pagetable_mask = config->sc_pagetable_mask;
2354 		}
2355 		break;
2356 	default:
2357 		return KERN_NOT_SUPPORTED;
2358 	}
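
	/*
	 * Illustrative sketch of a caller-side configuration for this entry point
	 * (field names as in stackshot_config_t above; the values shown are an
	 * example only):
	 *
	 *   stackshot_config_t cfg = {
	 *       .sc_pid = -1,                          // not scoped to one process
	 *       .sc_flags = STACKSHOT_KCDATA_FORMAT,   // kcdata output (required unless retrieving an existing buffer)
	 *       .sc_delta_timestamp = 0,               // not a delta stackshot
	 *       .sc_size = 0,                          // let the kernel estimate the size
	 *       .sc_out_buffer_addr = (uint64_t)&buf,  // user addresses the results are copied out to
	 *       .sc_out_size_addr = (uint64_t)&size,
	 *   };
	 */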
2359 
2360 	/*
2361 	 * Currently saving a kernel buffer and trylock are only supported from the
2362 	 * internal/KEXT API.
2363 	 */
2364 	if (stackshot_from_user) {
2365 		if (snapshot_args.flags & (STACKSHOT_TRYLOCK | STACKSHOT_SAVE_IN_KERNEL_BUFFER | STACKSHOT_FROM_PANIC)) {
2366 			return KERN_NO_ACCESS;
2367 		}
2368 #if !DEVELOPMENT && !DEBUG
2369 		if (snapshot_args.flags & (STACKSHOT_DO_COMPRESS)) {
2370 			return KERN_NO_ACCESS;
2371 		}
2372 #endif
2373 	} else {
2374 		if (!(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2375 			return KERN_NOT_SUPPORTED;
2376 		}
2377 	}
2378 
2379 	if (!((snapshot_args.flags & STACKSHOT_KCDATA_FORMAT) || (snapshot_args.flags & STACKSHOT_RETRIEVE_EXISTING_BUFFER))) {
2380 		return KERN_NOT_SUPPORTED;
2381 	}
2382 
2383 	/* Compressed delta stackshots or page dumps are not yet supported */
2384 	if (((snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) || (snapshot_args.flags & STACKSHOT_PAGE_TABLES))
2385 	    && (snapshot_args.flags & STACKSHOT_DO_COMPRESS)) {
2386 		return KERN_NOT_SUPPORTED;
2387 	}
2388 
2389 	/*
2390 	 * If we're not saving the buffer in the kernel, we need a user location to copy it out to.
2391 	 */
2392 	if ((!out_buffer_addr || !out_size_addr) && !(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2393 		return KERN_INVALID_ARGUMENT;
2394 	}
2395 
2396 	if (snapshot_args.since_timestamp != 0 && ((snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) == 0)) {
2397 		return KERN_INVALID_ARGUMENT;
2398 	}
2399 
2400 	/* EXCLAVES and SKIP_EXCLAVES conflict */
2401 	if ((snapshot_args.flags & (STACKSHOT_EXCLAVES | STACKSHOT_SKIP_EXCLAVES)) == (STACKSHOT_EXCLAVES | STACKSHOT_SKIP_EXCLAVES)) {
2402 		return KERN_INVALID_ARGUMENT;
2403 	}
2404 
2405 #if CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS
2406 	if (!mt_core_supported) {
2407 		snapshot_args.flags &= ~STACKSHOT_INSTRS_CYCLES;
2408 	}
2409 #else /* CONFIG_PERVASIVE_CPI && CONFIG_CPU_COUNTERS */
2410 	snapshot_args.flags &= ~STACKSHOT_INSTRS_CYCLES;
2411 #endif /* !CONFIG_PERVASIVE_CPI || !CONFIG_CPU_COUNTERS */
2412 
2413 	STACKSHOT_TESTPOINT(TP_WAIT_START_STACKSHOT);
2414 	STACKSHOT_SUBSYS_LOCK();
2415 
2416 	stackshot_tries = 0;
2417 
2418 	if (snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER) {
2419 		/*
2420 		 * Don't overwrite an existing stackshot
2421 		 */
2422 		if (kernel_stackshot_buf != NULL) {
2423 			error = KERN_MEMORY_PRESENT;
2424 			goto error_early_exit;
2425 		}
2426 	} else if (snapshot_args.flags & STACKSHOT_RETRIEVE_EXISTING_BUFFER) {
2427 		if ((kernel_stackshot_buf == NULL) || (kernel_stackshot_buf_size <= 0)) {
2428 			error = KERN_NOT_IN_SET;
2429 			goto error_early_exit;
2430 		}
2431 		error = stackshot_remap_buffer(kernel_stackshot_buf, kernel_stackshot_buf_size,
2432 		    out_buffer_addr, out_size_addr);
2433 		/*
2434 		 * If we successfully remapped the buffer into the user's address space, we
2435 		 * set buf_to_free and size_to_free so the prior kernel mapping will be removed
2436 		 * and then clear the kernel stackshot pointer and associated size.
2437 		 */
2438 		if (error == KERN_SUCCESS) {
2439 			did_copyout = true;
2440 			buf_to_free = kernel_stackshot_buf;
2441 			size_to_free = (int) VM_MAP_ROUND_PAGE(kernel_stackshot_buf_size, PAGE_MASK);
2442 			kernel_stackshot_buf = NULL;
2443 			kernel_stackshot_buf_size = 0;
2444 		}
2445 
2446 		goto error_early_exit;
2447 	}
2448 
2449 	if (snapshot_args.flags & STACKSHOT_GET_BOOT_PROFILE) {
2450 		void *bootprofile = NULL;
2451 		uint32_t len = 0;
2452 #if CONFIG_TELEMETRY
2453 		bootprofile_get(&bootprofile, &len);
2454 #endif
2455 		if (!bootprofile || !len) {
2456 			error = KERN_NOT_IN_SET;
2457 			goto error_early_exit;
2458 		}
2459 		error = stackshot_remap_buffer(bootprofile, len, out_buffer_addr, out_size_addr);
2460 		if (error == KERN_SUCCESS) {
2461 			did_copyout = true;
2462 		}
2463 		goto error_early_exit;
2464 	}
2465 
2466 	stackshot_duration_prior_abs = 0;
2467 	stackshot_initial_estimate_adj = os_atomic_load(&stackshot_estimate_adj, relaxed);
2468 	snapshot_args.buffer_size = stackshot_estimate =
2469 	    get_stackshot_estsize(size_hint, stackshot_initial_estimate_adj, snapshot_args.flags, snapshot_args.pid);
2470 	stackshot_initial_estimate = stackshot_estimate;
2471 
2472 	// ensure at least one attempt, even if the initial size from estimate was too big
2473 	snapshot_args.buffer_size = MIN(snapshot_args.buffer_size, max_tracebuf_size);
2474 
2475 	KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD) | DBG_FUNC_START,
2476 	    snapshot_args.flags, snapshot_args.buffer_size, snapshot_args.pid, snapshot_args.since_timestamp);
2477 	is_traced = true;
2478 
2479 #if CONFIG_EXCLAVES
2480 	assert(!stackshot_exclave_inspect_ctids);
2481 #endif
2482 
2483 	for (; snapshot_args.buffer_size <= max_tracebuf_size; snapshot_args.buffer_size = MIN(snapshot_args.buffer_size << 1, max_tracebuf_size)) {
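		/* Each retry doubles the buffer size, capped at max_tracebuf_size (see the loop increment above). */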
2484 		stackshot_tries++;
2485 		if ((error = kmem_alloc(kernel_map, (vm_offset_t *)&snapshot_args.buffer, snapshot_args.buffer_size,
2486 		    KMA_ZERO | KMA_DATA, VM_KERN_MEMORY_DIAG)) != KERN_SUCCESS) {
2487 			os_log_error(OS_LOG_DEFAULT, "stackshot: initial allocation failed: %d, allocating %u bytes of %u max, try %llu\n", (int)error, snapshot_args.buffer_size, max_tracebuf_size, stackshot_tries);
2488 			error = KERN_RESOURCE_SHORTAGE;
2489 			goto error_exit;
2490 		}
2491 
2492 		uint32_t hdr_tag = (snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2493 		    : (snapshot_args.flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
2494 		    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2495 		#pragma unused(hdr_tag)
2496 
2497 		stackshot_duration_outer = NULL;
2498 
2499 		/* if compression was requested, allocate the extra zlib scratch area */
2500 		if (snapshot_args.flags & STACKSHOT_DO_COMPRESS) {
2501 			hdr_tag = (snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2502 			    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2503 			if (error != KERN_SUCCESS) {
2504 				os_log_error(OS_LOG_DEFAULT, "failed to initialize compression: %d!\n",
2505 				    (int) error);
2506 				goto error_exit;
2507 			}
2508 		}
2509 
2510 		/* Prepare the compressor for a stackshot */
2511 		error = vm_compressor_kdp_init();
2512 		if (error != KERN_SUCCESS) {
2513 			goto error_exit;
2514 		}
2515 
2516 		/*
2517 		 * Disable interrupts and save the current interrupt state.
2518 		 */
2519 		prev_interrupt_state = ml_set_interrupts_enabled(FALSE);
2520 		uint64_t time_start  = mach_absolute_time();
2521 
2522 		/* Emit a SOCD tracepoint that we are initiating a stackshot */
2523 		SOCD_TRACE_XNU_START(STACKSHOT);
2524 
2525 		/*
2526 		 * Load stackshot parameters.
2527 		 */
2528 		error = kdp_snapshot_preflight_internal(snapshot_args);
2529 
2530 		if (error == KERN_SUCCESS) {
2531 			error = stackshot_trap();
2532 		}
2533 
2534 		/* Emit a SOCD tracepoint that we have completed the stackshot */
2535 		SOCD_TRACE_XNU_END(STACKSHOT);
2536 		ml_set_interrupts_enabled(prev_interrupt_state);
2537 
2538 #if CONFIG_EXCLAVES
2539 		/* stackshot trap should only finish successfully or with no pending Exclave threads */
2540 		assert(error == KERN_SUCCESS || stackshot_exclave_inspect_ctids == NULL);
2541 #endif
2542 
2543 		/*
2544 		 * Stackshot is no longer active.
2545 		 * (We have to do this here for the special interrupt disable timeout case to work)
2546 		 */
2547 		os_atomic_store(&stackshot_ctx.sc_state, SS_INACTIVE, release);
2548 
2549 		/* Release compressor kdp buffers */
2550 		vm_compressor_kdp_teardown();
2551 
2552 		/* Record duration that interrupts were disabled */
2553 		uint64_t time_end = mach_absolute_time();
2554 		tot_interrupts_off_abs += (time_end - time_start);
2555 
2556 		/* Collect multithreaded kcdata into one finalized buffer */
2557 		if (error == KERN_SUCCESS && !stackshot_ctx.sc_is_singlethreaded) {
2558 			error = stackshot_collect_kcdata();
2559 		}
2560 
2561 #if CONFIG_EXCLAVES
2562 		if (error == KERN_SUCCESS && stackshot_exclave_inspect_ctids) {
2563 			if (stackshot_exclave_inspect_ctid_count > 0) {
2564 				STACKSHOT_TESTPOINT(TP_START_COLLECTION);
2565 			}
2566 			error = collect_exclave_threads(snapshot_args.flags);
2567 		}
2568 #endif /* CONFIG_EXCLAVES */
2569 
2570 		if (error == KERN_SUCCESS) {
2571 			if (stackshot_ctx.sc_is_singlethreaded) {
2572 				error = stackshot_finalize_singlethreaded_kcdata();
2573 			} else {
2574 				error = stackshot_finalize_kcdata();
2575 			}
2576 
2577 			if ((error != KERN_SUCCESS) && (error != KERN_INSUFFICIENT_BUFFER_SIZE)) {
2578 				goto error_exit;
2579 			}
2580 			if (error == KERN_INSUFFICIENT_BUFFER_SIZE && snapshot_args.buffer_size == max_tracebuf_size) {
2581 				os_log_error(OS_LOG_DEFAULT, "stackshot: final buffer size was insufficient at maximum size\n");
2582 				error = KERN_RESOURCE_SHORTAGE;
2583 				goto error_exit;
2584 			}
2585 		}
2586 
2587 		/* record the duration that interrupts were disabled and kcdata was being finalized */
2588 		if (stackshot_duration_outer) {
2589 			*stackshot_duration_outer = mach_absolute_time() - time_start;
2590 		}
2591 
2592 		if (error != KERN_SUCCESS) {
2593 			os_log_error(OS_LOG_DEFAULT, "stackshot: debugger call failed: %d, try %llu, buffer %u estimate %u\n", (int)error, stackshot_tries, snapshot_args.buffer_size, stackshot_estimate);
2594 			kmem_free(kernel_map, (vm_offset_t)snapshot_args.buffer, snapshot_args.buffer_size);
2595 			snapshot_args.buffer = NULL;
2596 			if (error == KERN_INSUFFICIENT_BUFFER_SIZE) {
2597 				/*
2598 				 * If we didn't allocate a big enough buffer, deallocate and try again.
2599 				 */
2600 				KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD_SHORT) | DBG_FUNC_NONE,
2601 				    time_end - time_start, stackshot_estimate, snapshot_args.buffer_size);
2602 				stackshot_duration_prior_abs += (time_end - time_start);
2603 				if (snapshot_args.buffer_size == max_tracebuf_size) {
2604 					os_log_error(OS_LOG_DEFAULT, "stackshot: initial buffer size was insufficient at maximum size\n");
2605 					error = KERN_RESOURCE_SHORTAGE;
2606 					goto error_exit;
2607 				}
2608 				continue;
2609 			} else {
2610 				goto error_exit;
2611 			}
2612 		}
2613 
2614 		bytes_traced = kdp_stack_snapshot_bytes_traced();
2615 		if (bytes_traced <= 0) {
2616 			error = KERN_ABORTED;
2617 			goto error_exit;
2618 		}
2619 
2620 		if (!(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER)) {
2621 			error = stackshot_remap_buffer(snapshot_args.buffer, bytes_traced, out_buffer_addr, out_size_addr);
2622 			if (error == KERN_SUCCESS) {
2623 				did_copyout = true;
2624 			}
2625 			goto error_exit;
2626 		}
2627 
2628 		if (!(snapshot_args.flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT)) {
2629 			os_log_info(OS_LOG_DEFAULT, "stackshot: succeeded, traced %u bytes to %u buffer (estimate %u) try %llu\n", bytes_traced, snapshot_args.buffer_size, stackshot_estimate, stackshot_tries);
2630 		}
2631 
2632 		/*
2633 		 * Save the stackshot in the kernel buffer.
2634 		 */
2635 		kernel_stackshot_buf = snapshot_args.buffer;
2636 		kernel_stackshot_buf_size =  bytes_traced;
2637 		/*
2638 		 * Figure out if we didn't use all the pages in the buffer. If so, set buf_to_free to the beginning of
2639 		 * the first page after the end of the stackshot in the buffer so that kmem_free clips the buffer, and
2640 		 * update size_to_free for kmem_free accordingly.
2641 		 */
2642 		size_to_free = snapshot_args.buffer_size - (int) VM_MAP_ROUND_PAGE(bytes_traced, PAGE_MASK);
2643 
2644 		assert(size_to_free >= 0);
2645 
2646 		if (size_to_free != 0) {
2647 			buf_to_free = (void *)((uint64_t)snapshot_args.buffer + snapshot_args.buffer_size - size_to_free);
2648 		}
2649 
2650 		snapshot_args.buffer = NULL;
2651 		snapshot_args.buffer_size = 0;
2652 		goto error_exit;
2653 	}
2654 
2655 error_exit:
2656 	if (is_traced) {
2657 		KDBG_RELEASE(MACHDBG_CODE(DBG_MACH_STACKSHOT, STACKSHOT_RECORD) | DBG_FUNC_END,
2658 		    error, tot_interrupts_off_abs, snapshot_args.buffer_size, bytes_traced);
2659 	}
2660 
2661 error_early_exit:
2662 	if (snapshot_args.buffer != NULL) {
2663 		kmem_free(kernel_map, (vm_offset_t)snapshot_args.buffer, snapshot_args.buffer_size);
2664 	}
2665 	if (buf_to_free != NULL) {
2666 		kmem_free(kernel_map, (vm_offset_t)buf_to_free, size_to_free);
2667 	}
2668 
2669 	if (error == KERN_SUCCESS && !(snapshot_args.flags & STACKSHOT_SAVE_IN_KERNEL_BUFFER) && !did_copyout) {
2670 		/* If we return success, we must have done the copyout to userspace. If
2671 		 * we somehow did not, we need to indicate failure instead.
2672 		 */
2673 #if DEVELOPMENT || DEBUG
2674 		os_log_error(OS_LOG_DEFAULT, "stackshot: reached end without doing copyout\n");
2675 #endif // DEVELOPMENT || DEBUG
2676 		error = KERN_FAILURE;
2677 	}
2678 
2679 	STACKSHOT_SUBSYS_UNLOCK();
2680 	STACKSHOT_TESTPOINT(TP_STACKSHOT_DONE);
2681 
2682 	return error;
2683 }
2684 
2685 /*
2686  * Set up state and parameters for a stackshot.
2687  * (This runs on the calling CPU before other CPUs enter the debugger trap.)
2688  * Called when interrupts are disabled, but we're not in the debugger trap yet.
2689  */
2690 __result_use_check
2691 static kern_return_t
2692 kdp_snapshot_preflight_internal(struct kdp_snapshot_args args)
2693 {
2694 	kern_return_t error = KERN_SUCCESS;
2695 	uint64_t microsecs = 0, secs = 0;
2696 	bool is_panic = ((args.flags & STACKSHOT_FROM_PANIC) != 0);
2697 	bool process_scoped = (args.pid != -1) &&
2698 	    ((args.flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0);
2699 	bool is_singlethreaded = stackshot_single_thread || (process_scoped || is_panic || ((args.flags & STACKSHOT_PAGE_TABLES) != 0));
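	/*
	 * That is, we fall back to the single-threaded path when the
	 * stackshot_single_thread override is set, when the stackshot is scoped to
	 * a single process, when it is taken from panic context, or when page
	 * tables are being collected.
	 */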
2700 	clock_get_calendar_microtime((clock_sec_t *)&secs, (clock_usec_t *)&microsecs);
2701 
2702 	cur_stackshot_ctx_idx = (is_panic ? STACKSHOT_CTX_IDX_PANIC : STACKSHOT_CTX_IDX_NORMAL);
2703 
2704 	/* Setup overall state */
2705 	stackshot_ctx = (struct stackshot_context) {
2706 		.sc_args               = args,
2707 		.sc_state              = SS_SETUP,
2708 		.sc_bytes_traced       = 0,
2709 		.sc_bytes_uncompressed = 0,
2710 		.sc_microsecs          = microsecs + (secs * USEC_PER_SEC),
2711 		.sc_panic_stackshot    = is_panic,
2712 		.sc_is_singlethreaded  = is_singlethreaded,
2713 		.sc_cpus_working       = 0,
2714 		.sc_retval             = 0,
2715 		.sc_calling_cpuid      = cpu_number(),
2716 		.sc_main_cpuid         = is_singlethreaded ? cpu_number() : -1,
2717 		.sc_min_kcdata_size    = get_stackshot_est_tasksize(args.flags),
2718 		.sc_enable_faulting    = false,
2719 	};
2720 
2721 	if (!stackshot_ctx.sc_panic_stackshot) {
2722 #if defined(__AMP__)
2723 		/* On AMP systems, we want to split the buffers up by cluster to avoid cache line effects. */
2724 		stackshot_ctx.sc_num_buffers = is_singlethreaded ? 1 : ml_get_cluster_count();
2725 #else /* __AMP__ */
2726 		stackshot_ctx.sc_num_buffers = 1;
2727 #endif /* !__AMP__ */
2728 		size_t bufsz = args.buffer_size / stackshot_ctx.sc_num_buffers;
2729 		for (int buf_idx = 0; buf_idx < stackshot_ctx.sc_num_buffers; buf_idx++) {
2730 			stackshot_ctx.sc_buffers[buf_idx] = (struct stackshot_buffer) {
2731 				.ssb_ptr = (void*) ((mach_vm_address_t) args.buffer + (bufsz * buf_idx)),
2732 				.ssb_size = bufsz,
2733 				.ssb_used = 0,
2734 				.ssb_freelist = NULL,
2735 				.ssb_freelist_lock = 0,
2736 				.ssb_overhead = 0
2737 			};
2738 		}
2739 
2740 		/* Setup per-cpu state */
2741 		percpu_foreach_base(base) {
2742 			*PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu) = (struct stackshot_cpu_context) { 0 };
2743 		}
2744 
2745 		if (is_singlethreaded) {
2746 			/* If the stackshot is singlethreaded, set up the kcdata - we don't bother with linked-list kcdata in singlethreaded mode. */
2747 			uint32_t hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2748 			    : (stackshot_flags & STACKSHOT_DO_COMPRESS) ? KCDATA_BUFFER_BEGIN_COMPRESSED
2749 			    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2750 			kcdata_memory_static_init(stackshot_kcdata_p, (mach_vm_address_t) stackshot_args.buffer, hdr_tag,
2751 			    stackshot_args.buffer_size, KCFLAG_USE_MEMCOPY | KCFLAG_NO_AUTO_ENDBUFFER);
2752 			if (stackshot_flags & STACKSHOT_DO_COMPRESS) {
2753 				hdr_tag = (stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) ? KCDATA_BUFFER_BEGIN_DELTA_STACKSHOT
2754 				    : KCDATA_BUFFER_BEGIN_STACKSHOT;
2755 				kcd_exit_on_error(kcdata_init_compress(stackshot_kcdata_p, hdr_tag, kdp_memcpy, KCDCT_ZLIB));
2756 			}
2757 			stackshot_cpu_ctx.scc_stack_buffer = kcdata_endalloc(stackshot_kcdata_p, sizeof(uintptr_t) * MAX_FRAMES);
2758 		}
2759 	} else {
2760 		/*
2761 		 * If this is a panic stackshot, we need to handle things differently.
2762 		 * The panic code hands us a kcdata descriptor to work with instead of
2763 		 * us making one ourselves.
2764 		 */
2765 		*stackshot_kcdata_p = *stackshot_args.descriptor;
2766 		stackshot_cpu_ctx = (struct stackshot_cpu_context) {
2767 			.scc_can_work = true,
2768 			.scc_stack_buffer = kcdata_endalloc(stackshot_kcdata_p, sizeof(uintptr_t) * MAX_FRAMES)
2769 		};
2770 #if STACKSHOT_COLLECTS_LATENCY_INFO
2771 		*(PERCPU_GET(stackshot_trace_buffer)) = (struct stackshot_trace_buffer) {};
2772 #endif
2773 	}
2774 
2775 	/* Set up our cpu state */
2776 	stackshot_cpu_preflight();
2777 
2778 error_exit:
2779 	return error;
2780 }
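/*
 * Illustrative note (not in the original source): on a hypothetical two-cluster
 * AMP system with a 1 MiB caller-supplied buffer, the non-panic, multithreaded
 * path above sets sc_num_buffers to ml_get_cluster_count() == 2 and carves the
 * buffer into two 512 KiB stackshot_buffer regions at offsets 0 and 512 KiB,
 * each with its own freelist and freelist lock, so CPUs in different clusters
 * allocate from separate regions and avoid bouncing cache lines.  The
 * singlethreaded path keeps a single region and additionally initializes the
 * static kcdata descriptor directly over the caller's buffer, while the panic
 * path adopts the kcdata descriptor handed in by the panic code.
 */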
2781 
2782 /*
2783  * The old function signature for kdp_snapshot_preflight, used in the panic path.
2784  * Called when interrupts are disabled, but we're not in the debugger trap yet.
2785  */
2786 void
2787 kdp_snapshot_preflight(int pid, void * tracebuf, uint32_t tracebuf_size, uint64_t flags,
2788     kcdata_descriptor_t data_p, uint64_t since_timestamp, uint32_t pagetable_mask)
2789 {
2790 	__assert_only kern_return_t err;
2791 	err = kdp_snapshot_preflight_internal((struct kdp_snapshot_args) {
2792 		.pid = pid,
2793 		.buffer = tracebuf,
2794 		.buffer_size = tracebuf_size,
2795 		.flags = flags,
2796 		.descriptor = data_p,
2797 		.since_timestamp = since_timestamp,
2798 		.pagetable_mask = pagetable_mask
2799 	});
2800 
2801 
2802 	/* This shouldn't ever return an error in the panic path. */
2803 	assert(err == KERN_SUCCESS);
2804 }
2805 
2806 static void
2807 stackshot_reset_state(void)
2808 {
2809 	stackshot_ctx = (struct stackshot_context) { 0 };
2810 }
2811 
2812 void
2813 panic_stackshot_reset_state(void)
2814 {
2815 	stackshot_reset_state();
2816 }
2817 
2818 boolean_t
2819 stackshot_active(void)
2820 {
2821 	return os_atomic_load(&stackshot_ctx.sc_state, relaxed) != SS_INACTIVE;
2822 }
2823 
2824 boolean_t
2825 panic_stackshot_active(void)
2826 {
2827 	return os_atomic_load(&stackshot_contexts[STACKSHOT_CTX_IDX_PANIC].sc_state, relaxed) != SS_INACTIVE;
2828 }
2829 
2830 uint32_t
2831 kdp_stack_snapshot_bytes_traced(void)
2832 {
2833 	return stackshot_ctx.sc_bytes_traced;
2834 }
2835 
2836 uint32_t
2837 kdp_stack_snapshot_bytes_uncompressed(void)
2838 {
2839 	return stackshot_ctx.sc_bytes_uncompressed;
2840 }
2841 
2842 static boolean_t
2843 memory_iszero(void *addr, size_t size)
2844 {
2845 	char *data = (char *)addr;
2846 	for (size_t i = 0; i < size; i++) {
2847 		if (data[i] != 0) {
2848 			return FALSE;
2849 		}
2850 	}
2851 	return TRUE;
2852 }
2853 
2854 static void
2855 _stackshot_validation_reset(void)
2856 {
2857 	percpu_foreach_base(base) {
2858 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
2859 		cpu_ctx->scc_validation_state.last_valid_page_kva = -1;
2860 		cpu_ctx->scc_validation_state.last_valid_size = 0;
2861 	}
2862 }
2863 
2864 static bool
2865 _stackshot_validate_kva(vm_offset_t addr, size_t size)
2866 {
2867 	vm_offset_t page_addr = atop_kernel(addr);
2868 	if (stackshot_cpu_ctx.scc_validation_state.last_valid_page_kva == page_addr &&
2869 	    stackshot_cpu_ctx.scc_validation_state.last_valid_size <= size) {
2870 		return true;
2871 	}
2872 
2873 	if (ml_validate_nofault(addr, size)) {
2874 		stackshot_cpu_ctx.scc_validation_state.last_valid_page_kva = page_addr;
2875 		stackshot_cpu_ctx.scc_validation_state.last_valid_size = size;
2876 		return true;
2877 	}
2878 	return false;
2879 }
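/*
 * Note: the check above acts as a one-entry, per-CPU cache of the most
 * recently validated page (scc_validation_state), so repeated reads from the
 * same kernel page during a stackshot can skip the ml_validate_nofault() call.
 * _stackshot_validation_reset() clears the cached entry on every CPU.
 */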
2880 
2881 static long
2882 _stackshot_strlen(const char *s, size_t maxlen)
2883 {
2884 	size_t len = 0;
2885 	for (len = 0; _stackshot_validate_kva((vm_offset_t)s, 1); len++, s++) {
2886 		if (*s == 0) {
2887 			return len;
2888 		}
2889 		if (len >= maxlen) {
2890 			return -1;
2891 		}
2892 	}
2893 	return -1; /* failed before end of string */
2894 }
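/*
 * Note: this is a bounded strlen over possibly-unmapped kernel memory.  Each
 * byte's address is validated via _stackshot_validate_kva() before it is read,
 * and -1 is returned if the string exceeds maxlen or runs into an
 * unvalidatable page before a NUL terminator is found.
 */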
2895 
2896 
2897 static size_t
2898 stackshot_plh_est_size(void)
2899 {
2900 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
2901 	size_t size = STASKSHOT_PLH_SIZE(stackshot_port_label_size);
2902 
2903 	if (size == 0) {
2904 		return 0;
2905 	}
2906 #define SIZE_EST(x) ROUNDUP((x), sizeof (uintptr_t))
2907 	return SIZE_EST(size * sizeof(*plh->plh_array)) +
2908 	       SIZE_EST(size * sizeof(*plh->plh_chains)) +
2909 	       SIZE_EST(size * sizeof(*stackshot_cpu_ctx.scc_plh_gen.pgs_gen) * real_ncpus) +
2910 	       SIZE_EST((1ul << STACKSHOT_PLH_SHIFT) * sizeof(*plh->plh_hash));
2911 #undef SIZE_EST
2912 }
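/*
 * Purely illustrative sizing sketch (hypothetical values, not from the source):
 * if STASKSHOT_PLH_SIZE(stackshot_port_label_size) evaluated to 512 on an
 * 8-CPU system, the estimate above would be the pointer-aligned sum of
 * 512 plh_array entries, 512 plh_chains entries, 512 * 8 per-CPU generation
 * entries, and (1 << STACKSHOT_PLH_SHIFT) hash-bucket heads; the actual byte
 * counts depend on the sizes of those element types.
 */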
2913 
2914 static void
2915 stackshot_plh_reset(void)
2916 {
2917 	stackshot_ctx.sc_plh = (struct port_label_hash){.plh_size = 0};  /* structure assignment */
2918 }
2919 
2920 static kern_return_t
2921 stackshot_plh_setup(void)
2922 {
2923 	kern_return_t error;
2924 	size_t size;
2925 	bool percpu_alloc_failed = false;
2926 	struct port_label_hash plh = {
2927 		.plh_size = STASKSHOT_PLH_SIZE(stackshot_port_label_size),
2928 		.plh_count = 0,
2929 	};
2930 
2931 	stackshot_plh_reset();
2932 
2933 	percpu_foreach_base(base) {
2934 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
2935 		cpu_ctx->scc_plh_gen = (struct _stackshot_plh_gen_state){
2936 			.pgs_gen = NULL,
2937 			.pgs_curgen = 1,
2938 			.pgs_curgen_min = STACKSHOT_PLH_SIZE_MAX,
2939 			.pgs_curgen_max = 0,
2940 		};
2941 	}
2942 
2943 	size = plh.plh_size;
2944 	if (size == 0) {
2945 		return KERN_SUCCESS;
2946 	}
2947 	plh.plh_array = stackshot_alloc_with_size(size * sizeof(*plh.plh_array), &error);
2948 	plh.plh_chains = stackshot_alloc_with_size(size * sizeof(*plh.plh_chains), &error);
2949 	percpu_foreach_base(base) {
2950 		struct stackshot_cpu_context *cpu_ctx = PERCPU_GET_WITH_BASE(base, stackshot_cpu_ctx_percpu);
2951 		cpu_ctx->scc_plh_gen.pgs_gen = stackshot_alloc_with_size(size * sizeof(*cpu_ctx->scc_plh_gen.pgs_gen), &error);
2952 		if (cpu_ctx->scc_plh_gen.pgs_gen == NULL) {
2953 			percpu_alloc_failed = true;
2954 			break;
2955 		}
2956 		for (int x = 0; x < size; x++) {
2957 			cpu_ctx->scc_plh_gen.pgs_gen[x] = 0;
2958 		}
2959 	}
2960 	plh.plh_hash = stackshot_alloc_with_size((1ul << STACKSHOT_PLH_SHIFT) * sizeof(*plh.plh_hash), &error);
2961 	if (error != KERN_SUCCESS) {
2962 		return error;
2963 	}
2964 	if (plh.plh_array == NULL || plh.plh_chains == NULL || percpu_alloc_failed || plh.plh_hash == NULL) {
2965 		PLH_STAT_OP(os_atomic_inc(&stackshot_ctx.sc_plh.plh_bad, relaxed));
2966 		return KERN_SUCCESS;
2967 	}
2968 	for (int x = 0; x < size; x++) {
2969 		plh.plh_array[x] = NULL;
2970 		plh.plh_chains[x] = -1;
2971 	}
2972 	for (int x = 0; x < (1ul << STACKSHOT_PLH_SHIFT); x++) {
2973 		plh.plh_hash[x] = -1;
2974 	}
2975 	stackshot_ctx.sc_plh = plh;  /* structure assignment */
2976 	return KERN_SUCCESS;
2977 }
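/*
 * Note on the setup above: all port-label-hash storage is carved out of the
 * stackshot buffer via stackshot_alloc_with_size(), the chain and hash-bucket
 * entries are initialized to -1 (empty), and the table is only installed into
 * stackshot_ctx.sc_plh once every allocation has succeeded.  If any allocation
 * fails, the plh_bad statistic is bumped (where statistics are compiled in)
 * and the stackshot continues with an empty hash (plh_size == 0), so later
 * lookups report STACKSHOT_PORTLABELID_MISSING instead of failing the
 * stackshot.
 */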
2978 
2979 static int16_t
2980 stackshot_plh_hash(struct ipc_service_port_label *ispl)
2981 {
2982 	uintptr_t ptr = (uintptr_t)ispl;
2983 	static_assert(STACKSHOT_PLH_SHIFT < 16, "plh_hash must fit in 15 bits");
2984 #define PLH_HASH_STEP(ptr, x) \
2985 	    ((((x) * STACKSHOT_PLH_SHIFT) < (sizeof(ispl) * CHAR_BIT)) ? ((ptr) >> ((x) * STACKSHOT_PLH_SHIFT)) : 0)
2986 	ptr ^= PLH_HASH_STEP(ptr, 16);
2987 	ptr ^= PLH_HASH_STEP(ptr, 8);
2988 	ptr ^= PLH_HASH_STEP(ptr, 4);
2989 	ptr ^= PLH_HASH_STEP(ptr, 2);
2990 	ptr ^= PLH_HASH_STEP(ptr, 1);
2991 #undef PLH_HASH_STEP
2992 	return (int16_t)(ptr & ((1ul << STACKSHOT_PLH_SHIFT) - 1));
2993 }
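/*
 * Note: the hash above folds the port-label pointer onto itself by XORing in
 * copies shifted right by multiples of STACKSHOT_PLH_SHIFT bits (skipping
 * shifts wider than the pointer), then keeps the low STACKSHOT_PLH_SHIFT bits
 * as the bucket index, so all bits of the pointer contribute to the bucket.
 */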
2994 
2995 enum stackshot_plh_lookup_type {
2996 	STACKSHOT_PLH_LOOKUP_UNKNOWN,
2997 	STACKSHOT_PLH_LOOKUP_SEND,
2998 	STACKSHOT_PLH_LOOKUP_RECEIVE,
2999 };
3000 
3001 static void
3002 stackshot_plh_resetgen(void)
3003 {
3004 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3005 	uint16_t plh_size = stackshot_ctx.sc_plh.plh_size;
3006 
3007 	if (pgs->pgs_curgen_min == STACKSHOT_PLH_SIZE_MAX && pgs->pgs_curgen_max == 0) {
3008 		return;  // no lookups, nothing using the current generation
3009 	}
3010 	pgs->pgs_curgen++;
3011 	pgs->pgs_curgen_min = STACKSHOT_PLH_SIZE_MAX;
3012 	pgs->pgs_curgen_max = 0;
3013 	if (pgs->pgs_curgen == 0) { // wrapped, zero the array and increment the generation
3014 		for (int x = 0; x < plh_size; x++) {
3015 			pgs->pgs_gen[x] = 0;
3016 		}
3017 		pgs->pgs_curgen = 1;
3018 	}
3019 }
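/*
 * Note: pgs_gen[] records, per table slot, the generation in which that port
 * label was last referenced by this CPU; pgs_curgen_min/max bound the slots
 * touched in the current generation.  Bumping pgs_curgen here cheaply
 * "forgets" prior references, and when the counter wraps to zero the array is
 * cleared and the generation restarts at 1 so stale entries can never match.
 */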
3020 
3021 static int16_t
3022 stackshot_plh_lookup_locked(struct ipc_service_port_label *ispl, enum stackshot_plh_lookup_type type)
3023 {
3024 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3025 	int depth;
3026 	int16_t cur;
3027 	if (ispl == NULL) {
3028 		return STACKSHOT_PORTLABELID_NONE;
3029 	}
3030 	switch (type) {
3031 	case STACKSHOT_PLH_LOOKUP_SEND:
3032 		PLH_STAT_OP(os_atomic_inc(&plh->plh_lookup_send, relaxed));
3033 		break;
3034 	case STACKSHOT_PLH_LOOKUP_RECEIVE:
3035 		PLH_STAT_OP(os_atomic_inc(&plh->plh_lookup_receive, relaxed));
3036 		break;
3037 	default:
3038 		break;
3039 	}
3040 	PLH_STAT_OP(os_atomic_inc(&plh->plh_lookups, relaxed));
3041 	if (plh->plh_size == 0) {
3042 		return STACKSHOT_PORTLABELID_MISSING;
3043 	}
3044 	int16_t hash = stackshot_plh_hash(ispl);
3045 	assert(hash >= 0 && hash < (1ul << STACKSHOT_PLH_SHIFT));
3046 	depth = 0;
3047 	for (cur = plh->plh_hash[hash]; cur >= 0; cur = plh->plh_chains[cur]) {
3048 		/* cur must be in-range, and chain depth can never be above our # allocated */
3049 		if (cur >= plh->plh_count || depth > plh->plh_count || depth > plh->plh_size) {
3050 			PLH_STAT_OP(os_atomic_inc(&plh->plh_bad, relaxed));
3051 			PLH_STAT_OP(os_atomic_add(&plh->plh_bad_depth, depth, relaxed));
3052 			return STACKSHOT_PORTLABELID_MISSING;
3053 		}
3054 		assert(cur < plh->plh_count);
3055 		if (plh->plh_array[cur] == ispl) {
3056 			PLH_STAT_OP(os_atomic_inc(&plh->plh_found, relaxed));
3057 			PLH_STAT_OP(os_atomic_add(&plh->plh_found_depth, depth, relaxed));
3058 			goto found;
3059 		}
3060 		depth++;
3061 	}
3062 	/* not found in hash table, so alloc and insert it */
3063 	if (cur != -1) {
3064 		PLH_STAT_OP(os_atomic_inc(&plh->plh_bad, relaxed));
3065 		PLH_STAT_OP(os_atomic_add(&plh->plh_bad_depth, depth, relaxed));
3066 		return STACKSHOT_PORTLABELID_MISSING; /* bad end of chain */
3067 	}
3068 	PLH_STAT_OP(os_atomic_inc(&plh->plh_insert, relaxed));
3069 	PLH_STAT_OP(os_atomic_add(&plh->plh_insert_depth, depth, relaxed));
3070 	if (plh->plh_count >= plh->plh_size) {
3071 		return STACKSHOT_PORTLABELID_MISSING; /* no space */
3072 	}
3073 	cur = plh->plh_count;
3074 	plh->plh_count++;
3075 	plh->plh_array[cur] = ispl;
3076 	plh->plh_chains[cur] = plh->plh_hash[hash];
3077 	plh->plh_hash[hash] = cur;
3078 found:  ;
3079 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3080 	pgs->pgs_gen[cur] = pgs->pgs_curgen;
3081 	if (pgs->pgs_curgen_min > cur) {
3082 		pgs->pgs_curgen_min = cur;
3083 	}
3084 	if (pgs->pgs_curgen_max < cur) {
3085 		pgs->pgs_curgen_max = cur;
3086 	}
3087 	return cur + 1;   /* offset to avoid 0 */
3088 }
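/*
 * Note: the table above is a simple open hash with separate chaining stored in
 * parallel arrays: plh_hash[bucket] holds the index of the chain head,
 * plh_chains[i] links to the next entry (-1 terminates), and plh_array[i] is
 * the recorded ipc_service_port_label pointer.  A miss inserts the label at
 * the front of the bucket's chain if space remains; either way the entry's
 * generation is stamped and the returned port label id is the array index plus
 * one, so 0 is never a valid id.
 */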
3089 
3090 static kern_return_t
3091 kdp_stackshot_plh_record_locked(void)
3092 {
3093 	kern_return_t error = KERN_SUCCESS;
3094 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3095 	struct _stackshot_plh_gen_state *pgs = &stackshot_cpu_ctx.scc_plh_gen;
3096 	uint16_t count = plh->plh_count;
3097 	uint8_t curgen = pgs->pgs_curgen;
3098 	int16_t curgen_min = pgs->pgs_curgen_min;
3099 	int16_t curgen_max = pgs->pgs_curgen_max;
3100 	if (curgen_min <= curgen_max && curgen_max < count &&
3101 	    count <= plh->plh_size && plh->plh_size <= STACKSHOT_PLH_SIZE_MAX) {
3102 		struct ipc_service_port_label **arr = plh->plh_array;
3103 		size_t ispl_size, max_namelen;
3104 		kdp_ipc_splabel_size(&ispl_size, &max_namelen);
3105 		for (int idx = curgen_min; idx <= curgen_max; idx++) {
3106 			struct ipc_service_port_label *ispl = arr[idx];
3107 			struct portlabel_info spl = {
3108 				.portlabel_id = (idx + 1),
3109 			};
3110 			const char *name = NULL;
3111 			long name_sz = 0;
3112 			if (pgs->pgs_gen[idx] != curgen) {
3113 				continue;
3114 			}
3115 			if (_stackshot_validate_kva((vm_offset_t)ispl, ispl_size)) {
3116 				kdp_ipc_fill_splabel(ispl, &spl, &name);
3117 #if STACKSHOT_COLLECTS_RDAR_126582377_DATA
3118 			} else {
3119 				if (ispl != NULL && (vm_offset_t)ispl >> 48 == 0x0000) {
3120 					ca_event_t event_to_send = os_atomic_xchg(&rdar_126582377_event, NULL, relaxed);
3121 					if (event_to_send) {
3122 						CA_EVENT_SEND(event_to_send);
3123 					}
3124 				}
3125 #endif
3126 			}
3127 
3128 			kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
3129 			    STACKSHOT_KCCONTAINER_PORTLABEL, idx + 1));
3130 			if (name != NULL && (name_sz = _stackshot_strlen(name, max_namelen)) > 0) {   /* validates the kva */
3131 				kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_PORTLABEL_NAME, name_sz + 1, name));
3132 			} else {
3133 				spl.portlabel_flags |= STACKSHOT_PORTLABEL_READFAILED;
3134 			}
3135 			kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_PORTLABEL, sizeof(spl), &spl));
3136 			kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
3137 			    STACKSHOT_KCCONTAINER_PORTLABEL, idx + 1));
3138 		}
3139 	}
3140 
3141 error_exit:
3142 	return error;
3143 }
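/*
 * Note on the output format above: each port label referenced in the current
 * generation is emitted as a STACKSHOT_KCCONTAINER_PORTLABEL container keyed
 * by (index + 1), holding an optional STACKSHOT_KCTYPE_PORTLABEL_NAME string
 * and a STACKSHOT_KCTYPE_PORTLABEL record; labels whose memory or name cannot
 * be validated are marked with STACKSHOT_PORTLABEL_READFAILED instead.
 */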
3144 
3145 // record any PLH referenced since the last stackshot_plh_resetgen() call
3146 static kern_return_t
3147 kdp_stackshot_plh_record(void)
3148 {
3149 	kern_return_t error;
3150 	plh_lock(&stackshot_ctx.sc_plh);
3151 	error = kdp_stackshot_plh_record_locked();
3152 	plh_unlock(&stackshot_ctx.sc_plh);
3153 	return error;
3154 }
3155 
3156 static int16_t
3157 stackshot_plh_lookup(struct ipc_service_port_label *ispl, enum stackshot_plh_lookup_type type)
3158 {
3159 	int16_t result;
3160 	plh_lock(&stackshot_ctx.sc_plh);
3161 	result = stackshot_plh_lookup_locked(ispl, type);
3162 	plh_unlock(&stackshot_ctx.sc_plh);
3163 	return result;
3164 }
3165 
3166 #if DEVELOPMENT || DEBUG
3167 static kern_return_t
3168 kdp_stackshot_plh_stats(void)
3169 {
3170 	kern_return_t error = KERN_SUCCESS;
3171 	struct port_label_hash *plh = &stackshot_ctx.sc_plh;
3172 
3173 #define PLH_STAT(x) do { if (os_atomic_load(&plh->x, relaxed) != 0) { \
3174 	kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, os_atomic_load(&plh->x, relaxed), "stackshot_" #x)); \
3175 } } while (0)
3176 	PLH_STAT(plh_size);
3177 	PLH_STAT(plh_lookups);
3178 	PLH_STAT(plh_found);
3179 	PLH_STAT(plh_found_depth);
3180 	PLH_STAT(plh_insert);
3181 	PLH_STAT(plh_insert_depth);
3182 	PLH_STAT(plh_bad);
3183 	PLH_STAT(plh_bad_depth);
3184 	PLH_STAT(plh_lookup_send);
3185 	PLH_STAT(plh_lookup_receive);
3186 #undef PLH_STAT
3187 
3188 error_exit:
3189 	return error;
3190 }
3191 #endif /* DEVELOPMENT || DEBUG */
3192 
3193 static uint64_t
3194 kcdata_get_task_ss_flags(task_t task)
3195 {
3196 	uint64_t ss_flags = 0;
3197 	boolean_t task_64bit_addr = task_has_64Bit_addr(task);
3198 	void *bsd_info = get_bsdtask_info(task);
3199 
3200 	if (task_64bit_addr) {
3201 		ss_flags |= kUser64_p;
3202 	}
3203 	if (!task->active || task_is_a_corpse(task) || proc_exiting(bsd_info)) {
3204 		ss_flags |= kTerminatedSnapshot;
3205 	}
3206 	if (task->pidsuspended) {
3207 		ss_flags |= kPidSuspended;
3208 	}
3209 	if (task->frozen) {
3210 		ss_flags |= kFrozen;
3211 	}
3212 	if (task->effective_policy.tep_darwinbg == 1) {
3213 		ss_flags |= kTaskDarwinBG;
3214 	}
3215 	if (task->requested_policy.trp_role == TASK_FOREGROUND_APPLICATION) {
3216 		ss_flags |= kTaskIsForeground;
3217 	}
3218 	if (task->requested_policy.trp_boosted == 1) {
3219 		ss_flags |= kTaskIsBoosted;
3220 	}
3221 	if (task->effective_policy.tep_sup_active == 1) {
3222 		ss_flags |= kTaskIsSuppressed;
3223 	}
3224 #if CONFIG_MEMORYSTATUS
3225 
3226 	boolean_t dirty = FALSE, dirty_tracked = FALSE, allow_idle_exit = FALSE;
3227 	memorystatus_proc_flags_unsafe(bsd_info, &dirty, &dirty_tracked, &allow_idle_exit);
3228 	if (dirty) {
3229 		ss_flags |= kTaskIsDirty;
3230 	}
3231 	if (dirty_tracked) {
3232 		ss_flags |= kTaskIsDirtyTracked;
3233 	}
3234 	if (allow_idle_exit) {
3235 		ss_flags |= kTaskAllowIdleExit;
3236 	}
3237 
3238 #endif
3239 	if (task->effective_policy.tep_tal_engaged) {
3240 		ss_flags |= kTaskTALEngaged;
3241 	}
3242 
3243 	ss_flags |= workqueue_get_task_ss_flags_from_pwq_state_kdp(bsd_info);
3244 
3245 #if IMPORTANCE_INHERITANCE
3246 	if (task->task_imp_base) {
3247 		if (task->task_imp_base->iit_donor) {
3248 			ss_flags |= kTaskIsImpDonor;
3249 		}
3250 		if (task->task_imp_base->iit_live_donor) {
3251 			ss_flags |= kTaskIsLiveImpDonor;
3252 		}
3253 	}
3254 #endif
3255 	return ss_flags;
3256 }
3257 
3258 static kern_return_t
3259 kcdata_record_shared_cache_info(kcdata_descriptor_t kcd, task_t task, unaligned_u64 *task_snap_ss_flags)
3260 {
3261 	kern_return_t error = KERN_SUCCESS;
3262 
3263 	uint64_t shared_cache_slide = 0;
3264 	uint64_t shared_cache_first_mapping = 0;
3265 	uint32_t kdp_fault_results = 0;
3266 	uint32_t shared_cache_id = 0;
3267 	struct dyld_shared_cache_loadinfo shared_cache_data = {0};
3268 
3269 
3270 	assert(task_snap_ss_flags != NULL);
3271 
3272 	/* Get basic info about the shared region pointer, regardless of any failures */
3273 	if (task->shared_region == NULL) {
3274 		*task_snap_ss_flags |= kTaskSharedRegionNone;
3275 	} else if (task->shared_region == primary_system_shared_region) {
3276 		*task_snap_ss_flags |= kTaskSharedRegionSystem;
3277 	} else {
3278 		*task_snap_ss_flags |= kTaskSharedRegionOther;
3279 	}
3280 
3281 	if (task->shared_region && _stackshot_validate_kva((vm_offset_t)task->shared_region, sizeof(struct vm_shared_region))) {
3282 		struct vm_shared_region *sr = task->shared_region;
3283 		shared_cache_first_mapping = sr->sr_base_address + sr->sr_first_mapping;
3284 
3285 		shared_cache_id = sr->sr_id;
3286 	} else {
3287 		*task_snap_ss_flags |= kTaskSharedRegionInfoUnavailable;
3288 		goto error_exit;
3289 	}
3290 
3291 	/* We haven't copied in the shared region UUID yet as part of setup */
3292 	if (!shared_cache_first_mapping || !task->shared_region->sr_uuid_copied) {
3293 		goto error_exit;
3294 	}
3295 
3296 
3297 	/*
3298 	 * No refcounting here, but we are in debugger context, so that should be safe.
3299 	 */
3300 	shared_cache_slide = task->shared_region->sr_slide;
3301 
3302 	if (task->shared_region == primary_system_shared_region) {
3303 		/* skip adding shared cache info -- it's the same as the system level one */
3304 		goto error_exit;
3305 	}
3306 	/*
3307 	 * New-style shared cache reference: for non-primary shared regions,
3308 	 * just include the ID of the shared cache we're attached to.  Consumers
3309 	 * should use the following info from the task's ts_ss_flags as well:
3310 	 *
3311 	 * kTaskSharedRegionNone - task is not attached to a shared region
3312 	 * kTaskSharedRegionSystem - task is attached to the shared region
3313 	 *     with kSharedCacheSystemPrimary set in sharedCacheFlags.
3314 	 * kTaskSharedRegionOther - task is attached to the shared region with
3315 	 *     sharedCacheID matching the STACKSHOT_KCTYPE_SHAREDCACHE_ID entry.
3316 	 */
3317 	kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_ID, sizeof(shared_cache_id), &shared_cache_id));
3318 
3319 	/*
3320 	 * For backwards compatibility; this should eventually be removed.
3321 	 *
3322 	 * Historically, this data was in a dyld_uuid_info_64 structure, but the
3323 	 * naming of both the structure and fields for this use wasn't great.  The
3324 	 * dyld_shared_cache_loadinfo structure has better names, but the same
3325 	 * layout and content as the original.
3326 	 *
3327 	 * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
3328 	 * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
3329 	 * entries; here, it's the slid first mapping, and we leave it that way
3330 	 * for backwards compatibility.
3331 	 */
3332 	shared_cache_data.sharedCacheSlide = shared_cache_slide;
3333 	kdp_memcpy(&shared_cache_data.sharedCacheUUID, task->shared_region->sr_uuid, sizeof(task->shared_region->sr_uuid));
3334 	shared_cache_data.sharedCacheUnreliableSlidBaseAddress = shared_cache_first_mapping;
3335 	shared_cache_data.sharedCacheSlidFirstMapping = shared_cache_first_mapping;
3336 	kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO, sizeof(shared_cache_data), &shared_cache_data));
3337 
3338 error_exit:
3339 	if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
3340 		*task_snap_ss_flags |= kTaskUUIDInfoMissing;
3341 	}
3342 
3343 	if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) {
3344 		*task_snap_ss_flags |= kTaskUUIDInfoTriedFault;
3345 	}
3346 
3347 	if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) {
3348 		*task_snap_ss_flags |= kTaskUUIDInfoFaultedIn;
3349 	}
3350 
3351 	return error;
3352 }
3353 
3354 static kern_return_t
3355 kcdata_record_uuid_info(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 *task_snap_ss_flags)
3356 {
3357 	bool save_loadinfo_p         = ((trace_flags & STACKSHOT_SAVE_LOADINFO) != 0);
3358 	bool save_kextloadinfo_p     = ((trace_flags & STACKSHOT_SAVE_KEXT_LOADINFO) != 0);
3359 	bool save_compactinfo_p      = ((trace_flags & STACKSHOT_SAVE_DYLD_COMPACTINFO) != 0);
3360 	bool should_fault            = (trace_flags & STACKSHOT_ENABLE_UUID_FAULTING);
3361 
3362 	kern_return_t error        = KERN_SUCCESS;
3363 	mach_vm_address_t out_addr = 0;
3364 
3365 	mach_vm_address_t dyld_compactinfo_addr = 0;
3366 	uint32_t dyld_compactinfo_size = 0;
3367 
3368 	uint32_t uuid_info_count         = 0;
3369 	mach_vm_address_t uuid_info_addr = 0;
3370 	uint64_t uuid_info_timestamp     = 0;
3371 	#pragma unused(uuid_info_timestamp)
3372 	kdp_fault_result_flags_t kdp_fault_results = 0;
3373 
3374 
3375 	assert(task_snap_ss_flags != NULL);
3376 
3377 	int task_pid     = pid_from_task(task);
3378 	boolean_t task_64bit_addr = task_has_64Bit_addr(task);
3379 
3380 	if ((save_loadinfo_p || save_compactinfo_p) && have_pmap && task->active && task_pid > 0) {
3381 		/* Read the dyld_all_image_infos struct from the task memory to get UUID array count and location */
3382 		if (task_64bit_addr) {
3383 			struct user64_dyld_all_image_infos task_image_infos;
3384 			if (stackshot_copyin(task->map, task->all_image_info_addr, &task_image_infos,
3385 			    sizeof(struct user64_dyld_all_image_infos), should_fault, &kdp_fault_results)) {
3386 				uuid_info_count = (uint32_t)task_image_infos.uuidArrayCount;
3387 				uuid_info_addr = task_image_infos.uuidArray;
3388 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION) {
3389 					uuid_info_timestamp = task_image_infos.timestamp;
3390 				}
3391 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_COMPACTINFO_MINIMUM_VERSION) {
3392 					dyld_compactinfo_addr = task_image_infos.compact_dyld_image_info_addr;
3393 					dyld_compactinfo_size = task_image_infos.compact_dyld_image_info_size;
3394 				}
3395 
3396 			}
3397 		} else {
3398 			struct user32_dyld_all_image_infos task_image_infos;
3399 			if (stackshot_copyin(task->map, task->all_image_info_addr, &task_image_infos,
3400 			    sizeof(struct user32_dyld_all_image_infos), should_fault, &kdp_fault_results)) {
3401 				uuid_info_count = task_image_infos.uuidArrayCount;
3402 				uuid_info_addr = task_image_infos.uuidArray;
3403 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_TIMESTAMP_MINIMUM_VERSION) {
3404 					uuid_info_timestamp = task_image_infos.timestamp;
3405 				}
3406 				if (task_image_infos.version >= DYLD_ALL_IMAGE_INFOS_COMPACTINFO_MINIMUM_VERSION) {
3407 					dyld_compactinfo_addr = task_image_infos.compact_dyld_image_info_addr;
3408 					dyld_compactinfo_size = task_image_infos.compact_dyld_image_info_size;
3409 				}
3410 			}
3411 		}
3412 
3413 		/*
3414 		 * If we get a NULL uuid_info_addr (which can happen when we catch dyld in the middle of updating
3415 		 * this data structure), we zero the uuid_info_count so that we won't even try to save load info
3416 		 * for this task.
3417 		 */
3418 		if (!uuid_info_addr) {
3419 			uuid_info_count = 0;
3420 		}
3421 
3422 		if (!dyld_compactinfo_addr) {
3423 			dyld_compactinfo_size = 0;
3424 		}
3425 
3426 	}
3427 
3428 	if (have_pmap && task_pid == 0) {
3429 		if (save_kextloadinfo_p && _stackshot_validate_kva((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader))) {
3430 			uuid_info_count = gLoadedKextSummaries->numSummaries + 1; /* include main kernel UUID */
3431 		} else {
3432 			uuid_info_count = 1; /* include kernelcache UUID (embedded) or kernel UUID (desktop) */
3433 		}
3434 	}
3435 
3436 	if (save_compactinfo_p && task_pid > 0) {
3437 		if (dyld_compactinfo_size == 0) {
3438 			*task_snap_ss_flags |= kTaskDyldCompactInfoNone;
3439 		} else if (dyld_compactinfo_size > MAX_DYLD_COMPACTINFO) {
3440 			*task_snap_ss_flags |= kTaskDyldCompactInfoTooBig;
3441 		} else {
3442 			kdp_fault_result_flags_t ci_kdp_fault_results = 0;
3443 
3444 			/* Open a compression window to avoid overflowing the stack */
3445 			kcdata_compression_window_open(kcd);
3446 			kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_DYLD_COMPACTINFO,
3447 			    dyld_compactinfo_size, &out_addr));
3448 
3449 			if (!stackshot_copyin(task->map, dyld_compactinfo_addr, (void *)out_addr,
3450 			    dyld_compactinfo_size, should_fault, &ci_kdp_fault_results)) {
3451 				bzero((void *)out_addr, dyld_compactinfo_size);
3452 			}
3453 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
3454 				*task_snap_ss_flags |= kTaskDyldCompactInfoMissing;
3455 			}
3456 
3457 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) {
3458 				*task_snap_ss_flags |= kTaskDyldCompactInfoTriedFault;
3459 			}
3460 
3461 			if (ci_kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) {
3462 				*task_snap_ss_flags |= kTaskDyldCompactInfoFaultedIn;
3463 			}
3464 
3465 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
3466 		}
3467 	}
3468 	if (save_loadinfo_p && task_pid > 0 && (uuid_info_count < MAX_LOADINFOS)) {
3469 		uint32_t copied_uuid_count = 0;
3470 		uint32_t uuid_info_size = (uint32_t)(task_64bit_addr ? sizeof(struct user64_dyld_uuid_info) : sizeof(struct user32_dyld_uuid_info));
3471 		uint32_t uuid_info_array_size = 0;
3472 
3473 		/* Open a compression window to avoid overflowing the stack */
3474 		kcdata_compression_window_open(kcd);
3475 
3476 		/* If we found some UUID information, first try to copy it in -- this will only be non-zero if we had a pmap above */
3477 		if (uuid_info_count > 0) {
3478 			uuid_info_array_size = uuid_info_count * uuid_info_size;
3479 
3480 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
3481 			    uuid_info_size, uuid_info_count, &out_addr));
3482 
3483 			if (!stackshot_copyin(task->map, uuid_info_addr, (void *)out_addr, uuid_info_array_size, should_fault, &kdp_fault_results)) {
3484 				bzero((void *)out_addr, uuid_info_array_size);
3485 			} else {
3486 				copied_uuid_count = uuid_info_count;
3487 			}
3488 		}
3489 
3490 		uuid_t binary_uuid;
3491 		if (!copied_uuid_count && proc_binary_uuid_kdp(task, binary_uuid)) {
3492 			/* We failed to copyin the UUID information, try to store the UUID of the main binary we have in the proc */
3493 			if (uuid_info_array_size == 0) {
3494 				/* We just need to store one UUID */
3495 				uuid_info_array_size = uuid_info_size;
3496 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, (task_64bit_addr ? KCDATA_TYPE_LIBRARY_LOADINFO64 : KCDATA_TYPE_LIBRARY_LOADINFO),
3497 				    uuid_info_size, 1, &out_addr));
3498 			}
3499 
3500 			if (task_64bit_addr) {
3501 				struct user64_dyld_uuid_info *uuid_info = (struct user64_dyld_uuid_info *)out_addr;
3502 				uint64_t image_load_address = task->mach_header_vm_address;
3503 
3504 				kdp_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
3505 				kdp_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
3506 			} else {
3507 				struct user32_dyld_uuid_info *uuid_info = (struct user32_dyld_uuid_info *)out_addr;
3508 				uint32_t image_load_address = (uint32_t) task->mach_header_vm_address;
3509 
3510 				kdp_memcpy(&uuid_info->imageUUID, binary_uuid, sizeof(uuid_t));
3511 				kdp_memcpy(&uuid_info->imageLoadAddress, &image_load_address, sizeof(image_load_address));
3512 			}
3513 		}
3514 
3515 		kcd_exit_on_error(kcdata_compression_window_close(kcd));
3516 	} else if (task_pid == 0 && uuid_info_count > 0 && uuid_info_count < MAX_LOADINFOS) {
3517 		uintptr_t image_load_address;
3518 
3519 		do {
3520 #if defined(__arm64__)
3521 			if (kernelcache_uuid_valid && !save_kextloadinfo_p) {
3522 				struct dyld_uuid_info_64 kc_uuid = {0};
3523 				kc_uuid.imageLoadAddress = VM_MIN_KERNEL_AND_KEXT_ADDRESS;
3524 				kdp_memcpy(&kc_uuid.imageUUID, &kernelcache_uuid, sizeof(uuid_t));
3525 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_KERNELCACHE_LOADINFO, sizeof(struct dyld_uuid_info_64), &kc_uuid));
3526 				break;
3527 			}
3528 #endif /* defined(__arm64__) */
3529 
3530 			if (!kernel_uuid || !_stackshot_validate_kva((vm_offset_t)kernel_uuid, sizeof(uuid_t))) {
3531 				/* Kernel UUID not found or inaccessible */
3532 				break;
3533 			}
3534 
3535 			uint32_t uuid_type = KCDATA_TYPE_LIBRARY_LOADINFO;
3536 			if ((sizeof(kernel_uuid_info) == sizeof(struct user64_dyld_uuid_info))) {
3537 				uuid_type = KCDATA_TYPE_LIBRARY_LOADINFO64;
3538 #if  defined(__arm64__)
3539 				kc_format_t primary_kc_type = KCFormatUnknown;
3540 				if (PE_get_primary_kc_format(&primary_kc_type) && (primary_kc_type == KCFormatFileset)) {
3541 					/* return TEXT_EXEC based load information on arm devices running with fileset kernelcaches */
3542 					uuid_type = STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC;
3543 				}
3544 #endif
3545 			}
3546 
3547 			/*
3548 			 * The element count of the array can vary - avoid overflowing the
3549 			 * stack by opening a window.
3550 			 */
3551 			kcdata_compression_window_open(kcd);
3552 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, uuid_type,
3553 			    sizeof(kernel_uuid_info), uuid_info_count, &out_addr));
3554 			kernel_uuid_info *uuid_info_array = (kernel_uuid_info *)out_addr;
3555 
3556 			image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(vm_kernel_stext);
3557 #if defined(__arm64__)
3558 			if (uuid_type == STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC) {
3559 				/* If we're reporting TEXT_EXEC load info, populate the TEXT_EXEC base instead */
3560 				extern vm_offset_t segTEXTEXECB;
3561 				image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(segTEXTEXECB);
3562 			}
3563 #endif
3564 			uuid_info_array[0].imageLoadAddress = image_load_address;
3565 			kdp_memcpy(&uuid_info_array[0].imageUUID, kernel_uuid, sizeof(uuid_t));
3566 
3567 			if (save_kextloadinfo_p &&
3568 			    _stackshot_validate_kva((vm_offset_t)(gLoadedKextSummaries), sizeof(OSKextLoadedKextSummaryHeader)) &&
3569 			    _stackshot_validate_kva((vm_offset_t)(&gLoadedKextSummaries->summaries[0]),
3570 			    gLoadedKextSummaries->entry_size * gLoadedKextSummaries->numSummaries)) {
3571 				uint32_t kexti;
3572 				for (kexti = 0; kexti < gLoadedKextSummaries->numSummaries; kexti++) {
3573 					image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].address);
3574 #if defined(__arm64__)
3575 					if (uuid_type == STACKSHOT_KCTYPE_LOADINFO64_TEXT_EXEC) {
3576 						/* If we're reporting TEXT_EXEC load info, populate the TEXT_EXEC base instead */
3577 						image_load_address = (uintptr_t)VM_KERNEL_UNSLIDE(gLoadedKextSummaries->summaries[kexti].text_exec_address);
3578 					}
3579 #endif
3580 					uuid_info_array[kexti + 1].imageLoadAddress = image_load_address;
3581 					kdp_memcpy(&uuid_info_array[kexti + 1].imageUUID, &gLoadedKextSummaries->summaries[kexti].uuid, sizeof(uuid_t));
3582 				}
3583 			}
3584 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
3585 		} while (0);
3586 	}
3587 
3588 error_exit:
3589 	if (kdp_fault_results & KDP_FAULT_RESULT_PAGED_OUT) {
3590 		*task_snap_ss_flags |= kTaskUUIDInfoMissing;
3591 	}
3592 
3593 	if (kdp_fault_results & KDP_FAULT_RESULT_TRIED_FAULT) {
3594 		*task_snap_ss_flags |= kTaskUUIDInfoTriedFault;
3595 	}
3596 
3597 	if (kdp_fault_results & KDP_FAULT_RESULT_FAULTED_IN) {
3598 		*task_snap_ss_flags |= kTaskUUIDInfoFaultedIn;
3599 	}
3600 
3601 	return error;
3602 }
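/*
 * Summary of the cases handled above: for user tasks, the dyld_all_image_infos
 * structure is copied in (32- or 64-bit layout) to find the image UUID array
 * and, when present, the compact dyld image info; if the UUID array cannot be
 * copied, the main binary's UUID from the proc is recorded as a single-entry
 * load-info array.  For the kernel task (pid 0), either the kernelcache UUID
 * or the kernel UUID plus the loaded-kext summaries are recorded, using
 * TEXT_EXEC based addresses on fileset kernelcaches.  Faulting behaviour is
 * reflected in the task snapshot ss_flags at error_exit.
 */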
3603 
3604 uint64_t kdp_task_exec_meta_flags(task_t task);
3605 
3606 uint64_t
3607 kdp_task_exec_meta_flags(task_t task)
3608 {
3609 	uint64_t flags = 0;
3610 
3611 #if CONFIG_ROSETTA
3612 	if (task_is_translated(task)) {
3613 		flags |= kTaskExecTranslated;
3614 	}
3615 #endif /* CONFIG_ROSETTA */
3616 
3617 	if (task_has_hardened_heap(task)) {
3618 		flags |= kTaskExecHardenedHeap;
3619 	}
3620 
3621 
3622 	return flags;
3623 }
3624 
3625 /* Compute the set of flags that kdp_task_exec_meta_flags can return based on the kernel config */
3626 static uint64_t
3627 stackshot_available_task_exec_flags(void)
3628 {
3629 	uint64_t flags_mask = 0;
3630 
3631 #if CONFIG_ROSETTA
3632 	flags_mask |= kTaskExecTranslated;
3633 #endif /* CONFIG_ROSETTA */
3634 
3635 	flags_mask |= kTaskExecHardenedHeap;
3636 
3637 
3638 	return flags_mask;
3639 }
3640 
3641 static kern_return_t
3642 kcdata_record_task_exec_meta(kcdata_descriptor_t kcd, task_t task)
3643 {
3644 	struct task_exec_meta tem = {};
3645 	kern_return_t error = KERN_SUCCESS;
3646 
3647 	tem.tem_flags = kdp_task_exec_meta_flags(task);
3648 
3649 	if (tem.tem_flags != 0) {
3650 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_TASK_EXEC_META, sizeof(struct task_exec_meta), &tem));
3651 	}
3652 
3653 error_exit:
3654 	return error;
3655 }
3656 
3657 static kern_return_t
3658 kcdata_record_task_iostats(kcdata_descriptor_t kcd, task_t task)
3659 {
3660 	kern_return_t error = KERN_SUCCESS;
3661 	mach_vm_address_t out_addr = 0;
3662 
3663 	/* I/O statistics, recorded only if any counters are non-zero */
3664 	assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES);
3665 	if (task->task_io_stats && !memory_iszero(task->task_io_stats, sizeof(struct io_stat_info))) {
3666 		/* struct io_stats_snapshot is quite large - avoid overflowing the stack. */
3667 		kcdata_compression_window_open(kcd);
3668 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr));
3669 		struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr;
3670 		_iostat->ss_disk_reads_count = task->task_io_stats->disk_reads.count;
3671 		_iostat->ss_disk_reads_size = task->task_io_stats->disk_reads.size;
3672 		_iostat->ss_disk_writes_count = (task->task_io_stats->total_io.count - task->task_io_stats->disk_reads.count);
3673 		_iostat->ss_disk_writes_size = (task->task_io_stats->total_io.size - task->task_io_stats->disk_reads.size);
3674 		_iostat->ss_paging_count = task->task_io_stats->paging.count;
3675 		_iostat->ss_paging_size = task->task_io_stats->paging.size;
3676 		_iostat->ss_non_paging_count = (task->task_io_stats->total_io.count - task->task_io_stats->paging.count);
3677 		_iostat->ss_non_paging_size = (task->task_io_stats->total_io.size - task->task_io_stats->paging.size);
3678 		_iostat->ss_metadata_count = task->task_io_stats->metadata.count;
3679 		_iostat->ss_metadata_size = task->task_io_stats->metadata.size;
3680 		_iostat->ss_data_count = (task->task_io_stats->total_io.count - task->task_io_stats->metadata.count);
3681 		_iostat->ss_data_size = (task->task_io_stats->total_io.size - task->task_io_stats->metadata.size);
3682 		for (int i = 0; i < IO_NUM_PRIORITIES; i++) {
3683 			_iostat->ss_io_priority_count[i] = task->task_io_stats->io_priority[i].count;
3684 			_iostat->ss_io_priority_size[i] = task->task_io_stats->io_priority[i].size;
3685 		}
3686 		kcd_exit_on_error(kcdata_compression_window_close(kcd));
3687 	}
3688 
3689 
3690 error_exit:
3691 	return error;
3692 }
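/*
 * Note: the snapshot above derives the disk-write, non-paging, and data
 * figures by subtracting the read, paging, and metadata counters from the
 * task's total_io counts; the remaining fields are copied directly from
 * task_io_stats.
 */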
3693 
3694 #if CONFIG_PERVASIVE_CPI
3695 static kern_return_t
3696 kcdata_record_task_instrs_cycles(kcdata_descriptor_t kcd, task_t task)
3697 {
3698 	struct instrs_cycles_snapshot_v2 instrs_cycles = { 0 };
3699 	struct recount_usage usage = { 0 };
3700 	struct recount_usage perf_only = { 0 };
3701 	recount_task_terminated_usage_perf_only(task, &usage, &perf_only);
3702 	instrs_cycles.ics_instructions = recount_usage_instructions(&usage);
3703 	instrs_cycles.ics_cycles = recount_usage_cycles(&usage);
3704 	instrs_cycles.ics_p_instructions = recount_usage_instructions(&perf_only);
3705 	instrs_cycles.ics_p_cycles = recount_usage_cycles(&perf_only);
3706 
3707 	return kcdata_push_data(kcd, STACKSHOT_KCTYPE_INSTRS_CYCLES, sizeof(instrs_cycles), &instrs_cycles);
3708 }
3709 #endif /* CONFIG_PERVASIVE_CPI */
3710 
3711 static kern_return_t
3712 kcdata_record_task_cpu_architecture(kcdata_descriptor_t kcd, task_t task)
3713 {
3714 	struct stackshot_cpu_architecture cpu_architecture = {0};
3715 	int32_t cputype;
3716 	int32_t cpusubtype;
3717 
3718 	proc_archinfo_kdp(get_bsdtask_info(task), &cputype, &cpusubtype);
3719 	cpu_architecture.cputype = cputype;
3720 	cpu_architecture.cpusubtype = cpusubtype;
3721 
3722 	return kcdata_push_data(kcd, STACKSHOT_KCTYPE_TASK_CPU_ARCHITECTURE, sizeof(struct stackshot_cpu_architecture), &cpu_architecture);
3723 }
3724 
3725 static kern_return_t
3726 kcdata_record_task_codesigning_info(kcdata_descriptor_t kcd, task_t task)
3727 {
3728 	struct stackshot_task_codesigning_info codesigning_info = {};
3729 	void * bsdtask_info = NULL;
3730 	uint32_t trust = 0;
3731 	kern_return_t ret = 0;
3732 	pmap_t pmap = get_task_pmap(task);
3733 	uint64_t cs_auxiliary_info = 0;
3734 	if (task != kernel_task) {
3735 		bsdtask_info = get_bsdtask_info(task);
3736 		codesigning_info.csflags = proc_getcsflags_kdp(bsdtask_info);
3737 		ret = get_trust_level_kdp(pmap, &trust);
3738 		if (ret != KERN_SUCCESS) {
3739 			trust = KCDATA_INVALID_CS_TRUST_LEVEL;
3740 		}
3741 		codesigning_info.cs_trust_level = trust;
3742 		cs_auxiliary_info = task_get_cs_auxiliary_info_kdp(task);
3743 	} else {
3744 		return KERN_SUCCESS;
3745 	}
3746 	ret = kcdata_push_data(kcd, STACKSHOT_KCTYPE_CODESIGNING_INFO, sizeof(struct stackshot_task_codesigning_info), &codesigning_info);
3747 	if (ret != KERN_SUCCESS) {
3748 		return ret;
3749 	}
3750 	return kcdata_push_data(kcd, TASK_CRASHINFO_CS_AUXILIARY_INFO, sizeof(cs_auxiliary_info), &cs_auxiliary_info);
3751 }
3752 
3753 static kern_return_t
3754 kcdata_record_task_jit_address_range(kcdata_descriptor_t kcd, task_t task)
3755 {
3756 	uint64_t jit_start_addr = 0;
3757 	uint64_t jit_end_addr = 0;
3758 	struct crashinfo_jit_address_range range = {};
3759 	kern_return_t ret = 0;
3760 	pmap_t pmap = get_task_pmap(task);
3761 	if (task == kernel_task || NULL == pmap) {
3762 		return KERN_SUCCESS;
3763 	}
3764 	ret = get_jit_address_range_kdp(pmap, (uintptr_t*)&jit_start_addr, (uintptr_t*)&jit_end_addr);
3765 	if (KERN_SUCCESS == ret) {
3766 		range.start_address = jit_start_addr;
3767 		range.end_address = jit_end_addr;
3768 		return kcdata_push_data(kcd, TASK_CRASHINFO_JIT_ADDRESS_RANGE, sizeof(struct crashinfo_jit_address_range), &range);
3769 	} else {
3770 		return KERN_SUCCESS;
3771 	}
3772 }
3773 
3774 #if CONFIG_TASK_SUSPEND_STATS
3775 static kern_return_t
3776 kcdata_record_task_suspension_info(kcdata_descriptor_t kcd, task_t task)
3777 {
3778 	kern_return_t ret = KERN_SUCCESS;
3779 	struct stackshot_suspension_info suspension_info = {};
3780 	task_suspend_stats_data_t suspend_stats;
3781 	task_suspend_source_array_t suspend_sources;
3782 	struct stackshot_suspension_source suspension_sources[TASK_SUSPEND_SOURCES_MAX];
3783 	int i;
3784 
3785 	if (task == kernel_task) {
3786 		return KERN_SUCCESS;
3787 	}
3788 
3789 	ret = task_get_suspend_stats_kdp(task, &suspend_stats);
3790 	if (ret != KERN_SUCCESS) {
3791 		return ret;
3792 	}
3793 
3794 	suspension_info.tss_count = suspend_stats.tss_count;
3795 	suspension_info.tss_duration = suspend_stats.tss_duration;
3796 	suspension_info.tss_last_end = suspend_stats.tss_last_end;
3797 	suspension_info.tss_last_start = suspend_stats.tss_last_start;
3798 	ret = kcdata_push_data(kcd, STACKSHOT_KCTYPE_SUSPENSION_INFO, sizeof(suspension_info), &suspension_info);
3799 	if (ret != KERN_SUCCESS) {
3800 		return ret;
3801 	}
3802 
3803 	ret = task_get_suspend_sources_kdp(task, suspend_sources);
3804 	if (ret != KERN_SUCCESS) {
3805 		return ret;
3806 	}
3807 
3808 	for (i = 0; i < TASK_SUSPEND_SOURCES_MAX; ++i) {
3809 		suspension_sources[i].tss_pid = suspend_sources[i].tss_pid;
3810 		strlcpy(suspension_sources[i].tss_procname, suspend_sources[i].tss_procname, sizeof(suspend_sources[i].tss_procname));
3811 		suspension_sources[i].tss_tid = suspend_sources[i].tss_tid;
3812 		suspension_sources[i].tss_time = suspend_sources[i].tss_time;
3813 	}
3814 	return kcdata_push_array(kcd, STACKSHOT_KCTYPE_SUSPENSION_SOURCE, sizeof(suspension_sources[0]), TASK_SUSPEND_SOURCES_MAX, &suspension_sources);
3815 }
3816 #endif /* CONFIG_TASK_SUSPEND_STATS */
3817 
3818 static kern_return_t
3819 kcdata_record_transitioning_task_snapshot(kcdata_descriptor_t kcd, task_t task, unaligned_u64 task_snap_ss_flags, uint64_t transition_type)
3820 {
3821 	kern_return_t error                 = KERN_SUCCESS;
3822 	mach_vm_address_t out_addr          = 0;
3823 	struct transitioning_task_snapshot * cur_tsnap = NULL;
3824 
3825 	int task_pid           = pid_from_task(task);
3826 	/* Is returning -1 ok for a terminating task? */
3827 	uint64_t task_uniqueid = get_task_uniqueid(task);
3828 
3829 	if (task_pid && (task_did_exec_internal(task) || task_is_exec_copy_internal(task))) {
3830 		/*
3831 		 * If this task is in transition from another task (exec in progress
3832 		 * or an exec copy), report the pid as negative.
3833 		 */
3834 		task_pid = 0 - task_pid;
3835 	}
3836 
3837 	/* the transitioning_task_snapshot struct is large - avoid overflowing the stack */
3838 	kcdata_compression_window_open(kcd);
3839 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TRANSITIONING_TASK_SNAPSHOT, sizeof(struct transitioning_task_snapshot), &out_addr));
3840 	cur_tsnap = (struct transitioning_task_snapshot *)out_addr;
3841 	bzero(cur_tsnap, sizeof(*cur_tsnap));
3842 
3843 	cur_tsnap->tts_unique_pid = task_uniqueid;
3844 	cur_tsnap->tts_ss_flags = kcdata_get_task_ss_flags(task);
3845 	cur_tsnap->tts_ss_flags |= task_snap_ss_flags;
3846 	cur_tsnap->tts_transition_type = transition_type;
3847 	cur_tsnap->tts_pid = task_pid;
3848 
3849 	/* Add the BSD process identifiers */
3850 	if (task_pid != -1 && get_bsdtask_info(task) != NULL) {
3851 		proc_name_kdp(get_bsdtask_info(task), cur_tsnap->tts_p_comm, sizeof(cur_tsnap->tts_p_comm));
3852 	} else {
3853 		cur_tsnap->tts_p_comm[0] = '\0';
3854 	}
3855 
3856 	kcd_exit_on_error(kcdata_compression_window_close(kcd));
3857 
3858 error_exit:
3859 	return error;
3860 }
3861 
3862 static kern_return_t
3863 #if STACKSHOT_COLLECTS_LATENCY_INFO
3864 kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags, struct stackshot_latency_task *latency_info)
3865 #else
3866 kcdata_record_task_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags)
3867 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3868 {
3869 	bool collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
3870 	bool collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
3871 #if CONFIG_PERVASIVE_CPI
3872 	bool collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
3873 #endif /* CONFIG_PERVASIVE_CPI */
3874 #if __arm64__
3875 	bool collect_asid            = ((trace_flags & STACKSHOT_ASID) != 0);
3876 #endif
3877 	bool collect_pagetables      = ((trace_flags & STACKSHOT_PAGE_TABLES) != 0);
3878 
3879 
3880 	kern_return_t error                 = KERN_SUCCESS;
3881 	mach_vm_address_t out_addr          = 0;
3882 	struct task_snapshot_v2 * cur_tsnap = NULL;
3883 #if STACKSHOT_COLLECTS_LATENCY_INFO
3884 	latency_info->cur_tsnap_latency = mach_absolute_time();
3885 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3886 
3887 	int task_pid           = pid_from_task(task);
3888 	uint64_t task_uniqueid = get_task_uniqueid(task);
3889 	void *bsd_info = get_bsdtask_info(task);
3890 	uint64_t proc_starttime_secs = 0;
3891 
3892 	if (task_pid && (task_did_exec_internal(task) || task_is_exec_copy_internal(task))) {
3893 		/*
3894 		 * If this task is in transition from another task (exec in progress
3895 		 * or an exec copy), report the pid as negative.
3896 		 */
3897 		task_pid = 0 - task_pid;
3898 	}
3899 
3900 	/* the task_snapshot_v2 struct is large - avoid overflowing the stack */
3901 	kcdata_compression_window_open(kcd);
3902 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_SNAPSHOT, sizeof(struct task_snapshot_v2), &out_addr));
3903 	cur_tsnap = (struct task_snapshot_v2 *)out_addr;
3904 	bzero(cur_tsnap, sizeof(*cur_tsnap));
3905 
3906 	cur_tsnap->ts_unique_pid = task_uniqueid;
3907 	cur_tsnap->ts_ss_flags = kcdata_get_task_ss_flags(task);
3908 	cur_tsnap->ts_ss_flags |= task_snap_ss_flags;
3909 
3910 	struct recount_usage term_usage = { 0 };
3911 	recount_task_terminated_usage(task, &term_usage);
3912 	struct recount_times_mach term_times = recount_usage_times_mach(&term_usage);
3913 	cur_tsnap->ts_user_time_in_terminated_threads = term_times.rtm_user;
3914 	cur_tsnap->ts_system_time_in_terminated_threads = term_times.rtm_system;
3915 
3916 	proc_starttime_kdp(bsd_info, &proc_starttime_secs, NULL, NULL);
3917 	cur_tsnap->ts_p_start_sec = proc_starttime_secs;
3918 	cur_tsnap->ts_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
3919 	cur_tsnap->ts_max_resident_size = get_task_resident_max(task);
3920 	cur_tsnap->ts_was_throttled = (uint32_t) proc_was_throttled_from_task(task);
3921 	cur_tsnap->ts_did_throttle = (uint32_t) proc_did_throttle_from_task(task);
3922 
3923 	cur_tsnap->ts_suspend_count = task->suspend_count;
3924 	cur_tsnap->ts_faults = counter_load(&task->faults);
3925 	cur_tsnap->ts_pageins = counter_load(&task->pageins);
3926 	cur_tsnap->ts_cow_faults = counter_load(&task->cow_faults);
3927 	cur_tsnap->ts_latency_qos = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED) ?
3928 	    LATENCY_QOS_TIER_UNSPECIFIED : ((0xFF << 16) | task->effective_policy.tep_latency_qos);
3929 	cur_tsnap->ts_pid = task_pid;
3930 
3931 	/* Add the BSD process identifiers */
3932 	if (task_pid != -1 && bsd_info != NULL) {
3933 		proc_name_kdp(bsd_info, cur_tsnap->ts_p_comm, sizeof(cur_tsnap->ts_p_comm));
3934 	} else {
3935 		cur_tsnap->ts_p_comm[0] = '\0';
3936 #if IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG)
3937 		if (task->task_imp_base != NULL) {
3938 			kdp_strlcpy(cur_tsnap->ts_p_comm, &task->task_imp_base->iit_procname[0],
3939 			    MIN((int)sizeof(task->task_imp_base->iit_procname), (int)sizeof(cur_tsnap->ts_p_comm)));
3940 		}
3941 #endif /* IMPORTANCE_INHERITANCE && (DEVELOPMENT || DEBUG) */
3942 	}
3943 
3944 	kcd_exit_on_error(kcdata_compression_window_close(kcd));
3945 
3946 #if CONFIG_COALITIONS
3947 	if (task_pid != -1 && bsd_info != NULL &&
3948 	    (task->coalition[COALITION_TYPE_JETSAM] != NULL)) {
3949 		/*
3950 		 * The jetsam coalition ID is always saved, even if
3951 		 * STACKSHOT_SAVE_JETSAM_COALITIONS is not set.
3952 		 */
3953 		uint64_t jetsam_coal_id = coalition_id(task->coalition[COALITION_TYPE_JETSAM]);
3954 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_JETSAM_COALITION, sizeof(jetsam_coal_id), &jetsam_coal_id));
3955 	}
3956 #endif /* CONFIG_COALITIONS */
3957 
3958 #if __arm64__
3959 	if (collect_asid && have_pmap) {
3960 		uint32_t asid = PMAP_VASID(task->map->pmap);
3961 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_ASID, sizeof(asid), &asid));
3962 	}
3963 #endif
3964 
3965 #if STACKSHOT_COLLECTS_LATENCY_INFO
3966 	latency_info->cur_tsnap_latency = mach_absolute_time() - latency_info->cur_tsnap_latency;
3967 	latency_info->pmap_latency = mach_absolute_time();
3968 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3969 
3970 	if (collect_pagetables && have_pmap) {
3971 #if SCHED_HYGIENE_DEBUG
3972 		// pagetable dumps can be large; reset the interrupt timeout to avoid a panic
3973 		ml_spin_debug_clear_self();
3974 #endif
3975 		assert(stackshot_ctx.sc_is_singlethreaded);
3976 		size_t bytes_dumped = 0;
3977 		error = pmap_dump_page_tables(task->map->pmap, kcd_end_address(kcd), kcd_max_address(kcd), stackshot_args.pagetable_mask, &bytes_dumped);
3978 		if (error != KERN_SUCCESS) {
3979 			goto error_exit;
3980 		} else {
3981 			/* Variable size array - better not have it on the stack. */
3982 			kcdata_compression_window_open(kcd);
3983 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(kcd, STACKSHOT_KCTYPE_PAGE_TABLES,
3984 			    sizeof(uint64_t), (uint32_t)(bytes_dumped / sizeof(uint64_t)), &out_addr));
3985 			kcd_exit_on_error(kcdata_compression_window_close(kcd));
3986 		}
3987 	}
3988 
3989 #if STACKSHOT_COLLECTS_LATENCY_INFO
3990 	latency_info->pmap_latency = mach_absolute_time() - latency_info->pmap_latency;
3991 	latency_info->bsd_proc_ids_latency = mach_absolute_time();
3992 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3993 
3994 #if STACKSHOT_COLLECTS_LATENCY_INFO
3995 	latency_info->bsd_proc_ids_latency = mach_absolute_time() - latency_info->bsd_proc_ids_latency;
3996 	latency_info->end_latency = mach_absolute_time();
3997 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
3998 
3999 	if (collect_iostats) {
4000 		kcd_exit_on_error(kcdata_record_task_iostats(kcd, task));
4001 	}
4002 
4003 #if CONFIG_PERVASIVE_CPI
4004 	if (collect_instrs_cycles) {
4005 		kcd_exit_on_error(kcdata_record_task_instrs_cycles(kcd, task));
4006 	}
4007 #endif /* CONFIG_PERVASIVE_CPI */
4008 
4009 	kcd_exit_on_error(kcdata_record_task_cpu_architecture(kcd, task));
4010 	kcd_exit_on_error(kcdata_record_task_codesigning_info(kcd, task));
4011 	kcd_exit_on_error(kcdata_record_task_jit_address_range(kcd, task));
4012 
4013 #if CONFIG_TASK_SUSPEND_STATS
4014 	kcd_exit_on_error(kcdata_record_task_suspension_info(kcd, task));
4015 #endif /* CONFIG_TASK_SUSPEND_STATS */
4016 
4017 #if STACKSHOT_COLLECTS_LATENCY_INFO
4018 	latency_info->end_latency = mach_absolute_time() - latency_info->end_latency;
4019 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4020 
4021 error_exit:
4022 	return error;
4023 }
4024 
4025 static kern_return_t
4026 kcdata_record_task_delta_snapshot(kcdata_descriptor_t kcd, task_t task, uint64_t trace_flags, boolean_t have_pmap, unaligned_u64 task_snap_ss_flags)
4027 {
4028 #if !CONFIG_PERVASIVE_CPI
4029 #pragma unused(trace_flags)
4030 #endif /* !CONFIG_PERVASIVE_CPI */
4031 	kern_return_t error                       = KERN_SUCCESS;
4032 	struct task_delta_snapshot_v2 * cur_tsnap = NULL;
4033 	mach_vm_address_t out_addr                = 0;
4034 	(void) trace_flags;
4035 #if __arm64__
4036 	boolean_t collect_asid                    = ((trace_flags & STACKSHOT_ASID) != 0);
4037 #endif
4038 #if CONFIG_PERVASIVE_CPI
4039 	boolean_t collect_instrs_cycles           = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
4040 #endif /* CONFIG_PERVASIVE_CPI */
4041 
4042 	uint64_t task_uniqueid = get_task_uniqueid(task);
4043 
4044 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_TASK_DELTA_SNAPSHOT, sizeof(struct task_delta_snapshot_v2), &out_addr));
4045 
4046 	cur_tsnap = (struct task_delta_snapshot_v2 *)out_addr;
4047 
4048 	cur_tsnap->tds_unique_pid = task_uniqueid;
4049 	cur_tsnap->tds_ss_flags = kcdata_get_task_ss_flags(task);
4050 	cur_tsnap->tds_ss_flags |= task_snap_ss_flags;
4051 
4052 	struct recount_usage usage = { 0 };
4053 	recount_task_terminated_usage(task, &usage);
4054 	struct recount_times_mach term_times = recount_usage_times_mach(&usage);
4055 
4056 	cur_tsnap->tds_user_time_in_terminated_threads = term_times.rtm_user;
4057 	cur_tsnap->tds_system_time_in_terminated_threads = term_times.rtm_system;
4058 
4059 	cur_tsnap->tds_task_size = have_pmap ? get_task_phys_footprint(task) : 0;
4060 
4061 	cur_tsnap->tds_max_resident_size = get_task_resident_max(task);
4062 	cur_tsnap->tds_suspend_count = task->suspend_count;
4063 	cur_tsnap->tds_faults            = counter_load(&task->faults);
4064 	cur_tsnap->tds_pageins           = counter_load(&task->pageins);
4065 	cur_tsnap->tds_cow_faults        = counter_load(&task->cow_faults);
4066 	cur_tsnap->tds_was_throttled     = (uint32_t)proc_was_throttled_from_task(task);
4067 	cur_tsnap->tds_did_throttle      = (uint32_t)proc_did_throttle_from_task(task);
4068 	cur_tsnap->tds_latency_qos       = (task->effective_policy.tep_latency_qos == LATENCY_QOS_TIER_UNSPECIFIED)
4069 	    ? LATENCY_QOS_TIER_UNSPECIFIED
4070 	    : ((0xFF << 16) | task->effective_policy.tep_latency_qos);
4071 
4072 #if __arm64__
4073 	if (collect_asid && have_pmap) {
4074 		uint32_t asid = PMAP_VASID(task->map->pmap);
4075 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_ASID, sizeof(uint32_t), &out_addr));
4076 		kdp_memcpy((void*)out_addr, &asid, sizeof(asid));
4077 	}
4078 #endif
4079 
4080 #if CONFIG_PERVASIVE_CPI
4081 	if (collect_instrs_cycles) {
4082 		kcd_exit_on_error(kcdata_record_task_instrs_cycles(kcd, task));
4083 	}
4084 #endif /* CONFIG_PERVASIVE_CPI */
4085 
4086 error_exit:
4087 	return error;
4088 }
4089 
4090 static kern_return_t
4091 kcdata_record_thread_iostats(kcdata_descriptor_t kcd, thread_t thread)
4092 {
4093 	kern_return_t error = KERN_SUCCESS;
4094 	mach_vm_address_t out_addr = 0;
4095 
4096 	/* I/O Statistics */
4097 	assert(IO_NUM_PRIORITIES == STACKSHOT_IO_NUM_PRIORITIES);
4098 	if (thread->thread_io_stats && !memory_iszero(thread->thread_io_stats, sizeof(struct io_stat_info))) {
4099 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_IOSTATS, sizeof(struct io_stats_snapshot), &out_addr));
4100 		struct io_stats_snapshot *_iostat = (struct io_stats_snapshot *)out_addr;
4101 		_iostat->ss_disk_reads_count = thread->thread_io_stats->disk_reads.count;
4102 		_iostat->ss_disk_reads_size = thread->thread_io_stats->disk_reads.size;
4103 		_iostat->ss_disk_writes_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->disk_reads.count);
4104 		_iostat->ss_disk_writes_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->disk_reads.size);
4105 		_iostat->ss_paging_count = thread->thread_io_stats->paging.count;
4106 		_iostat->ss_paging_size = thread->thread_io_stats->paging.size;
4107 		_iostat->ss_non_paging_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->paging.count);
4108 		_iostat->ss_non_paging_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->paging.size);
4109 		_iostat->ss_metadata_count = thread->thread_io_stats->metadata.count;
4110 		_iostat->ss_metadata_size = thread->thread_io_stats->metadata.size;
4111 		_iostat->ss_data_count = (thread->thread_io_stats->total_io.count - thread->thread_io_stats->metadata.count);
4112 		_iostat->ss_data_size = (thread->thread_io_stats->total_io.size - thread->thread_io_stats->metadata.size);
4113 		for (int i = 0; i < IO_NUM_PRIORITIES; i++) {
4114 			_iostat->ss_io_priority_count[i] = thread->thread_io_stats->io_priority[i].count;
4115 			_iostat->ss_io_priority_size[i] = thread->thread_io_stats->io_priority[i].size;
4116 		}
4117 	}
4118 
4119 error_exit:
4120 	return error;
4121 }
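/*
 * Worked example of the derived fields above (illustrative numbers only):
 * with total_io.count == 100, disk_reads.count == 30, paging.count == 5 and
 * metadata.count == 10, the snapshot records ss_disk_writes_count == 70,
 * ss_non_paging_count == 95 and ss_data_count == 90; writes, non-paging and
 * data are always computed as "total minus the tracked category".
 */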
4122 
4123 bool
4124 machine_trace_thread_validate_kva(vm_offset_t addr)
4125 {
4126 	return _stackshot_validate_kva(addr, sizeof(uintptr_t));
4127 }
4128 
4129 struct _stackshot_backtrace_context {
4130 	vm_map_t sbc_map;
4131 	vm_offset_t sbc_prev_page;
4132 	vm_offset_t sbc_prev_kva;
4133 	uint32_t sbc_flags;
4134 	bool sbc_allow_faulting;
4135 };
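/*
 * Descriptive note (inferred from _stackshot_backtrace_copy() below):
 * sbc_prev_page/sbc_prev_kva cache the most recently translated user page, so
 * consecutive frame reads that land on the same page skip the physical
 * lookup; sbc_flags accumulates the kThread*BT fault flags that the caller
 * later merges into ths_ss_flags.
 */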
4136 
4137 static errno_t
4138 _stackshot_backtrace_copy(void *vctx, void *dst, user_addr_t src, size_t size)
4139 {
4140 	struct _stackshot_backtrace_context *ctx = vctx;
4141 	size_t map_page_mask = 0;
4142 	size_t __assert_only map_page_size = kdp_vm_map_get_page_size(ctx->sbc_map,
4143 	    &map_page_mask);
4144 	assert(size < map_page_size);
4145 	if (src & (size - 1)) {
4146 		// The source should be aligned to the size passed in, like a stack
4147 		// frame or word.
4148 		return EINVAL;
4149 	}
4150 
4151 	vm_offset_t src_page = src & ~map_page_mask;
4152 	vm_offset_t src_kva = 0;
4153 
4154 	if (src_page != ctx->sbc_prev_page) {
4155 		uint32_t res = 0;
4156 		uint32_t flags = 0;
4157 		vm_offset_t src_pa = stackshot_find_phys(ctx->sbc_map, src,
4158 		    ctx->sbc_allow_faulting, &res);
4159 
4160 		flags |= (res & KDP_FAULT_RESULT_PAGED_OUT) ? kThreadTruncatedBT : 0;
4161 		flags |= (res & KDP_FAULT_RESULT_TRIED_FAULT) ? kThreadTriedFaultBT : 0;
4162 		flags |= (res & KDP_FAULT_RESULT_FAULTED_IN) ? kThreadFaultedBT : 0;
4163 		ctx->sbc_flags |= flags;
4164 		if (src_pa == 0) {
4165 			return EFAULT;
4166 		}
4167 
4168 		src_kva = phystokv(src_pa);
4169 		ctx->sbc_prev_page = src_page;
4170 		ctx->sbc_prev_kva = (src_kva & ~map_page_mask);
4171 	} else {
4172 		src_kva = ctx->sbc_prev_kva + (src & map_page_mask);
4173 	}
4174 
4175 #if KASAN
4176 	/*
4177 	 * KASan does not monitor accesses to userspace pages. Therefore, it is
4178 	 * pointless to maintain a shadow map for them. Instead, they are all
4179 	 * mapped to a single, always valid shadow map page. This approach saves
4180 	 * a considerable amount of shadow map pages which are limited and
4181 	 * precious.
4182 	 */
4183 	kasan_notify_address_nopoison(src_kva, size);
4184 #endif
4185 	memcpy(dst, (const void *)src_kva, size);
4186 
4187 	return 0;
4188 }
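/*
 * For illustration: the `src & (size - 1)` check in _stackshot_backtrace_copy()
 * above assumes size is a power of two, e.g. an 8-byte frame read at
 * src == 0x7fff5fbff8c4 yields 0x4 and is rejected with EINVAL, while
 * 0x7fff5fbff8c8 passes.
 */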
4189 
4190 static kern_return_t
4191 kcdata_record_thread_snapshot(kcdata_descriptor_t kcd, thread_t thread, task_t task, uint64_t trace_flags, boolean_t have_pmap, boolean_t thread_on_core)
4192 {
4193 	boolean_t dispatch_p              = ((trace_flags & STACKSHOT_GET_DQ) != 0);
4194 	boolean_t active_kthreads_only_p  = ((trace_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
4195 	boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
4196 	boolean_t collect_iostats         = !collect_delta_stackshot && !(trace_flags & STACKSHOT_NO_IO_STATS);
4197 #if CONFIG_PERVASIVE_CPI
4198 	boolean_t collect_instrs_cycles   = ((trace_flags & STACKSHOT_INSTRS_CYCLES) != 0);
4199 #endif /* CONFIG_PERVASIVE_CPI */
4200 	kern_return_t error        = KERN_SUCCESS;
4201 
4202 #if STACKSHOT_COLLECTS_LATENCY_INFO
4203 	struct stackshot_latency_thread latency_info;
4204 	latency_info.cur_thsnap1_latency = mach_absolute_time();
4205 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4206 
4207 	mach_vm_address_t out_addr = 0;
4208 	int saved_count            = 0;
4209 
4210 	struct thread_snapshot_v4 * cur_thread_snap = NULL;
4211 	char cur_thread_name[STACKSHOT_MAX_THREAD_NAME_SIZE];
4212 
4213 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_SNAPSHOT, sizeof(struct thread_snapshot_v4), &out_addr));
4214 	cur_thread_snap = (struct thread_snapshot_v4 *)out_addr;
4215 
4216 	/* Populate the thread snapshot header */
4217 	cur_thread_snap->ths_ss_flags = 0;
4218 	cur_thread_snap->ths_thread_id = thread_tid(thread);
4219 	cur_thread_snap->ths_wait_event = VM_KERNEL_UNSLIDE_OR_PERM(thread->wait_event);
4220 	cur_thread_snap->ths_continuation = VM_KERNEL_UNSLIDE(thread->continuation);
4221 	cur_thread_snap->ths_total_syscalls = thread->syscalls_mach + thread->syscalls_unix;
4222 
4223 	if (IPC_VOUCHER_NULL != thread->ith_voucher) {
4224 		cur_thread_snap->ths_voucher_identifier = VM_KERNEL_ADDRPERM(thread->ith_voucher);
4225 	} else {
4226 		cur_thread_snap->ths_voucher_identifier = 0;
4227 	}
4228 
4229 #if STACKSHOT_COLLECTS_LATENCY_INFO
4230 	latency_info.cur_thsnap1_latency = mach_absolute_time() - latency_info.cur_thsnap1_latency;
4231 	latency_info.dispatch_serial_latency = mach_absolute_time();
4232 	latency_info.dispatch_label_latency = 0;
4233 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4234 
4235 	cur_thread_snap->ths_dqserialnum = 0;
4236 	if (dispatch_p && (task != kernel_task) && (task->active) && have_pmap) {
4237 		uint64_t dqkeyaddr = thread_dispatchqaddr(thread);
4238 		if (dqkeyaddr != 0) {
4239 			uint64_t dqaddr = 0;
4240 			boolean_t copyin_ok = stackshot_copyin_word(task, dqkeyaddr, &dqaddr, FALSE, NULL);
4241 			if (copyin_ok && dqaddr != 0) {
4242 				uint64_t dqserialnumaddr = dqaddr + get_task_dispatchqueue_serialno_offset(task);
4243 				uint64_t dqserialnum = 0;
4244 				copyin_ok = stackshot_copyin_word(task, dqserialnumaddr, &dqserialnum, FALSE, NULL);
4245 				if (copyin_ok) {
4246 					cur_thread_snap->ths_ss_flags |= kHasDispatchSerial;
4247 					cur_thread_snap->ths_dqserialnum = dqserialnum;
4248 				}
4249 
4250 #if STACKSHOT_COLLECTS_LATENCY_INFO
4251 				latency_info.dispatch_serial_latency = mach_absolute_time() - latency_info.dispatch_serial_latency;
4252 				latency_info.dispatch_label_latency = mach_absolute_time();
4253 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4254 
4255 				/* try copying in the queue label */
4256 				uint64_t label_offs = get_task_dispatchqueue_label_offset(task);
4257 				if (label_offs) {
4258 					uint64_t dqlabeladdr = dqaddr + label_offs;
4259 					uint64_t actual_dqlabeladdr = 0;
4260 
4261 					copyin_ok = stackshot_copyin_word(task, dqlabeladdr, &actual_dqlabeladdr, FALSE, NULL);
4262 					if (copyin_ok && actual_dqlabeladdr != 0) {
4263 						char label_buf[STACKSHOT_QUEUE_LABEL_MAXSIZE];
4264 						int len;
4265 
4266 						bzero(label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE * sizeof(char));
4267 						len = stackshot_copyin_string(task, actual_dqlabeladdr, label_buf, STACKSHOT_QUEUE_LABEL_MAXSIZE, FALSE, NULL);
4268 						if (len > 0) {
4269 							mach_vm_address_t label_addr = 0;
4270 							kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_DISPATCH_QUEUE_LABEL, len, &label_addr));
4271 							kdp_strlcpy((char*)label_addr, &label_buf[0], len);
4272 						}
4273 					}
4274 				}
4275 #if STACKSHOT_COLLECTS_LATENCY_INFO
4276 				latency_info.dispatch_label_latency = mach_absolute_time() - latency_info.dispatch_label_latency;
4277 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4278 			}
4279 		}
4280 	}
4281 
4282 #if STACKSHOT_COLLECTS_LATENCY_INFO
4283 	if ((cur_thread_snap->ths_ss_flags & kHasDispatchSerial) == 0) {
4284 		latency_info.dispatch_serial_latency = 0;
4285 	}
4286 	latency_info.cur_thsnap2_latency = mach_absolute_time();
4287 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4288 
4289 	struct recount_times_mach times = recount_thread_times(thread);
4290 	cur_thread_snap->ths_user_time = times.rtm_user;
4291 	cur_thread_snap->ths_sys_time = times.rtm_system;
4292 
4293 	if (thread->thread_tag & THREAD_TAG_MAINTHREAD) {
4294 		cur_thread_snap->ths_ss_flags |= kThreadMain;
4295 	}
4296 	if (thread->effective_policy.thep_darwinbg) {
4297 		cur_thread_snap->ths_ss_flags |= kThreadDarwinBG;
4298 	}
4299 	if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) {
4300 		cur_thread_snap->ths_ss_flags |= kThreadIOPassive;
4301 	}
4302 	if (thread->suspend_count > 0) {
4303 		cur_thread_snap->ths_ss_flags |= kThreadSuspended;
4304 	}
4305 	if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
4306 		cur_thread_snap->ths_ss_flags |= kGlobalForcedIdle;
4307 	}
4308 #if CONFIG_EXCLAVES
4309 	/* save exclave thread for later collection */
4310 	if ((thread->th_exclaves_state & TH_EXCLAVES_RPC) && stackshot_exclave_inspect_ctids && !stackshot_ctx.sc_panic_stackshot) {
4311 		/* certain threads, like the collector, must never be inspected */
4312 		if ((os_atomic_load(&thread->th_exclaves_inspection_state, relaxed) & TH_EXCLAVES_INSPECTION_NOINSPECT) == 0) {
4313 			uint32_t ctid_index = os_atomic_inc_orig(&stackshot_exclave_inspect_ctid_count, acq_rel);
4314 			if (ctid_index < stackshot_exclave_inspect_ctid_capacity) {
4315 				stackshot_exclave_inspect_ctids[ctid_index] = thread_get_ctid(thread);
4316 			} else {
4317 				os_atomic_store(&stackshot_exclave_inspect_ctid_count, stackshot_exclave_inspect_ctid_capacity, release);
4318 			}
4319 			if ((os_atomic_load(&thread->th_exclaves_inspection_state, relaxed) & TH_EXCLAVES_INSPECTION_STACKSHOT) != 0) {
4320 				panic("stackshot: trying to inspect already-queued thread");
4321 			}
4322 		}
4323 	}
4324 #endif /* CONFIG_EXCLAVES */
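	/*
	 * Illustrative reading of the exclave block above: os_atomic_inc_orig()
	 * returns the pre-increment count, so each eligible thread reserves one
	 * slot in stackshot_exclave_inspect_ctids; e.g. with a capacity of 4, the
	 * fifth reserving thread sees index 4, drops its ctid, and the count is
	 * clamped back to the capacity.
	 */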
4325 	if (thread_on_core) {
4326 		cur_thread_snap->ths_ss_flags |= kThreadOnCore;
4327 	}
4328 	if (stackshot_thread_is_idle_worker_unsafe(thread)) {
4329 		cur_thread_snap->ths_ss_flags |= kThreadIdleWorker;
4330 	}
4331 
4332 	/* make sure state flags defined in kcdata.h still match internal flags */
4333 	static_assert(SS_TH_WAIT == TH_WAIT);
4334 	static_assert(SS_TH_SUSP == TH_SUSP);
4335 	static_assert(SS_TH_RUN == TH_RUN);
4336 	static_assert(SS_TH_UNINT == TH_UNINT);
4337 	static_assert(SS_TH_TERMINATE == TH_TERMINATE);
4338 	static_assert(SS_TH_TERMINATE2 == TH_TERMINATE2);
4339 	static_assert(SS_TH_IDLE == TH_IDLE);
4340 
4341 	cur_thread_snap->ths_last_run_time           = thread->last_run_time;
4342 	cur_thread_snap->ths_last_made_runnable_time = thread->last_made_runnable_time;
4343 	cur_thread_snap->ths_state                   = thread->state;
4344 	cur_thread_snap->ths_sched_flags             = thread->sched_flags;
4345 	cur_thread_snap->ths_base_priority = thread->base_pri;
4346 	cur_thread_snap->ths_sched_priority = thread->sched_pri;
4347 	cur_thread_snap->ths_eqos = thread->effective_policy.thep_qos;
4348 	cur_thread_snap->ths_rqos = thread->requested_policy.thrp_qos;
4349 	cur_thread_snap->ths_rqos_override = MAX(thread->requested_policy.thrp_qos_override,
4350 	    thread->requested_policy.thrp_qos_workq_override);
4351 	cur_thread_snap->ths_io_tier = (uint8_t) proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
4352 	cur_thread_snap->ths_thread_t = VM_KERNEL_UNSLIDE_OR_PERM(thread);
4353 
4354 	static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t));
4355 	static_assert(sizeof(thread->requested_policy) == sizeof(uint64_t));
4356 	cur_thread_snap->ths_requested_policy = *(unaligned_u64 *) &thread->requested_policy;
4357 	cur_thread_snap->ths_effective_policy = *(unaligned_u64 *) &thread->effective_policy;
4358 
4359 #if STACKSHOT_COLLECTS_LATENCY_INFO
4360 	latency_info.cur_thsnap2_latency = mach_absolute_time()  - latency_info.cur_thsnap2_latency;
4361 	latency_info.thread_name_latency = mach_absolute_time();
4362 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4363 
4364 	/* if there is a thread name, add it to the buffer */
4365 	cur_thread_name[0] = '\0';
4366 	proc_threadname_kdp(get_bsdthread_info(thread), cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE);
4367 	if (strnlen(cur_thread_name, STACKSHOT_MAX_THREAD_NAME_SIZE) > 0) {
4368 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_NAME, sizeof(cur_thread_name), &out_addr));
4369 		kdp_memcpy((void *)out_addr, (void *)cur_thread_name, sizeof(cur_thread_name));
4370 	}
4371 
4372 #if STACKSHOT_COLLECTS_LATENCY_INFO
4373 	latency_info.thread_name_latency = mach_absolute_time()  - latency_info.thread_name_latency;
4374 	latency_info.sur_times_latency = mach_absolute_time();
4375 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4376 
4377 	/* record system, user, and runnable times */
4378 	time_value_t runnable_time;
4379 	thread_read_times(thread, NULL, NULL, &runnable_time);
4380 	clock_sec_t user_sec = 0, system_sec = 0;
4381 	clock_usec_t user_usec = 0, system_usec = 0;
4382 	absolutetime_to_microtime(times.rtm_user, &user_sec, &user_usec);
4383 	absolutetime_to_microtime(times.rtm_system, &system_sec, &system_usec);
4384 
4385 	kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_CPU_TIMES, sizeof(struct stackshot_cpu_times_v2), &out_addr));
4386 	struct stackshot_cpu_times_v2 *stackshot_cpu_times = (struct stackshot_cpu_times_v2 *)out_addr;
4387 	*stackshot_cpu_times = (struct stackshot_cpu_times_v2){
4388 		.user_usec = user_sec * USEC_PER_SEC + user_usec,
4389 		.system_usec = system_sec * USEC_PER_SEC + system_usec,
4390 		.runnable_usec = (uint64_t)runnable_time.seconds * USEC_PER_SEC + runnable_time.microseconds,
4391 	};
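	/*
	 * Worked conversion example (illustrative values): if
	 * absolutetime_to_microtime() splits the user time into user_sec == 2 and
	 * user_usec == 340, the snapshot records user_usec = 2 * USEC_PER_SEC +
	 * 340 = 2000340, i.e. each field above is a single microsecond total.
	 */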
4392 
4393 #if STACKSHOT_COLLECTS_LATENCY_INFO
4394 	latency_info.sur_times_latency = mach_absolute_time()  - latency_info.sur_times_latency;
4395 	latency_info.user_stack_latency = mach_absolute_time();
4396 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4397 
4398 	/* Trace user stack, if any */
4399 	if (!active_kthreads_only_p && task->active && task->map != kernel_map) {
4400 		uint32_t user_ths_ss_flags = 0;
4401 
4402 		/*
4403 		 * We don't know how big the stacktrace will be, so read it into our
4404 		 * per-cpu buffer, then copy it to the kcdata.
4405 		 */
4406 		struct _stackshot_backtrace_context ctx = {
4407 			.sbc_map = task->map,
4408 			.sbc_allow_faulting = stackshot_ctx.sc_enable_faulting,
4409 			.sbc_prev_page = -1,
4410 			.sbc_prev_kva = -1,
4411 		};
4412 		struct backtrace_control ctl = {
4413 			.btc_user_thread = thread,
4414 			.btc_user_copy = _stackshot_backtrace_copy,
4415 			.btc_user_copy_context = &ctx,
4416 		};
4417 		struct backtrace_user_info info = BTUINFO_INIT;
4418 
4419 		saved_count = backtrace_user(stackshot_cpu_ctx.scc_stack_buffer, MAX_FRAMES, &ctl,
4420 		    &info);
4421 		if (saved_count > 0) {
4422 #if __LP64__
4423 #define STACKLR_WORDS STACKSHOT_KCTYPE_USER_STACKLR64
4424 #else // __LP64__
4425 #define STACKLR_WORDS STACKSHOT_KCTYPE_USER_STACKLR
4426 #endif // !__LP64__
4427 			/* Now, copy the stacktrace into kcdata. */
4428 			kcd_exit_on_error(kcdata_push_array(kcd, STACKLR_WORDS, sizeof(uintptr_t),
4429 			    saved_count, stackshot_cpu_ctx.scc_stack_buffer));
4430 			if (info.btui_info & BTI_64_BIT) {
4431 				user_ths_ss_flags |= kUser64_p;
4432 			}
4433 			if ((info.btui_info & BTI_TRUNCATED) ||
4434 			    (ctx.sbc_flags & kThreadTruncatedBT)) {
4435 				user_ths_ss_flags |= kThreadTruncatedBT;
4436 				user_ths_ss_flags |= kThreadTruncUserBT;
4437 			}
4438 			user_ths_ss_flags |= ctx.sbc_flags;
4439 			ctx.sbc_flags = 0;
4440 #if __LP64__
4441 			/* We only support async stacks on 64-bit kernels */
4442 			if (info.btui_async_frame_addr != 0) {
4443 				uint32_t async_start_offset = info.btui_async_start_index;
4444 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_USER_ASYNC_START_INDEX,
4445 				    sizeof(async_start_offset), &async_start_offset));
4446 				ctl.btc_frame_addr = info.btui_async_frame_addr;
4447 				ctl.btc_addr_offset = BTCTL_ASYNC_ADDR_OFFSET;
4448 				info = BTUINFO_INIT;
4449 				unsigned int async_count = backtrace_user(stackshot_cpu_ctx.scc_stack_buffer, MAX_FRAMES, &ctl,
4450 				    &info);
4451 				if (async_count > 0) {
4452 					kcd_exit_on_error(kcdata_push_array(kcd, STACKSHOT_KCTYPE_USER_ASYNC_STACKLR64,
4453 					    sizeof(uintptr_t), async_count, stackshot_cpu_ctx.scc_stack_buffer));
4454 					if ((info.btui_info & BTI_TRUNCATED) ||
4455 					    (ctx.sbc_flags & kThreadTruncatedBT)) {
4456 						user_ths_ss_flags |= kThreadTruncatedBT;
4457 						user_ths_ss_flags |= kThreadTruncUserAsyncBT;
4458 					}
4459 					user_ths_ss_flags |= ctx.sbc_flags;
4460 				}
4461 			}
4462 #endif /* __LP64__ */
4463 		}
4464 		if (user_ths_ss_flags != 0) {
4465 			cur_thread_snap->ths_ss_flags |= user_ths_ss_flags;
4466 		}
4467 	}
4468 
4469 #if STACKSHOT_COLLECTS_LATENCY_INFO
4470 	latency_info.user_stack_latency = mach_absolute_time()  - latency_info.user_stack_latency;
4471 	latency_info.kernel_stack_latency = mach_absolute_time();
4472 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4473 
4474 	/* Call through to the machine specific trace routines
4475 	 * Frames are added past the snapshot header.
4476 	 */
4477 	if (thread->kernel_stack != 0) {
4478 		uint32_t kern_ths_ss_flags = 0;
4479 #if defined(__LP64__)
4480 		uint32_t stack_kcdata_type = STACKSHOT_KCTYPE_KERN_STACKLR64;
4481 		extern int machine_trace_thread64(thread_t thread, char *tracepos,
4482 		    char *tracebound, int nframes, uint32_t *thread_trace_flags);
4483 		saved_count = machine_trace_thread64(
4484 #else
4485 		uint32_t stack_kcdata_type = STACKSHOT_KCTYPE_KERN_STACKLR;
4486 		extern int machine_trace_thread(thread_t thread, char *tracepos,
4487 		    char *tracebound, int nframes, uint32_t *thread_trace_flags);
4488 		saved_count = machine_trace_thread(
4489 #endif
4490 			thread, (char*) stackshot_cpu_ctx.scc_stack_buffer,
4491 			(char *) (stackshot_cpu_ctx.scc_stack_buffer + MAX_FRAMES), MAX_FRAMES,
4492 			&kern_ths_ss_flags);
4493 		if (saved_count > 0) {
4494 			int frame_size = sizeof(uintptr_t);
4495 #if defined(__LP64__)
4496 			cur_thread_snap->ths_ss_flags |= kKernel64_p;
4497 #endif
4498 #if CONFIG_EXCLAVES
4499 			if (thread->th_exclaves_state & TH_EXCLAVES_RPC) {
4500 				struct thread_exclaves_info info = { 0 };
4501 
4502 				info.tei_flags = kExclaveRPCActive;
4503 				if (thread->th_exclaves_state & TH_EXCLAVES_SCHEDULER_REQUEST) {
4504 					info.tei_flags |= kExclaveSchedulerRequest;
4505 				}
4506 				if (thread->th_exclaves_state & TH_EXCLAVES_UPCALL) {
4507 					info.tei_flags |= kExclaveUpcallActive;
4508 				}
4509 				info.tei_scid = thread->th_exclaves_ipc_ctx.scid;
4510 				info.tei_thread_offset = exclaves_stack_offset(stackshot_cpu_ctx.scc_stack_buffer, saved_count / frame_size, false);
4511 
4512 				kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_KERN_EXCLAVES_THREADINFO, sizeof(struct thread_exclaves_info), &info));
4513 			}
4514 #endif /* CONFIG_EXCLAVES */
4515 			kcd_exit_on_error(kcdata_push_array(kcd, stack_kcdata_type,
4516 			    frame_size, saved_count / frame_size, stackshot_cpu_ctx.scc_stack_buffer));
4517 		}
4518 		if (kern_ths_ss_flags & kThreadTruncatedBT) {
4519 			kern_ths_ss_flags |= kThreadTruncKernBT;
4520 		}
4521 		if (kern_ths_ss_flags != 0) {
4522 			cur_thread_snap->ths_ss_flags |= kern_ths_ss_flags;
4523 		}
4524 	}
4525 
4526 #if STACKSHOT_COLLECTS_LATENCY_INFO
4527 	latency_info.kernel_stack_latency = mach_absolute_time()  - latency_info.kernel_stack_latency;
4528 	latency_info.misc_latency = mach_absolute_time();
4529 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4530 
4531 #if CONFIG_THREAD_GROUPS
4532 	if (trace_flags & STACKSHOT_THREAD_GROUP) {
4533 		uint64_t thread_group_id = thread->thread_group ? thread_group_get_id(thread->thread_group) : 0;
4534 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_THREAD_GROUP, sizeof(thread_group_id), &out_addr));
4535 		kdp_memcpy((void*)out_addr, &thread_group_id, sizeof(uint64_t));
4536 	}
4537 #endif /* CONFIG_THREAD_GROUPS */
4538 
4539 	if (collect_iostats) {
4540 		kcd_exit_on_error(kcdata_record_thread_iostats(kcd, thread));
4541 	}
4542 
4543 #if CONFIG_PERVASIVE_CPI
4544 	if (collect_instrs_cycles) {
4545 		struct recount_usage usage = { 0 };
4546 		recount_sum_unsafe(&recount_thread_plan, thread->th_recount.rth_lifetime,
4547 		    &usage);
4548 
4549 		kcd_exit_on_error(kcdata_get_memory_addr(kcd, STACKSHOT_KCTYPE_INSTRS_CYCLES, sizeof(struct instrs_cycles_snapshot), &out_addr));
4550 		struct instrs_cycles_snapshot *instrs_cycles = (struct instrs_cycles_snapshot *)out_addr;
4551 		instrs_cycles->ics_instructions = recount_usage_instructions(&usage);
4552 		instrs_cycles->ics_cycles = recount_usage_cycles(&usage);
4553 	}
4554 #endif /* CONFIG_PERVASIVE_CPI */
4555 
4556 #if STACKSHOT_COLLECTS_LATENCY_INFO
4557 	latency_info.misc_latency = mach_absolute_time() - latency_info.misc_latency;
4558 	if (collect_latency_info) {
4559 		kcd_exit_on_error(kcdata_push_data(kcd, STACKSHOT_KCTYPE_LATENCY_INFO_THREAD, sizeof(latency_info), &latency_info));
4560 	}
4561 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4562 
4563 error_exit:
4564 	return error;
4565 }
4566 
4567 static int
4568 kcdata_record_thread_delta_snapshot(struct thread_delta_snapshot_v3 * cur_thread_snap, thread_t thread, boolean_t thread_on_core)
4569 {
4570 	cur_thread_snap->tds_thread_id = thread_tid(thread);
4571 	if (IPC_VOUCHER_NULL != thread->ith_voucher) {
4572 		cur_thread_snap->tds_voucher_identifier  = VM_KERNEL_ADDRPERM(thread->ith_voucher);
4573 	} else {
4574 		cur_thread_snap->tds_voucher_identifier = 0;
4575 	}
4576 
4577 	cur_thread_snap->tds_ss_flags = 0;
4578 	if (thread->effective_policy.thep_darwinbg) {
4579 		cur_thread_snap->tds_ss_flags |= kThreadDarwinBG;
4580 	}
4581 	if (proc_get_effective_thread_policy(thread, TASK_POLICY_PASSIVE_IO)) {
4582 		cur_thread_snap->tds_ss_flags |= kThreadIOPassive;
4583 	}
4584 	if (thread->suspend_count > 0) {
4585 		cur_thread_snap->tds_ss_flags |= kThreadSuspended;
4586 	}
4587 	if (thread->options & TH_OPT_GLOBAL_FORCED_IDLE) {
4588 		cur_thread_snap->tds_ss_flags |= kGlobalForcedIdle;
4589 	}
4590 	if (thread_on_core) {
4591 		cur_thread_snap->tds_ss_flags |= kThreadOnCore;
4592 	}
4593 	if (stackshot_thread_is_idle_worker_unsafe(thread)) {
4594 		cur_thread_snap->tds_ss_flags |= kThreadIdleWorker;
4595 	}
4596 
4597 	cur_thread_snap->tds_last_made_runnable_time = thread->last_made_runnable_time;
4598 	cur_thread_snap->tds_state                   = thread->state;
4599 	cur_thread_snap->tds_sched_flags             = thread->sched_flags;
4600 	cur_thread_snap->tds_base_priority           = thread->base_pri;
4601 	cur_thread_snap->tds_sched_priority          = thread->sched_pri;
4602 	cur_thread_snap->tds_eqos                    = thread->effective_policy.thep_qos;
4603 	cur_thread_snap->tds_rqos                    = thread->requested_policy.thrp_qos;
4604 	cur_thread_snap->tds_rqos_override           = MAX(thread->requested_policy.thrp_qos_override,
4605 	    thread->requested_policy.thrp_qos_workq_override);
4606 	cur_thread_snap->tds_io_tier                 = (uint8_t) proc_get_effective_thread_policy(thread, TASK_POLICY_IO);
4607 
4608 	static_assert(sizeof(thread->effective_policy) == sizeof(uint64_t));
4609 	static_assert(sizeof(thread->requested_policy) == sizeof(uint64_t));
4610 	cur_thread_snap->tds_requested_policy = *(unaligned_u64 *) &thread->requested_policy;
4611 	cur_thread_snap->tds_effective_policy = *(unaligned_u64 *) &thread->effective_policy;
4612 
4613 	return 0;
4614 }
4615 
4616 /*
4617  * Why 12?  12 strikes a decent balance between allocating a large array on
4618  * the stack and having large kcdata item overheads for recording nonrunnable
4619  * tasks.
4620  */
4621 #define UNIQUEIDSPERFLUSH 12
4622 
4623 struct saved_uniqueids {
4624 	uint64_t ids[UNIQUEIDSPERFLUSH];
4625 	unsigned count;
4626 };
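/*
 * Size note, for illustration: one flush batch is UNIQUEIDSPERFLUSH (12)
 * 64-bit ids, i.e. 96 bytes of stack in struct saved_uniqueids plus a count,
 * which is the balance the comment above refers to.
 */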
4627 
4628 enum thread_classification {
4629 	tc_full_snapshot,  /* take a full snapshot */
4630 	tc_delta_snapshot, /* take a delta snapshot */
4631 };
4632 
4633 static enum thread_classification
4634 classify_thread(thread_t thread, boolean_t * thread_on_core_p, boolean_t collect_delta_stackshot)
4635 {
4636 	processor_t last_processor = thread->last_processor;
4637 
4638 	boolean_t thread_on_core = FALSE;
4639 	if (last_processor != PROCESSOR_NULL) {
4640 		/* Idle threads are always treated as on-core, since the processor state can change while they are running. */
4641 		thread_on_core = (thread == last_processor->idle_thread) ||
4642 		    (last_processor->state == PROCESSOR_RUNNING &&
4643 		    last_processor->active_thread == thread);
4644 	}
4645 
4646 	*thread_on_core_p = thread_on_core;
4647 
4648 	/* Capture the full thread snapshot if this is not a delta stackshot or if the thread has run subsequent to the
4649 	 * previous full stackshot */
4650 	if (!collect_delta_stackshot || thread_on_core || (thread->last_run_time > stackshot_args.since_timestamp)) {
4651 		return tc_full_snapshot;
4652 	} else {
4653 		return tc_delta_snapshot;
4654 	}
4655 }
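/*
 * Example classification (illustrative numbers): in a delta stackshot with
 * since_timestamp == 1500, an off-core thread with last_run_time == 1000 is
 * classified tc_delta_snapshot, while one with last_run_time == 2000 (or any
 * on-core or idle thread) gets a full tc_full_snapshot.
 */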
4656 
4657 
4658 static kern_return_t
4659 kdp_stackshot_record_task(task_t task)
4660 {
4661 	boolean_t active_kthreads_only_p  = ((stackshot_flags & STACKSHOT_ACTIVE_KERNEL_THREADS_ONLY) != 0);
4662 	boolean_t save_donating_pids_p    = ((stackshot_flags & STACKSHOT_SAVE_IMP_DONATION_PIDS) != 0);
4663 	boolean_t collect_delta_stackshot = ((stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
4664 	boolean_t save_owner_info         = ((stackshot_flags & STACKSHOT_THREAD_WAITINFO) != 0);
4665 	boolean_t include_drivers         = ((stackshot_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) != 0);
4666 
4667 	kern_return_t error = KERN_SUCCESS;
4668 	mach_vm_address_t out_addr = 0;
4669 	int saved_count = 0;
4670 
4671 	int task_pid                   = 0;
4672 	uint64_t task_uniqueid         = 0;
4673 	int num_delta_thread_snapshots = 0;
4674 	int num_waitinfo_threads       = 0;
4675 	int num_turnstileinfo_threads  = 0;
4676 
4677 	uint64_t task_start_abstime    = 0;
4678 	boolean_t have_map = FALSE, have_pmap = FALSE;
4679 	boolean_t some_thread_ran = FALSE;
4680 	unaligned_u64 task_snap_ss_flags = 0;
4681 #if STACKSHOT_COLLECTS_LATENCY_INFO
4682 	struct stackshot_latency_task latency_info;
4683 	latency_info.setup_latency = mach_absolute_time();
4684 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4685 
4686 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
4687 	uint64_t task_begin_cpu_cycle_count = 0;
4688 	if (!stackshot_ctx.sc_panic_stackshot) {
4689 		task_begin_cpu_cycle_count = mt_cur_cpu_cycles();
4690 	}
4691 #endif
4692 
4693 	if ((task == NULL) || !_stackshot_validate_kva((vm_offset_t)task, sizeof(struct task))) {
4694 		error = KERN_FAILURE;
4695 		goto error_exit;
4696 	}
4697 
4698 	void *bsd_info = get_bsdtask_info(task);
4699 	boolean_t task_in_teardown        = (bsd_info == NULL) || proc_in_teardown(bsd_info); // has P_LPEXIT set during proc_exit()
4700 	boolean_t task_in_transition      = task_in_teardown;         // here we can add other types of transition.
4701 	uint32_t  container_type          = (task_in_transition) ? STACKSHOT_KCCONTAINER_TRANSITIONING_TASK : STACKSHOT_KCCONTAINER_TASK;
4702 	uint32_t  transition_type         = (task_in_teardown) ? kTaskIsTerminated : 0;
4703 
4704 	if (task_in_transition) {
4705 		collect_delta_stackshot = FALSE;
4706 	}
4707 
4708 	have_map = (task->map != NULL) && (_stackshot_validate_kva((vm_offset_t)(task->map), sizeof(struct _vm_map)));
4709 	have_pmap = have_map && (task->map->pmap != NULL) && (_stackshot_validate_kva((vm_offset_t)(task->map->pmap), sizeof(struct pmap)));
4710 
4711 	task_pid = pid_from_task(task);
4712 	/* Is returning -1 ok for a terminating task? */
4713 	task_uniqueid = get_task_uniqueid(task);
4714 
4715 	if (!task->active || task_is_a_corpse(task) || task_is_a_corpse_fork(task)) {
4716 		/*
4717 		 * Not interested in terminated tasks without threads.
4718 		 */
4719 		if (queue_empty(&task->threads) || task_pid == -1) {
4720 			return KERN_SUCCESS;
4721 		}
4722 	}
4723 
4724 	/* All PIDs should have the MSB unset */
4725 	assert((task_pid & (1ULL << 31)) == 0);
4726 
4727 #if STACKSHOT_COLLECTS_LATENCY_INFO
4728 	latency_info.setup_latency = mach_absolute_time() - latency_info.setup_latency;
4729 	latency_info.task_uniqueid = task_uniqueid;
4730 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4731 
4732 	/* Trace everything, unless a process was specified. Add in driver tasks if requested. */
4733 	if ((stackshot_args.pid == -1) || (stackshot_args.pid == task_pid) || (include_drivers && task_is_driver(task))) {
4734 #if STACKSHOT_COLLECTS_LATENCY_INFO
4735 		stackshot_cpu_latency.tasks_processed++;
4736 #endif
4737 
4738 		/* add task snapshot marker */
4739 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
4740 		    container_type, task_uniqueid));
4741 
4742 		if (collect_delta_stackshot) {
4743 			/*
4744 			 * For delta stackshots we need to know if a thread from this task has run since the
4745 			 * previous timestamp to decide whether we're going to record a full snapshot and UUID info.
4746 			 */
4747 			thread_t thread = THREAD_NULL;
4748 			queue_iterate(&task->threads, thread, thread_t, task_threads)
4749 			{
4750 				if ((thread == NULL) || !_stackshot_validate_kva((vm_offset_t)thread, sizeof(struct thread))) {
4751 					error = KERN_FAILURE;
4752 					goto error_exit;
4753 				}
4754 
4755 				if (active_kthreads_only_p && thread->kernel_stack == 0) {
4756 					continue;
4757 				}
4758 
4759 				boolean_t thread_on_core;
4760 				enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, collect_delta_stackshot);
4761 
4762 				switch (thread_classification) {
4763 				case tc_full_snapshot:
4764 					some_thread_ran = TRUE;
4765 					break;
4766 				case tc_delta_snapshot:
4767 					num_delta_thread_snapshots++;
4768 					break;
4769 				}
4770 			}
4771 		}
4772 
4773 		if (collect_delta_stackshot) {
4774 			proc_starttime_kdp(get_bsdtask_info(task), NULL, NULL, &task_start_abstime);
4775 		}
4776 
4777 		/* Next record any relevant UUID info and store the task snapshot */
4778 		if (task_in_transition ||
4779 		    !collect_delta_stackshot ||
4780 		    (task_start_abstime == 0) ||
4781 		    (task_start_abstime > stackshot_args.since_timestamp) ||
4782 		    some_thread_ran) {
4783 			/*
4784 			 * Collect full task information in these scenarios:
4785 			 *
4786 			 * 1) a full stackshot or the task is in transition
4787 			 * 2) a delta stackshot where the task started after the previous full stackshot
4788 			 * 3) a delta stackshot where any thread from the task has run since the previous full stackshot
4789 			 *
4790 			 * because the task may have exec'ed, changing its name, architecture, load info, etc
4791 			 */
4792 
4793 			kcd_exit_on_error(kcdata_record_shared_cache_info(stackshot_kcdata_p, task, &task_snap_ss_flags));
4794 			kcd_exit_on_error(kcdata_record_uuid_info(stackshot_kcdata_p, task, stackshot_flags, have_pmap, &task_snap_ss_flags));
4795 			kcd_exit_on_error(kcdata_record_task_exec_meta(stackshot_kcdata_p, task));
4796 #if STACKSHOT_COLLECTS_LATENCY_INFO
4797 			if (!task_in_transition) {
4798 				kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags, &latency_info));
4799 			} else {
4800 				kcd_exit_on_error(kcdata_record_transitioning_task_snapshot(stackshot_kcdata_p, task, task_snap_ss_flags, transition_type));
4801 			}
4802 #else
4803 			if (!task_in_transition) {
4804 				kcd_exit_on_error(kcdata_record_task_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags));
4805 			} else {
4806 				kcd_exit_on_error(kcdata_record_transitioning_task_snapshot(stackshot_kcdata_p, task, task_snap_ss_flags, transition_type));
4807 			}
4808 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4809 		} else {
4810 			kcd_exit_on_error(kcdata_record_task_delta_snapshot(stackshot_kcdata_p, task, stackshot_flags, have_pmap, task_snap_ss_flags));
4811 		}
4812 
4813 #if STACKSHOT_COLLECTS_LATENCY_INFO
4814 		latency_info.misc_latency = mach_absolute_time();
4815 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4816 
4817 		struct thread_delta_snapshot_v3 * delta_snapshots = NULL;
4818 		int current_delta_snapshot_index                  = 0;
4819 		if (num_delta_thread_snapshots > 0) {
4820 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_DELTA_SNAPSHOT,
4821 			    sizeof(struct thread_delta_snapshot_v3),
4822 			    num_delta_thread_snapshots, &out_addr));
4823 			delta_snapshots = (struct thread_delta_snapshot_v3 *)out_addr;
4824 		}
4825 
4826 
4827 #if STACKSHOT_COLLECTS_LATENCY_INFO
4828 		latency_info.task_thread_count_loop_latency = mach_absolute_time();
4829 #endif
4830 		/*
4831 		 * Iterate over the task threads to save thread snapshots and determine
4832 		 * how much space we need for waitinfo and turnstile info
4833 		 */
4834 		thread_t thread = THREAD_NULL;
4835 		queue_iterate(&task->threads, thread, thread_t, task_threads)
4836 		{
4837 			if ((thread == NULL) || !_stackshot_validate_kva((vm_offset_t)thread, sizeof(struct thread))) {
4838 				error = KERN_FAILURE;
4839 				goto error_exit;
4840 			}
4841 
4842 			uint64_t thread_uniqueid;
4843 			if (active_kthreads_only_p && thread->kernel_stack == 0) {
4844 				continue;
4845 			}
4846 			thread_uniqueid = thread_tid(thread);
4847 
4848 			boolean_t thread_on_core;
4849 			enum thread_classification thread_classification = classify_thread(thread, &thread_on_core, collect_delta_stackshot);
4850 
4851 #if STACKSHOT_COLLECTS_LATENCY_INFO
4852 			stackshot_cpu_latency.threads_processed++;
4853 #endif
4854 
4855 			switch (thread_classification) {
4856 			case tc_full_snapshot:
4857 				/* add thread marker */
4858 				kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
4859 				    STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
4860 
4861 				/* thread snapshot can be large, including strings, avoid overflowing the stack. */
4862 				kcdata_compression_window_open(stackshot_kcdata_p);
4863 
4864 				kcd_exit_on_error(kcdata_record_thread_snapshot(stackshot_kcdata_p, thread, task, stackshot_flags, have_pmap, thread_on_core));
4865 
4866 				kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
4867 
4868 				/* mark end of thread snapshot data */
4869 				kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
4870 				    STACKSHOT_KCCONTAINER_THREAD, thread_uniqueid));
4871 				break;
4872 			case tc_delta_snapshot:
4873 				kcd_exit_on_error(kcdata_record_thread_delta_snapshot(&delta_snapshots[current_delta_snapshot_index++], thread, thread_on_core));
4874 				break;
4875 			}
4876 
4877 			/*
4878 			 * We want to report owner information regardless of whether a thread
4879 			 * has changed since the last delta, whether it's a normal stackshot,
4880 			 * or whether it's nonrunnable
4881 			 */
4882 			if (save_owner_info) {
4883 				if (stackshot_thread_has_valid_waitinfo(thread)) {
4884 					num_waitinfo_threads++;
4885 				}
4886 
4887 				if (stackshot_thread_has_valid_turnstileinfo(thread)) {
4888 					num_turnstileinfo_threads++;
4889 				}
4890 			}
4891 		}
4892 #if STACKSHOT_COLLECTS_LATENCY_INFO
4893 		latency_info.task_thread_count_loop_latency = mach_absolute_time() - latency_info.task_thread_count_loop_latency;
4894 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4895 
4896 		thread_waitinfo_v2_t *thread_waitinfo           = NULL;
4897 		thread_turnstileinfo_v2_t *thread_turnstileinfo = NULL;
4898 		int current_waitinfo_index              = 0;
4899 		int current_turnstileinfo_index         = 0;
4900 		/* allocate space for the wait and turnstile info */
4901 		if (num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) {
4902 			/* thread waitinfo and turnstileinfo can be quite large, avoid overflowing the stack */
4903 			kcdata_compression_window_open(stackshot_kcdata_p);
4904 
4905 			if (num_waitinfo_threads > 0) {
4906 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_WAITINFO,
4907 				    sizeof(thread_waitinfo_v2_t), num_waitinfo_threads, &out_addr));
4908 				thread_waitinfo = (thread_waitinfo_v2_t *)out_addr;
4909 			}
4910 
4911 			if (num_turnstileinfo_threads > 0) {
4912 				/* get space for the turnstile info */
4913 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_TURNSTILEINFO,
4914 				    sizeof(thread_turnstileinfo_v2_t), num_turnstileinfo_threads, &out_addr));
4915 				thread_turnstileinfo = (thread_turnstileinfo_v2_t *)out_addr;
4916 			}
4917 
4918 			stackshot_plh_resetgen();  // so we know which portlabel_ids are referenced
4919 		}
4920 
4921 #if STACKSHOT_COLLECTS_LATENCY_INFO
4922 		latency_info.misc_latency = mach_absolute_time() - latency_info.misc_latency;
4923 		latency_info.task_thread_data_loop_latency = mach_absolute_time();
4924 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4925 
4926 		/* Iterate over the task's threads to save the wait and turnstile info */
4927 		queue_iterate(&task->threads, thread, thread_t, task_threads)
4928 		{
4929 			uint64_t thread_uniqueid;
4930 			#pragma unused(thread_uniqueid)
4931 
4932 			if (active_kthreads_only_p && thread->kernel_stack == 0) {
4933 				continue;
4934 			}
4935 
4936 			thread_uniqueid = thread_tid(thread);
4937 
4938 			/* If we want owner info, we should capture it regardless of its classification */
4939 			if (save_owner_info) {
4940 				if (stackshot_thread_has_valid_waitinfo(thread)) {
4941 					stackshot_thread_wait_owner_info(
4942 						thread,
4943 						&thread_waitinfo[current_waitinfo_index++]);
4944 				}
4945 
4946 				if (stackshot_thread_has_valid_turnstileinfo(thread)) {
4947 					stackshot_thread_turnstileinfo(
4948 						thread,
4949 						&thread_turnstileinfo[current_turnstileinfo_index++]);
4950 				}
4951 			}
4952 		}
4953 
4954 #if STACKSHOT_COLLECTS_LATENCY_INFO
4955 		latency_info.task_thread_data_loop_latency = mach_absolute_time() - latency_info.task_thread_data_loop_latency;
4956 		latency_info.misc2_latency = mach_absolute_time();
4957 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
4958 
4959 #if DEBUG || DEVELOPMENT
4960 		if (current_delta_snapshot_index != num_delta_thread_snapshots) {
4961 			panic("delta thread snapshot count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
4962 			    num_delta_thread_snapshots, current_delta_snapshot_index);
4963 		}
4964 		if (current_waitinfo_index != num_waitinfo_threads) {
4965 			panic("thread wait info count mismatch while capturing snapshots for task %p. expected %d, found %d", task,
4966 			    num_waitinfo_threads, current_waitinfo_index);
4967 		}
4968 #endif
4969 
4970 		if (num_waitinfo_threads > 0 || num_turnstileinfo_threads > 0) {
4971 			kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
4972 			// now, record the portlabel hashes.
4973 			kcd_exit_on_error(kdp_stackshot_plh_record());
4974 		}
4975 
4976 #if IMPORTANCE_INHERITANCE
4977 		if (save_donating_pids_p) {
4978 			/* Ensure the buffer is big enough, since we're using the stack buffer for this. */
4979 			static_assert(TASK_IMP_WALK_LIMIT * sizeof(int32_t) <= MAX_FRAMES * sizeof(uintptr_t));
4980 			saved_count = task_importance_list_pids(task, TASK_IMP_LIST_DONATING_PIDS,
4981 			    (char*) stackshot_cpu_ctx.scc_stack_buffer, TASK_IMP_WALK_LIMIT);
4982 			if (saved_count > 0) {
4983 				/* Variable size array - better not have it on the stack. */
4984 				kcdata_compression_window_open(stackshot_kcdata_p);
4985 				kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_DONATING_PIDS,
4986 				    sizeof(int32_t), saved_count, stackshot_cpu_ctx.scc_stack_buffer));
4987 				kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
4988 			}
4989 		}
4990 #endif
4991 
4992 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
4993 		if (!stackshot_ctx.sc_panic_stackshot) {
4994 			kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - task_begin_cpu_cycle_count),
4995 			    "task_cpu_cycle_count"));
4996 		}
4997 #endif
4998 
4999 #if STACKSHOT_COLLECTS_LATENCY_INFO
5000 		latency_info.misc2_latency = mach_absolute_time() - latency_info.misc2_latency;
5001 		if (collect_latency_info) {
5002 			kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_LATENCY_INFO_TASK, sizeof(latency_info), &latency_info));
5003 		}
5004 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5005 
5006 		/* mark end of task snapshot data */
5007 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END, container_type,
5008 		    task_uniqueid));
5009 	}
5010 
5011 
5012 error_exit:
5013 	return error;
5014 }
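/*
 * Rough shape of the kcdata emitted per task by the function above (layout
 * inferred from the container markers; not an exhaustive list of items):
 *
 *     KCDATA_TYPE_CONTAINER_BEGIN (task or transitioning task)
 *         shared cache info, UUID info, task snapshot or delta snapshot
 *         KCDATA_TYPE_CONTAINER_BEGIN (thread) ... CONTAINER_END, per thread
 *         thread delta snapshot array, waitinfo/turnstileinfo arrays
 *     KCDATA_TYPE_CONTAINER_END
 */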
5015 
5016 /* Record global shared regions */
5017 static kern_return_t
5018 kdp_stackshot_shared_regions(uint64_t trace_flags)
5019 {
5020 	kern_return_t error        = KERN_SUCCESS;
5021 
5022 	boolean_t collect_delta_stackshot = ((trace_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
5023 	extern queue_head_t vm_shared_region_queue;
5024 	vm_shared_region_t sr;
5025 
5026 	extern queue_head_t vm_shared_region_queue;
5027 	queue_iterate(&vm_shared_region_queue,
5028 	    sr,
5029 	    vm_shared_region_t,
5030 	    sr_q) {
5031 		struct dyld_shared_cache_loadinfo_v2 scinfo = {0};
5032 		if (!_stackshot_validate_kva((vm_offset_t)sr, sizeof(*sr))) {
5033 			break;
5034 		}
5035 		if (collect_delta_stackshot && sr->sr_install_time < stackshot_args.since_timestamp) {
5036 			continue; // only include new shared caches in delta stackshots
5037 		}
5038 		uint32_t sharedCacheFlags = ((sr == primary_system_shared_region) ? kSharedCacheSystemPrimary : 0) |
5039 		    (sr->sr_driverkit ? kSharedCacheDriverkit : 0);
5040 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
5041 		    STACKSHOT_KCCONTAINER_SHAREDCACHE, sr->sr_id));
5042 		kdp_memcpy(scinfo.sharedCacheUUID, sr->sr_uuid, sizeof(sr->sr_uuid));
5043 		scinfo.sharedCacheSlide = sr->sr_slide;
5044 		scinfo.sharedCacheUnreliableSlidBaseAddress = sr->sr_base_address + sr->sr_first_mapping;
5045 		scinfo.sharedCacheSlidFirstMapping = sr->sr_base_address + sr->sr_first_mapping;
5046 		scinfo.sharedCacheID = sr->sr_id;
5047 		scinfo.sharedCacheFlags = sharedCacheFlags;
5048 
5049 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_INFO,
5050 		    sizeof(scinfo), &scinfo));
5051 
5052 		if ((trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) && sr->sr_images != NULL &&
5053 		    _stackshot_validate_kva((vm_offset_t)sr->sr_images, sr->sr_images_count * sizeof(struct dyld_uuid_info_64))) {
5054 			assert(sr->sr_images_count != 0);
5055 			kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), sr->sr_images_count, sr->sr_images));
5056 		}
5057 		kcd_exit_on_error(kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
5058 		    STACKSHOT_KCCONTAINER_SHAREDCACHE, sr->sr_id));
5059 	}
5060 
5061 	/*
5062 	 * For backwards compatibility; this will eventually be removed.
5063 	 * Another copy of the Primary System Shared Region, for older readers.
5064 	 */
5065 	sr = primary_system_shared_region;
5066 	/* record system level shared cache load info (if available) */
5067 	if (!collect_delta_stackshot && sr &&
5068 	    _stackshot_validate_kva((vm_offset_t)sr, sizeof(struct vm_shared_region))) {
5069 		struct dyld_shared_cache_loadinfo scinfo = {0};
5070 
5071 		/*
5072 		 * Historically, this data was in a dyld_uuid_info_64 structure, but the
5073 		 * naming of both the structure and fields for this use isn't great.  The
5074 		 * dyld_shared_cache_loadinfo structure has better names, but the same
5075 		 * layout and content as the original.
5076 		 *
5077 		 * The imageSlidBaseAddress/sharedCacheUnreliableSlidBaseAddress field
5078 		 * has been used inconsistently for STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT
5079 		 * entries; here, it's the slid base address, and we leave it that way
5080 		 * for backwards compatibility.
5081 		 */
5082 		kdp_memcpy(scinfo.sharedCacheUUID, &sr->sr_uuid, sizeof(sr->sr_uuid));
5083 		scinfo.sharedCacheSlide = sr->sr_slide;
5084 		scinfo.sharedCacheUnreliableSlidBaseAddress = sr->sr_slide + sr->sr_base_address;
5085 		scinfo.sharedCacheSlidFirstMapping = sr->sr_base_address + sr->sr_first_mapping;
5086 
5087 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_SHAREDCACHE_LOADINFO,
5088 		    sizeof(scinfo), &scinfo));
5089 
5090 		if (trace_flags & STACKSHOT_COLLECT_SHAREDCACHE_LAYOUT) {
5091 			/*
5092 			 * Include a map of the system shared cache layout if it has been populated
5093 			 * (which is only when the system is using a custom shared cache).
5094 			 */
5095 			if (sr->sr_images && _stackshot_validate_kva((vm_offset_t)sr->sr_images,
5096 			    (sr->sr_images_count * sizeof(struct dyld_uuid_info_64)))) {
5097 				assert(sr->sr_images_count != 0);
5098 				kcd_exit_on_error(kcdata_push_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_SYS_SHAREDCACHE_LAYOUT, sizeof(struct dyld_uuid_info_64), sr->sr_images_count, sr->sr_images));
5099 			}
5100 		}
5101 	}
5102 
5103 error_exit:
5104 	return error;
5105 }
5106 
5107 static kern_return_t
5108 kdp_stackshot_kcdata_format(void)
5109 {
5110 	kern_return_t error        = KERN_SUCCESS;
5111 	mach_vm_address_t out_addr = 0;
5112 	uint64_t abs_time = 0;
5113 	uint64_t system_state_flags = 0;
5114 	task_t task = TASK_NULL;
5115 	mach_timebase_info_data_t timebase = {0, 0};
5116 	uint32_t length_to_copy = 0, tmp32 = 0;
5117 	abs_time = mach_absolute_time();
5118 	uint64_t last_task_start_time = 0;
5119 	int cur_workitem_index = 0;
5120 	uint64_t tasks_in_stackshot = 0;
5121 	uint64_t threads_in_stackshot = 0;
5122 
5123 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5124 	uint64_t stackshot_begin_cpu_cycle_count = 0;
5125 
5126 	if (!stackshot_ctx.sc_panic_stackshot) {
5127 		stackshot_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5128 	}
5129 #endif
5130 
5131 	/* the CPU entering here is participating in the stackshot */
5132 	stackshot_cpu_ctx.scc_did_work = true;
5133 
5134 #if STACKSHOT_COLLECTS_LATENCY_INFO
5135 	collect_latency_info = stackshot_flags & STACKSHOT_DISABLE_LATENCY_INFO ? false : true;
5136 #endif
5137 	/* process the flags */
5138 	bool collect_delta_stackshot = ((stackshot_flags & STACKSHOT_COLLECT_DELTA_SNAPSHOT) != 0);
5139 	bool collect_exclaves        = !disable_exclave_stackshot && ((stackshot_flags & STACKSHOT_SKIP_EXCLAVES) == 0);
5140 	stackshot_ctx.sc_enable_faulting = (stackshot_flags & (STACKSHOT_ENABLE_BT_FAULTING));
5141 
5142 	/* Currently we only support returning explicit KEXT load info on fileset kernels */
5143 	kc_format_t primary_kc_type = KCFormatUnknown;
5144 	if (PE_get_primary_kc_format(&primary_kc_type) && (primary_kc_type != KCFormatFileset)) {
5145 		stackshot_flags &= ~(STACKSHOT_SAVE_KEXT_LOADINFO);
5146 	}
5147 
5148 	if (sizeof(void *) == 8) {
5149 		system_state_flags |= kKernel64_p;
5150 	}
5151 
5152 #if CONFIG_EXCLAVES
5153 	if (!stackshot_ctx.sc_panic_stackshot && collect_exclaves) {
5154 		kcd_exit_on_error(stackshot_setup_exclave_waitlist()); /* Allocate list of exclave threads */
5155 	}
5156 #else
5157 #pragma unused(collect_exclaves)
5158 #endif /* CONFIG_EXCLAVES */
5159 
5160 	/* set up mach_absolute_time and timebase info -- copied out in some cases, and needed to convert since_timestamp to seconds for proc start times */
5161 	clock_timebase_info(&timebase);
5162 
5163 	/* begin saving data into the buffer */
5164 	if (stackshot_ctx.sc_bytes_uncompressed) {
5165 		stackshot_ctx.sc_bytes_uncompressed = 0;
5166 	}
5167 
5168 	/*
5169 	 * Setup pre-task linked kcdata buffer.
5170 	 * The idea here is that we want the kcdata to be in (roughly) the same order as it was
5171 	 * before we made this multithreaded, so we have separate buffers for pre and post task-iteration,
5172 	 * since that's the parallelized part.
5173 	 */
5174 	if (!stackshot_ctx.sc_is_singlethreaded) {
5175 		kcd_exit_on_error(stackshot_new_linked_kcdata());
5176 		stackshot_ctx.sc_pretask_kcdata = stackshot_cpu_ctx.scc_kcdata_head;
5177 	}
5178 
5179 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, stackshot_flags, "stackshot_in_flags"));
5180 	kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, (uint32_t)stackshot_args.pid, "stackshot_in_pid"));
5181 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, system_state_flags, "system_state_flags"));
5182 	if (stackshot_flags & STACKSHOT_PAGE_TABLES) {
5183 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_args.pagetable_mask, "stackshot_pagetable_mask"));
5184 	}
5185 	if (stackshot_initial_estimate != 0) {
5186 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate, "stackshot_size_estimate"));
5187 		kcd_exit_on_error(kcdata_add_uint32_with_description(stackshot_kcdata_p, stackshot_initial_estimate_adj, "stackshot_size_estimate_adj"));
5188 	}
5189 	kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, stackshot_available_task_exec_flags(), "stackshot_te_flags_mask"));
5190 
5191 
5192 #if STACKSHOT_COLLECTS_LATENCY_INFO
5193 	stackshot_ctx.sc_latency.setup_latency_mt = mach_absolute_time();
5194 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5195 
5196 #if CONFIG_JETSAM
5197 	tmp32 = memorystatus_get_pressure_status_kdp();
5198 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_LEVEL, sizeof(uint32_t), &tmp32));
5199 #endif
5200 
5201 	if (!collect_delta_stackshot) {
5202 		tmp32 = THREAD_POLICY_INTERNAL_STRUCT_VERSION;
5203 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_POLICY_VERSION, sizeof(uint32_t), &tmp32));
5204 
5205 		tmp32 = PAGE_SIZE;
5206 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_KERN_PAGE_SIZE, sizeof(uint32_t), &tmp32));
5207 
5208 		/* save boot-args and osversion string */
5209 		length_to_copy =  MIN((uint32_t)(strlen(version) + 1), OSVERSIZE);
5210 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_OSVERSION, length_to_copy, (const void *)version));
5211 		length_to_copy = MIN((uint32_t)(strlen(osversion) + 1), OSVERSIZE);
5212 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_OS_BUILD_VERSION, length_to_copy, (void *)osversion));
5213 
5214 
5215 		length_to_copy =  MIN((uint32_t)(strlen(PE_boot_args()) + 1), BOOT_LINE_LENGTH);
5216 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_BOOTARGS, length_to_copy, PE_boot_args()));
5217 
5218 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_TIMEBASE, sizeof(timebase), &timebase));
5219 	} else {
5220 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_DELTA_SINCE_TIMESTAMP, sizeof(uint64_t), &stackshot_args.since_timestamp));
5221 	}
5222 
5223 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_MACH_ABSOLUTE_TIME, sizeof(uint64_t), &abs_time));
5224 
5225 	kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, KCDATA_TYPE_USECS_SINCE_EPOCH, sizeof(uint64_t), &stackshot_ctx.sc_microsecs));
5226 
5227 	kcd_exit_on_error(kdp_stackshot_shared_regions(stackshot_flags));
5228 
5229 	/* Add requested information first */
5230 	if (stackshot_flags & STACKSHOT_GET_GLOBAL_MEM_STATS) {
5231 		struct mem_and_io_snapshot mais = {0};
5232 		kdp_mem_and_io_snapshot(&mais);
5233 		kcd_exit_on_error(kcdata_push_data(stackshot_kcdata_p, STACKSHOT_KCTYPE_GLOBAL_MEM_STATS, sizeof(mais), &mais));
5234 	}
5235 
5236 #if CONFIG_THREAD_GROUPS
5237 	struct thread_group_snapshot_v3 *thread_groups = NULL;
5238 	int num_thread_groups = 0;
5239 
5240 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5241 	uint64_t thread_group_begin_cpu_cycle_count = 0;
5242 
5243 	if (!stackshot_ctx.sc_is_singlethreaded && (stackshot_flags & STACKSHOT_THREAD_GROUP)) {
5244 		thread_group_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5245 	}
5246 #endif
5247 
5248 	/* Iterate over thread group names */
5249 	if (stackshot_flags & STACKSHOT_THREAD_GROUP) {
5250 		/* Variable size array - better not have it on the stack. */
5251 		kcdata_compression_window_open(stackshot_kcdata_p);
5252 
5253 		if (thread_group_iterate_stackshot(stackshot_thread_group_count, &num_thread_groups) != KERN_SUCCESS) {
5254 			stackshot_flags &= ~(STACKSHOT_THREAD_GROUP);
5255 		}
5256 
5257 		if (num_thread_groups > 0) {
5258 			kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_THREAD_GROUP_SNAPSHOT, sizeof(struct thread_group_snapshot_v3), num_thread_groups, &out_addr));
5259 			thread_groups = (struct thread_group_snapshot_v3 *)out_addr;
5260 		}
5261 
5262 		if (thread_group_iterate_stackshot(stackshot_thread_group_snapshot, thread_groups) != KERN_SUCCESS) {
5263 			error = KERN_FAILURE;
5264 			goto error_exit;
5265 		}
5266 
5267 		kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5268 	}
5269 
5270 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5271 	if (!stackshot_ctx.sc_panic_stackshot && (thread_group_begin_cpu_cycle_count != 0)) {
5272 		kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - thread_group_begin_cpu_cycle_count),
5273 		    "thread_groups_cpu_cycle_count"));
5274 	}
5275 #endif
5276 #else
5277 	stackshot_flags &= ~(STACKSHOT_THREAD_GROUP);
5278 #endif /* CONFIG_THREAD_GROUPS */
5279 
5280 
5281 #if STACKSHOT_COLLECTS_LATENCY_INFO
5282 	stackshot_ctx.sc_latency.setup_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.setup_latency_mt;
5283 	if (stackshot_ctx.sc_is_singlethreaded) {
5284 		stackshot_ctx.sc_latency.total_task_iteration_latency_mt = mach_absolute_time();
5285 	} else {
5286 		stackshot_ctx.sc_latency.task_queue_building_latency_mt = mach_absolute_time();
5287 	}
5288 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5289 
5290 	bool const process_scoped = (stackshot_args.pid != -1) &&
5291 	    ((stackshot_flags & STACKSHOT_INCLUDE_DRIVER_THREADS_IN_KERNEL) == 0);
5292 
5293 	/* Iterate over tasks */
5294 	queue_iterate(&tasks, task, task_t, tasks)
5295 	{
5296 		stackshot_panic_guard();
5297 
5298 		if (collect_delta_stackshot) {
5299 			uint64_t abstime;
5300 			proc_starttime_kdp(get_bsdtask_info(task), NULL, NULL, &abstime);
5301 
5302 			if (abstime > last_task_start_time) {
5303 				last_task_start_time = abstime;
5304 			}
5305 		}
5306 
5307 		pid_t task_pid = pid_from_task(task);
5308 
5309 		if (process_scoped && (task_pid != stackshot_args.pid)) {
5310 			continue;
5311 		}
5312 
5313 		if ((task->active && !task_is_a_corpse(task) && !task_is_a_corpse_fork(task)) ||
5314 		    (!queue_empty(&task->threads) && task_pid != -1)) {
5315 			tasks_in_stackshot++;
5316 			threads_in_stackshot += task->thread_count;
5317 		}
5318 
5319 		/* If this is a singlethreaded stackshot, don't use the work queues. */
5320 		if (stackshot_ctx.sc_is_singlethreaded) {
5321 			kcd_exit_on_error(kdp_stackshot_record_task(task));
5322 		} else {
5323 			kcd_exit_on_error(stackshot_put_workitem((struct stackshot_workitem) {
5324 				.sswi_task = task,
5325 				.sswi_data = NULL,
5326 				.sswi_idx = cur_workitem_index++
5327 			}));
5328 		}
5329 
5330 		if (process_scoped) {
5331 			/* Only targeting one process, we're done now. */
5332 			break;
5333 		}
5334 	}
5335 
5336 #if STACKSHOT_COLLECTS_LATENCY_INFO
5337 	if (stackshot_ctx.sc_is_singlethreaded) {
5338 		stackshot_ctx.sc_latency.total_task_iteration_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.total_task_iteration_latency_mt;
5339 	} else {
5340 		stackshot_ctx.sc_latency.task_queue_building_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.task_queue_building_latency_mt;
5341 	}
5342 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5343 
5344 	/* Setup post-task kcdata buffer */
5345 	if (!stackshot_ctx.sc_is_singlethreaded) {
5346 		stackshot_finalize_linked_kcdata();
5347 		kcd_exit_on_error(stackshot_new_linked_kcdata());
5348 		stackshot_ctx.sc_posttask_kcdata = stackshot_cpu_ctx.scc_kcdata_head;
5349 	}
5350 
5351 #if CONFIG_COALITIONS
5352 	/* Don't collect jetsam coalition snapshots in delta stackshots - these don't change */
5353 	if (!collect_delta_stackshot || (last_task_start_time > stackshot_args.since_timestamp)) {
5354 		int num_coalitions = 0;
5355 		struct jetsam_coalition_snapshot *coalitions = NULL;
5356 
5357 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5358 		uint64_t coalition_begin_cpu_cycle_count = 0;
5359 
5360 		if (!stackshot_ctx.sc_panic_stackshot && (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS)) {
5361 			coalition_begin_cpu_cycle_count = mt_cur_cpu_cycles();
5362 		}
5363 #endif /* SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI */
5364 
5365 		/* Iterate over coalitions */
5366 		if (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
5367 			if (coalition_iterate_stackshot(stackshot_coalition_jetsam_count, &num_coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) {
5368 				stackshot_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
5369 			}
5370 		}
5371 		if (stackshot_flags & STACKSHOT_SAVE_JETSAM_COALITIONS) {
5372 			if (num_coalitions > 0) {
5373 				/* Variable size array - better not have it on the stack. */
5374 				kcdata_compression_window_open(stackshot_kcdata_p);
5375 				kcd_exit_on_error(kcdata_get_memory_addr_for_array(stackshot_kcdata_p, STACKSHOT_KCTYPE_JETSAM_COALITION_SNAPSHOT, sizeof(struct jetsam_coalition_snapshot), num_coalitions, &out_addr));
5376 				coalitions = (struct jetsam_coalition_snapshot*)out_addr;
5377 
5378 				if (coalition_iterate_stackshot(stackshot_coalition_jetsam_snapshot, coalitions, COALITION_TYPE_JETSAM) != KERN_SUCCESS) {
5379 					error = KERN_FAILURE;
5380 					goto error_exit;
5381 				}
5382 
5383 				kcd_exit_on_error(kcdata_compression_window_close(stackshot_kcdata_p));
5384 			}
5385 		}
5386 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5387 		if (!stackshot_ctx.sc_panic_stackshot && (coalition_begin_cpu_cycle_count != 0)) {
5388 			kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - coalition_begin_cpu_cycle_count),
5389 			    "coalitions_cpu_cycle_count"));
5390 		}
5391 #endif /* SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI */
5392 	}
5393 #else
5394 	stackshot_flags &= ~(STACKSHOT_SAVE_JETSAM_COALITIONS);
5395 #endif /* CONFIG_COALITIONS */
5396 
5397 	stackshot_panic_guard();
5398 
5399 #if STACKSHOT_COLLECTS_LATENCY_INFO
5400 	if (stackshot_ctx.sc_is_singlethreaded) {
5401 		stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt = mach_absolute_time();
5402 	} else {
5403 		stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt = mach_absolute_time();
5404 	}
5405 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5406 
5407 	/*
5408 	 * Iterate over the tasks in the terminated tasks list. We only inspect
5409 	 * tasks that have a valid bsd_info pointer. Tasks transitioning through
5410 	 * proc_exit() (e.g. past P_LPEXIT) are detected inside
5411 	 * kdp_stackshot_record_task(), which then collects a safer, minimal
5412 	 * transitioning_task_snapshot struct via
5413 	 * kcdata_record_transitioning_task_snapshot().
5414 	 */
5415 	queue_iterate(&terminated_tasks, task, task_t, tasks)
5416 	{
5417 		stackshot_panic_guard();
5418 
5419 		if ((task->active && !task_is_a_corpse(task) && !task_is_a_corpse_fork(task)) ||
5420 		    (!queue_empty(&task->threads) && pid_from_task(task) != -1)) {
5421 			tasks_in_stackshot++;
5422 			threads_in_stackshot += task->thread_count;
5423 		}
5424 
5425 		/* Only use workqueues on non-panic and non-scoped stackshots. */
5426 		if (stackshot_ctx.sc_is_singlethreaded) {
5427 			kcd_exit_on_error(kdp_stackshot_record_task(task));
5428 		} else {
5429 			kcd_exit_on_error(stackshot_put_workitem((struct stackshot_workitem) {
5430 				.sswi_task = task,
5431 				.sswi_data = NULL,
5432 				.sswi_idx = cur_workitem_index++
5433 			}));
5434 		}
5435 	}
5436 
5437 	/* Mark the queue(s) as populated. */
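	/* The release store pairs with the acquire load of sswq_populated in stackshot_cpu_work_on_queue(). */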
5438 	for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
5439 		os_atomic_store(&stackshot_ctx.sc_workqueues[i].sswq_populated, true, release);
5440 	}
5441 
5442 #if DEVELOPMENT || DEBUG
5443 	kcd_exit_on_error(kdp_stackshot_plh_stats());
5444 #endif /* DEVELOPMENT || DEBUG */
5445 
5446 #if STACKSHOT_COLLECTS_LATENCY_INFO
5447 	if (stackshot_ctx.sc_is_singlethreaded) {
5448 		stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.total_terminated_task_iteration_latency_mt;
5449 	} else {
5450 		stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.terminated_task_queue_building_latency_mt;
5451 	}
5452 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5453 
5454 #if STACKSHOT_COLLECTS_LATENCY_INFO
5455 	if (collect_latency_info) {
5456 		stackshot_ctx.sc_latency.latency_version = 2;
5457 		stackshot_ctx.sc_latency.main_cpu_number = stackshot_ctx.sc_main_cpuid;
5458 		stackshot_ctx.sc_latency.calling_cpu_number = stackshot_ctx.sc_calling_cpuid;
5459 	}
5460 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5461 
5462 #if SCHED_HYGIENE_DEBUG && CONFIG_PERVASIVE_CPI
5463 	if (!stackshot_ctx.sc_panic_stackshot) {
5464 		kcd_exit_on_error(kcdata_add_uint64_with_description(stackshot_kcdata_p, (mt_cur_cpu_cycles() - stackshot_begin_cpu_cycle_count),
5465 		    "stackshot_total_cpu_cycle_cnt"));
5466 	}
5467 #endif
5468 
5469 	kcdata_add_uint64_with_description(stackshot_kcdata_p, tasks_in_stackshot, "stackshot_tasks_count");
5470 	kcdata_add_uint64_with_description(stackshot_kcdata_p, threads_in_stackshot, "stackshot_threads_count");
5471 
5472 	stackshot_panic_guard();
5473 
5474 	if (!stackshot_ctx.sc_is_singlethreaded) {
5475 		/* Chip away at the queue. */
5476 		stackshot_finalize_linked_kcdata();
5477 		stackshot_cpu_do_work();
5478 		*stackshot_kcdata_p = stackshot_cpu_ctx.scc_kcdata_tail->kcdata;
5479 	}
5480 
5481 #if CONFIG_EXCLAVES
5482 	/* If this is the panic stackshot, check if Exclaves panic left its stackshot in the shared region */
5483 	if (stackshot_ctx.sc_panic_stackshot) {
5484 		struct exclaves_panic_stackshot excl_ss;
5485 		kdp_read_panic_exclaves_stackshot(&excl_ss);
5486 
5487 		if (excl_ss.stackshot_buffer != NULL && excl_ss.stackshot_buffer_size != 0) {
5488 			tb_error_t tberr = TB_ERROR_SUCCESS;
5489 			exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_FOUND;
5490 
5491 			/* The unmarshal callback below runs synchronously (the block does not escape), so it is safe to capture a pointer to the on-stack `error`. */
5492 			kern_return_t *error_in_block = &error;
5493 			kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_BEGIN,
5494 			    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
5495 			tberr = stackshot_stackshotresult__unmarshal(excl_ss.stackshot_buffer, excl_ss.stackshot_buffer_size, ^(stackshot_stackshotresult_s result){
5496 				*error_in_block = stackshot_exclaves_process_stackshot(&result, stackshot_kcdata_p, false);
5497 			});
5498 			kcdata_add_container_marker(stackshot_kcdata_p, KCDATA_TYPE_CONTAINER_END,
5499 			    STACKSHOT_KCCONTAINER_EXCLAVES, 0);
5500 			if (tberr != TB_ERROR_SUCCESS) {
5501 				exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_DECODE_FAILED;
5502 			}
5503 		} else {
5504 			exclaves_panic_ss_status = EXCLAVES_PANIC_STACKSHOT_NOT_FOUND;
5505 		}
5506 
5507 		/* check error from the block */
5508 		kcd_exit_on_error(error);
5509 	}
5510 #endif
5511 
5512 	/*  === END of populating stackshot data === */
5513 error_exit:;
5514 	if (error != KERN_SUCCESS) {
5515 		stackshot_set_error(error);
5516 	}
5517 
5518 	stackshot_panic_guard();
5519 
5520 	return error;
5521 }
5522 
5523 static uint64_t
5524 proc_was_throttled_from_task(task_t task)
5525 {
5526 	uint64_t was_throttled = 0;
5527 	void *bsd_info = get_bsdtask_info(task);
5528 
5529 	if (bsd_info) {
5530 		was_throttled = proc_was_throttled(bsd_info);
5531 	}
5532 
5533 	return was_throttled;
5534 }
5535 
5536 static uint64_t
5537 proc_did_throttle_from_task(task_t task)
5538 {
5539 	uint64_t did_throttle = 0;
5540 	void *bsd_info = get_bsdtask_info(task);
5541 
5542 	if (bsd_info) {
5543 		did_throttle = proc_did_throttle(bsd_info);
5544 	}
5545 
5546 	return did_throttle;
5547 }
5548 
5549 static void
5550 kdp_mem_and_io_snapshot(struct mem_and_io_snapshot *memio_snap)
5551 {
5552 	unsigned int pages_reclaimed;
5553 	unsigned int pages_wanted;
5554 	kern_return_t kErr;
5555 
5556 	uint64_t compressions = 0;
5557 	uint64_t decompressions = 0;
5558 
5559 	compressions = counter_load(&vm_statistics_compressions);
5560 	decompressions = counter_load(&vm_statistics_decompressions);
5561 
5562 	memio_snap->snapshot_magic = STACKSHOT_MEM_AND_IO_SNAPSHOT_MAGIC;
5563 	memio_snap->free_pages = vm_page_free_count;
5564 	memio_snap->active_pages = vm_page_active_count;
5565 	memio_snap->inactive_pages = vm_page_inactive_count;
5566 	memio_snap->purgeable_pages = vm_page_purgeable_count;
5567 	memio_snap->wired_pages = vm_page_wire_count;
5568 	memio_snap->speculative_pages = vm_page_speculative_count;
5569 	memio_snap->throttled_pages = vm_page_throttled_count;
5570 	memio_snap->busy_buffer_count = count_busy_buffers();
5571 	memio_snap->filebacked_pages = vm_page_pageable_external_count;
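	/* The snapshot fields are 32-bit, so the 64-bit compression counters are truncated here. */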
5572 	memio_snap->compressions = (uint32_t)compressions;
5573 	memio_snap->decompressions = (uint32_t)decompressions;
5574 	memio_snap->compressor_size = VM_PAGE_COMPRESSOR_COUNT;
5575 	kErr = mach_vm_pressure_monitor(FALSE, VM_PRESSURE_TIME_WINDOW, &pages_reclaimed, &pages_wanted);
5576 
5577 	if (!kErr) {
5578 		memio_snap->pages_wanted = (uint32_t)pages_wanted;
5579 		memio_snap->pages_reclaimed = (uint32_t)pages_reclaimed;
5580 		memio_snap->pages_wanted_reclaimed_valid = 1;
5581 	} else {
5582 		memio_snap->pages_wanted = 0;
5583 		memio_snap->pages_reclaimed = 0;
5584 		memio_snap->pages_wanted_reclaimed_valid = 0;
5585 	}
5586 }
5587 
5588 static vm_offset_t
5589 stackshot_find_phys(vm_map_t map, vm_offset_t target_addr, kdp_fault_flags_t fault_flags, uint32_t *kdp_fault_result_flags)
5590 {
5591 	vm_offset_t result;
5592 	struct kdp_fault_result fault_results = {0};
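	/* Once this CPU has exceeded its fault-time budget (checked against stackshot_max_fault_time below), stop faulting for the remainder of the stackshot. */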
5593 	if (stackshot_cpu_ctx.scc_fault_stats.sfs_stopped_faulting) {
5594 		fault_flags &= ~KDP_FAULT_FLAGS_ENABLE_FAULTING;
5595 	}
5596 	if (!stackshot_ctx.sc_panic_stackshot) {
5597 		fault_flags |= KDP_FAULT_FLAGS_MULTICPU;
5598 	}
5599 
5600 	result = kdp_find_phys(map, target_addr, fault_flags, &fault_results);
5601 
5602 	if ((fault_results.flags & KDP_FAULT_RESULT_TRIED_FAULT) || (fault_results.flags & KDP_FAULT_RESULT_FAULTED_IN)) {
5603 		stackshot_cpu_ctx.scc_fault_stats.sfs_time_spent_faulting += fault_results.time_spent_faulting;
5604 
5605 #if STACKSHOT_COLLECTS_LATENCY_INFO
5606 		stackshot_cpu_latency.faulting_time_mt += fault_results.time_spent_faulting;
5607 #endif
5608 
5609 		if ((stackshot_cpu_ctx.scc_fault_stats.sfs_time_spent_faulting >= stackshot_max_fault_time) && !stackshot_ctx.sc_panic_stackshot) {
5610 			stackshot_cpu_ctx.scc_fault_stats.sfs_stopped_faulting = (uint8_t) TRUE;
5611 		}
5612 	}
5613 
5614 	if (fault_results.flags & KDP_FAULT_RESULT_FAULTED_IN) {
5615 		stackshot_cpu_ctx.scc_fault_stats.sfs_pages_faulted_in++;
5616 	}
5617 
5618 	if (kdp_fault_result_flags) {
5619 		*kdp_fault_result_flags = fault_results.flags;
5620 	}
5621 
5622 	return result;
5623 }
5624 
5625 /*
5626  * Wrappers around kdp_generic_copyin, kdp_generic_copyin_word, kdp_generic_copyin_string that use stackshot_find_phys
5627  * in order to:
5628  *   1. collect statistics on the number of pages faulted in
5629  *   2. stop faulting if the time spent faulting has exceeded the limit.
5630  */
5631 static boolean_t
5632 stackshot_copyin(vm_map_t map, uint64_t uaddr, void *dest, size_t size, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5633 {
5634 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5635 	if (try_fault) {
5636 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5637 	}
5638 	return kdp_generic_copyin(map, uaddr, dest, size, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags) == KERN_SUCCESS;
5639 }
5640 static boolean_t
5641 stackshot_copyin_word(task_t task, uint64_t addr, uint64_t *result, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5642 {
5643 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5644 	if (try_fault) {
5645 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5646 	}
5647 	return kdp_generic_copyin_word(task, addr, result, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags) == KERN_SUCCESS;
5648 }
5649 static int
5650 stackshot_copyin_string(task_t task, uint64_t addr, char *buf, int buf_sz, boolean_t try_fault, kdp_fault_result_flags_t *kdp_fault_result_flags)
5651 {
5652 	kdp_fault_flags_t fault_flags = KDP_FAULT_FLAGS_NONE;
5653 	if (try_fault) {
5654 		fault_flags |= KDP_FAULT_FLAGS_ENABLE_FAULTING;
5655 	}
5656 	return kdp_generic_copyin_string(task, addr, buf, buf_sz, fault_flags, (find_phys_fn_t)stackshot_find_phys, kdp_fault_result_flags);
5657 }
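
/*
 * Illustrative (not from the source) usage of the copyin wrappers, e.g. to
 * read a pointer-sized value out of a user address space during a stackshot;
 * `user_addr` and `allow_faulting` are hypothetical:
 *
 *	uint64_t value = 0;
 *	if (!stackshot_copyin_word(task, user_addr, &value, allow_faulting, NULL)) {
 *		// mark the data as unavailable rather than failing the stackshot
 *	}
 */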
5658 
5659 kern_return_t
5660 do_stackshot(void *context)
5661 {
5662 #pragma unused(context)
5663 	kern_return_t error;
5664 	size_t queue_size;
5665 	uint64_t abs_time = mach_absolute_time(), abs_time_end = 0;
5666 	kdp_snapshot++;
5667 
5668 	_stackshot_validation_reset();
5669 	error = stackshot_plh_setup(); /* set up port label hash */
5670 
5671 	if (!stackshot_ctx.sc_is_singlethreaded) {
5672 		/* Set up queues. The task counts shouldn't change, but slightly fudge the queue size just in case. */
5673 		queue_size = FUDGED_SIZE(tasks_count + terminated_tasks_count, 10);
5674 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
5675 			stackshot_ctx.sc_workqueues[i] = (struct stackshot_workqueue) {
5676 				.sswq_items     = stackshot_alloc_arr(struct stackshot_workitem, queue_size, &error),
5677 				.sswq_capacity  = queue_size,
5678 				.sswq_num_items = 0,
5679 				.sswq_cur_item  = 0,
5680 				.sswq_populated = false
5681 			};
5682 			if (error != KERN_SUCCESS) {
5683 				break;
5684 			}
5685 		}
5686 	}
5687 
5688 	if (error != KERN_SUCCESS) {
5689 		stackshot_set_error(error);
5690 		return error;
5691 	}
5692 
5693 	/*
5694 	 * If no main CPU has been selected at this point (every CPU has called
5695 	 * stackshot_cpu_preflight by now), then no CLPC-recommended P-core was
5696 	 * available. In that case, volunteer this CPU to be the main CPU,
5697 	 * because someone has to do it.
5698 	 */
5699 	if (stackshot_ctx.sc_main_cpuid == -1) {
5700 		os_atomic_cmpxchg(&stackshot_ctx.sc_main_cpuid, -1, cpu_number(), acquire);
5701 		stackshot_cpu_ctx.scc_can_work = true;
5702 	}
5703 
5704 	/* After this, auxiliary CPUs can begin work. */
5705 	os_atomic_store(&stackshot_ctx.sc_state, SS_RUNNING, release);
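	/* The release store pairs with the acquire loads in stackshot_aux_cpu_entry(), which spin while sc_state == SS_SETUP. */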
5706 
5707 	/* If we are the main CPU, populate the queues / do other main CPU work. */
5708 	if (stackshot_ctx.sc_panic_stackshot || (stackshot_ctx.sc_main_cpuid == cpu_number())) {
5709 		stackshot_ctx.sc_retval = kdp_stackshot_kcdata_format();
5710 	} else if (stackshot_cpu_ctx.scc_can_work) {
5711 		stackshot_cpu_do_work();
5712 	}
5713 
5714 	/* Wait for every CPU to finish. */
5715 #if STACKSHOT_COLLECTS_LATENCY_INFO
5716 	stackshot_ctx.sc_latency.cpu_wait_latency_mt = mach_absolute_time();
5717 #endif
5718 	if (stackshot_cpu_ctx.scc_can_work) {
5719 		os_atomic_dec(&stackshot_ctx.sc_cpus_working, seq_cst);
5720 		stackshot_cpu_ctx.scc_can_work = false;
5721 	}
5722 	while (os_atomic_load(&stackshot_ctx.sc_cpus_working, seq_cst) != 0) {
5723 		loop_wait();
5724 	}
5725 	stackshot_panic_guard();
5726 #if STACKSHOT_COLLECTS_LATENCY_INFO
5727 	stackshot_ctx.sc_latency.cpu_wait_latency_mt = mach_absolute_time() - stackshot_ctx.sc_latency.cpu_wait_latency_mt;
5728 #endif
5729 
5730 	/* update timestamp of the stackshot */
5731 	abs_time_end = mach_absolute_time();
5732 	stackshot_ctx.sc_duration = (struct stackshot_duration_v2) {
5733 		.stackshot_duration       = (abs_time_end - abs_time),
5734 		.stackshot_duration_outer = 0,
5735 		.stackshot_duration_prior = stackshot_duration_prior_abs,
5736 	};
5737 
5738 	stackshot_plh_reset();
5739 
5740 	/* Check interrupts disabled time. */
5741 #if SCHED_HYGIENE_DEBUG
5742 	bool disable_interrupts_masked_check = kern_feature_override(
5743 		KF_INTERRUPT_MASKED_DEBUG_STACKSHOT_OVRD) ||
5744 	    (stackshot_flags & STACKSHOT_DO_COMPRESS) != 0;
5745 
5746 #if STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED
5747 	disable_interrupts_masked_check = true;
5748 #endif /* STACKSHOT_INTERRUPTS_MASKED_CHECK_DISABLED */
5749 
5750 	if (disable_interrupts_masked_check) {
5751 		ml_spin_debug_clear_self();
5752 	}
5753 
5754 	if (!stackshot_ctx.sc_panic_stackshot && interrupt_masked_debug_mode) {
5755 		/*
5756 		 * Try to catch instances where stackshot takes too long BEFORE returning from
5757 		 * the debugger
5758 		 */
5759 		ml_handle_stackshot_interrupt_disabled_duration(current_thread());
5760 	}
5761 #endif /* SCHED_HYGIENE_DEBUG */
5762 
5763 	kdp_snapshot--;
5764 
5765 	/* If any other CPU had an error, make sure we return it */
5766 	if (stackshot_ctx.sc_retval == KERN_SUCCESS) {
5767 		stackshot_ctx.sc_retval = stackshot_status_check();
5768 	}
5769 
5770 #if CONFIG_EXCLAVES
5771 	/* Avoid setting AST until as late as possible, in case the stackshot fails */
5772 	if (!stackshot_ctx.sc_panic_stackshot && stackshot_ctx.sc_retval == KERN_SUCCESS) {
5773 		commit_exclaves_ast();
5774 	}
5775 	if (stackshot_ctx.sc_retval != KERN_SUCCESS && stackshot_exclave_inspect_ctids) {
5776 		/* Clear inspection CTID list: no need to wait for these threads */
5777 		stackshot_exclave_inspect_ctid_count = 0;
5778 		stackshot_exclave_inspect_ctid_capacity = 0;
5779 		stackshot_exclave_inspect_ctids = NULL;
5780 	}
5781 #endif
5782 
5783 	/* If this is a singlethreaded stackshot, the "final" kcdata buffer is just our CPU's kcdata buffer */
5784 	if (stackshot_ctx.sc_is_singlethreaded) {
5785 		stackshot_ctx.sc_finalized_kcdata = stackshot_kcdata_p;
5786 	}
5787 
5788 	return stackshot_ctx.sc_retval;
5789 }
5790 
5791 kern_return_t
5792 do_panic_stackshot(void *context)
5793 {
5794 	kern_return_t ret = do_stackshot(context);
5795 	if (ret != KERN_SUCCESS) {
5796 		goto out;
5797 	}
5798 
5799 	ret = stackshot_finalize_singlethreaded_kcdata();
5800 
5801 out:
5802 	return ret;
5803 }
5804 
5805 /*
5806  * Set up needed state for this CPU before participating in a stackshot.
5807  * Namely, we want to signal that we're available to do work.
5808  * Called while interrupts are disabled & in the debugger trap.
5809  */
5810 void
5811 stackshot_cpu_preflight(void)
5812 {
5813 	bool is_recommended, is_calling_cpu;
5814 	int my_cpu_no = cpu_number();
5815 
5816 #if STACKSHOT_COLLECTS_LATENCY_INFO
5817 	stackshot_cpu_latency = (typeof(stackshot_cpu_latency)) {
5818 		.cpu_number            =  cpu_number(),
5819 #if defined(__AMP__)
5820 		.cluster_type          =  current_cpu_datap()->cpu_cluster_type,
5821 #else /* __AMP__ */
5822 		.cluster_type = CLUSTER_TYPE_SMP,
5823 #endif /* __AMP__ */
5824 		.faulting_time_mt      = 0,
5825 		.total_buf             = 0,
5826 		.intercluster_buf_used = 0
5827 	};
5828 #if CONFIG_PERVASIVE_CPI
5829 	mt_cur_cpu_cycles_instrs_speculative(&stackshot_cpu_latency.total_cycles, &stackshot_cpu_latency.total_instrs);
5830 #endif /* CONFIG_PERVASIVE_CPI */
5831 	stackshot_cpu_latency.init_latency_mt = stackshot_cpu_latency.total_latency_mt = mach_absolute_time();
5832 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5833 
5834 	is_recommended = current_processor()->is_recommended;
5835 
5836 	/* If this is a recommended P-core (or SMP), try making it the main CPU */
5837 	if (is_recommended
5838 #if defined(__AMP__)
5839 	    && current_cpu_datap()->cpu_cluster_type == CLUSTER_TYPE_P
5840 #endif /* __AMP__ */
5841 	    ) {
5842 		os_atomic_cmpxchg(&stackshot_ctx.sc_main_cpuid, -1, my_cpu_no, acquire);
5843 	}
5844 
5845 	is_calling_cpu = stackshot_ctx.sc_calling_cpuid == my_cpu_no;
5846 
5847 	stackshot_cpu_ctx.scc_did_work = false;
5848 	stackshot_cpu_ctx.scc_can_work = is_calling_cpu || (is_recommended && !stackshot_ctx.sc_is_singlethreaded);
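	/* The calling CPU always participates; other CPUs only help when they are recommended and the stackshot is multithreaded. */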
5849 
5850 	if (stackshot_cpu_ctx.scc_can_work) {
5851 		os_atomic_inc(&stackshot_ctx.sc_cpus_working, relaxed);
5852 	}
5853 }
5854 
5855 __result_use_check
5856 static kern_return_t
5857 stackshot_cpu_work_on_queue(struct stackshot_workqueue *queue)
5858 {
5859 	struct stackshot_workitem     *cur_workitemp;
5860 	kern_return_t                  error = KERN_SUCCESS;
5861 
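	/*
	 * Keep pulling work items until the queue is both drained and marked
	 * populated; an empty but not-yet-populated queue means the main CPU is
	 * still enqueueing tasks, so spin-wait instead of returning.
	 */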
5862 	while (((cur_workitemp = stackshot_get_workitem(queue)) != NULL || !os_atomic_load(&queue->sswq_populated, acquire))) {
5863 		/* Check to make sure someone hasn't errored out or panicked. */
5864 		if (__improbable(stackshot_status_check() != KERN_SUCCESS)) {
5865 			return KERN_ABORTED;
5866 		}
5867 
5868 		if (cur_workitemp) {
5869 			kcd_exit_on_error(stackshot_new_linked_kcdata());
5870 			cur_workitemp->sswi_data = stackshot_cpu_ctx.scc_kcdata_head;
5871 			kcd_exit_on_error(kdp_stackshot_record_task(cur_workitemp->sswi_task));
5872 			stackshot_finalize_linked_kcdata();
5873 		} else {
5874 #if STACKSHOT_COLLECTS_LATENCY_INFO
5875 			uint64_t time_begin = mach_absolute_time();
5876 #endif
5877 			loop_wait();
5878 #if STACKSHOT_COLLECTS_LATENCY_INFO
5879 			stackshot_cpu_latency.workqueue_latency_mt += mach_absolute_time() - time_begin;
5880 #endif
5881 		}
5882 	}
5883 
5884 error_exit:
5885 	return error;
5886 }
5887 
5888 static void
5889 stackshot_cpu_do_work(void)
5890 {
5891 	kern_return_t                  error;
5892 
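	/* Per-CPU scratch buffer of up to MAX_FRAMES frames, presumably reused for each thread backtrace this CPU records. */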
5893 	stackshot_cpu_ctx.scc_stack_buffer = stackshot_alloc_arr(uintptr_t, MAX_FRAMES, &error);
5894 	if (error != KERN_SUCCESS) {
5895 		goto error_exit;
5896 	}
5897 
5898 #if STACKSHOT_COLLECTS_LATENCY_INFO
5899 	stackshot_cpu_latency.init_latency_mt = mach_absolute_time() - stackshot_cpu_latency.init_latency_mt;
5900 #endif
5901 
5902 	bool high_perf = true;
5903 
5904 #if defined(__AMP__)
5905 	if (current_cpu_datap()->cpu_cluster_type == CLUSTER_TYPE_E) {
5906 		high_perf = false;
5907 	}
5908 #endif /* __AMP__ */
5909 
5910 	if (high_perf) {
5911 		/* Non-E cores: Work from most difficult to least difficult */
5912 		for (size_t i = STACKSHOT_NUM_WORKQUEUES; i > 0; i--) {
5913 			kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i - 1]));
5914 		}
5915 	} else {
5916 		/* E: Work from least difficult to most difficult */
5917 		for (size_t i = 0; i < STACKSHOT_NUM_WORKQUEUES; i++) {
5918 			kcd_exit_on_error(stackshot_cpu_work_on_queue(&stackshot_ctx.sc_workqueues[i]));
5919 		}
5920 	}
5921 #if STACKSHOT_COLLECTS_LATENCY_INFO
5922 	stackshot_cpu_latency.total_latency_mt = mach_absolute_time() - stackshot_cpu_latency.total_latency_mt;
5923 #if CONFIG_PERVASIVE_CPI
5924 	uint64_t cycles, instrs;
5925 	mt_cur_cpu_cycles_instrs_speculative(&cycles, &instrs);
5926 	stackshot_cpu_latency.total_cycles = cycles - stackshot_cpu_latency.total_cycles;
5927 	stackshot_cpu_latency.total_instrs = instrs - stackshot_cpu_latency.total_instrs;
5928 #endif /* CONFIG_PERVASIVE_CPI */
5929 #endif /* STACKSHOT_COLLECTS_LATENCY_INFO */
5930 
5931 error_exit:
5932 	if (error != KERN_SUCCESS) {
5933 		stackshot_set_error(error);
5934 	}
5935 	stackshot_panic_guard();
5936 }
5937 
5938 /*
5939  * This is where the other CPUs will end up when we take a stackshot.
5940  * If they're available to do work, they'll do so here.
5941  * Called with interrupts disabled & from the debugger trap.
5942  */
5943 void
5944 stackshot_aux_cpu_entry(void)
5945 {
5946 	/*
5947 	 * This is where the other CPUs will end up when we take a stackshot.
5948 	 * Also, the main CPU will call this in the middle of its work to chip
5949 	 * away at the queue.
5950 	 */
5951 
5952 	/* Don't do work if we said we couldn't... */
5953 	if (!stackshot_cpu_ctx.scc_can_work) {
5954 		return;
5955 	}
5956 
5957 	/* Spin until we're ready to run. */
5958 	while (os_atomic_load(&stackshot_ctx.sc_state, acquire) == SS_SETUP) {
5959 		loop_wait();
5960 	}
5961 
5962 	/* Check to make sure the setup didn't error out or panic. */
5963 	if (stackshot_status_check() != KERN_SUCCESS) {
5964 		goto exit;
5965 	}
5966 
5967 	/* the CPU entering here is participating in the stackshot */
5968 	stackshot_cpu_ctx.scc_did_work = true;
5969 
5970 	if (stackshot_ctx.sc_main_cpuid == cpu_number()) {
5971 		stackshot_ctx.sc_retval = kdp_stackshot_kcdata_format();
5972 	} else {
5973 		stackshot_cpu_do_work();
5974 	}
5975 
5976 exit:
5977 	os_atomic_dec(&stackshot_ctx.sc_cpus_working, release);
5978 }
5979 
5980 boolean_t
5981 stackshot_thread_is_idle_worker_unsafe(thread_t thread)
5982 {
5983 	/* When the pthread kext puts a worker thread to sleep, it will
5984 	 * set kThreadWaitParkedWorkQueue in the block_hint of the thread
5985 	 * struct. See parkit() in kern/kern_support.c in libpthread.
5986 	 */
5987 	return (thread->state & TH_WAIT) &&
5988 	       (thread->block_hint == kThreadWaitParkedWorkQueue);
5989 }
5990 
5991 #if CONFIG_COALITIONS
5992 static void
5993 stackshot_coalition_jetsam_count(void *arg, int i, coalition_t coal)
5994 {
5995 #pragma unused(i, coal)
5996 	unsigned int *coalition_count = (unsigned int*)arg;
5997 	(*coalition_count)++;
5998 }
5999 
6000 static void
6001 stackshot_coalition_jetsam_snapshot(void *arg, int i, coalition_t coal)
6002 {
6003 	if (coalition_type(coal) != COALITION_TYPE_JETSAM) {
6004 		return;
6005 	}
6006 
6007 	struct jetsam_coalition_snapshot *coalitions = (struct jetsam_coalition_snapshot*)arg;
6008 	struct jetsam_coalition_snapshot *jcs = &coalitions[i];
6009 	task_t leader = TASK_NULL;
6010 	jcs->jcs_id = coalition_id(coal);
6011 	jcs->jcs_flags = 0;
6012 	jcs->jcs_thread_group = 0;
6013 
6014 	if (coalition_term_requested(coal)) {
6015 		jcs->jcs_flags |= kCoalitionTermRequested;
6016 	}
6017 	if (coalition_is_terminated(coal)) {
6018 		jcs->jcs_flags |= kCoalitionTerminated;
6019 	}
6020 	if (coalition_is_reaped(coal)) {
6021 		jcs->jcs_flags |= kCoalitionReaped;
6022 	}
6023 	if (coalition_is_privileged(coal)) {
6024 		jcs->jcs_flags |= kCoalitionPrivileged;
6025 	}
6026 
6027 #if CONFIG_THREAD_GROUPS
6028 	struct thread_group *thread_group = kdp_coalition_get_thread_group(coal);
6029 	if (thread_group) {
6030 		jcs->jcs_thread_group = thread_group_get_id(thread_group);
6031 	}
6032 #endif /* CONFIG_THREAD_GROUPS */
6033 
6034 	leader = kdp_coalition_get_leader(coal);
6035 	if (leader) {
6036 		jcs->jcs_leader_task_uniqueid = get_task_uniqueid(leader);
6037 	} else {
6038 		jcs->jcs_leader_task_uniqueid = 0;
6039 	}
6040 }
6041 #endif /* CONFIG_COALITIONS */
6042 
6043 #if CONFIG_THREAD_GROUPS
6044 static void
6045 stackshot_thread_group_count(void *arg, int i, struct thread_group *tg)
6046 {
6047 #pragma unused(i, tg)
6048 	unsigned int *n = (unsigned int*)arg;
6049 	(*n)++;
6050 }
6051 
6052 static void
6053 stackshot_thread_group_snapshot(void *arg, int i, struct thread_group *tg)
6054 {
6055 	struct thread_group_snapshot_v3 *thread_groups = arg;
6056 	struct thread_group_snapshot_v3 *tgs = &thread_groups[i];
6057 	const char *name = thread_group_get_name(tg);
6058 	uint32_t flags = thread_group_get_flags(tg);
6059 	tgs->tgs_id = thread_group_get_id(tg);
6060 	static_assert(THREAD_GROUP_MAXNAME > sizeof(tgs->tgs_name));
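	/* Group names longer than tgs_name continue in tgs_name_cont; the static_assert above keeps the second copy's source within the name buffer. */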
6061 	kdp_memcpy(tgs->tgs_name, name, sizeof(tgs->tgs_name));
6062 	kdp_memcpy(tgs->tgs_name_cont, name + sizeof(tgs->tgs_name),
6063 	    sizeof(tgs->tgs_name_cont));
6064 	tgs->tgs_flags =
6065 	    ((flags & THREAD_GROUP_FLAGS_EFFICIENT)     ? kThreadGroupEfficient     : 0) |
6066 	    ((flags & THREAD_GROUP_FLAGS_APPLICATION)   ? kThreadGroupApplication   : 0) |
6067 	    ((flags & THREAD_GROUP_FLAGS_CRITICAL)      ? kThreadGroupCritical      : 0) |
6068 	    ((flags & THREAD_GROUP_FLAGS_BEST_EFFORT)   ? kThreadGroupBestEffort    : 0) |
6069 	    ((flags & THREAD_GROUP_FLAGS_UI_APP)        ? kThreadGroupUIApplication : 0) |
6070 	    ((flags & THREAD_GROUP_FLAGS_MANAGED)       ? kThreadGroupManaged       : 0) |
6071 	    ((flags & THREAD_GROUP_FLAGS_STRICT_TIMERS) ? kThreadGroupStrictTimers  : 0) |
6072 	    0;
6073 }
6074 #endif /* CONFIG_THREAD_GROUPS */
6075 
6076 /* Determine if a thread has waitinfo that stackshot can provide */
6077 static int
6078 stackshot_thread_has_valid_waitinfo(thread_t thread)
6079 {
6080 	if (!(thread->state & TH_WAIT)) {
6081 		return 0;
6082 	}
6083 
6084 	switch (thread->block_hint) {
6085 	// If set to None or is a parked work queue, ignore it
6086 	case kThreadWaitParkedWorkQueue:
6087 	case kThreadWaitNone:
6088 		return 0;
6089 	// There is a short window where the pthread kext removes a thread
6090 	// from its ksyn wait queue before waking the thread up
6091 	case kThreadWaitPThreadMutex:
6092 	case kThreadWaitPThreadRWLockRead:
6093 	case kThreadWaitPThreadRWLockWrite:
6094 	case kThreadWaitPThreadCondVar:
6095 		return kdp_pthread_get_thread_kwq(thread) != NULL;
6096 	// All other cases are valid block hints if in a wait state
6097 	default:
6098 		return 1;
6099 	}
6100 }
6101 
6102 /* Determine if a thread has turnstileinfo that stackshot can provide */
6103 static int
6104 stackshot_thread_has_valid_turnstileinfo(thread_t thread)
6105 {
6106 	struct turnstile *ts = thread_get_waiting_turnstile(thread);
6107 
6108 	return stackshot_thread_has_valid_waitinfo(thread) &&
6109 	       ts != TURNSTILE_NULL;
6110 }
6111 
6112 static void
6113 stackshot_thread_turnstileinfo(thread_t thread, thread_turnstileinfo_v2_t *tsinfo)
6114 {
6115 	struct turnstile *ts;
6116 	struct ipc_service_port_label *ispl = NULL;
6117 
6118 	/* acquire turnstile information and store it in the stackshot */
6119 	ts = thread_get_waiting_turnstile(thread);
6120 	tsinfo->waiter = thread_tid(thread);
6121 	kdp_turnstile_fill_tsinfo(ts, tsinfo, &ispl);
6122 	tsinfo->portlabel_id = stackshot_plh_lookup(ispl,
6123 	    (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_SENDPORT) ? STACKSHOT_PLH_LOOKUP_SEND :
6124 	    (tsinfo->turnstile_flags & STACKSHOT_TURNSTILE_STATUS_RECEIVEPORT) ? STACKSHOT_PLH_LOOKUP_RECEIVE :
6125 	    STACKSHOT_PLH_LOOKUP_UNKNOWN);
6126 }
6127 
6128 static void
6129 stackshot_thread_wait_owner_info(thread_t thread, thread_waitinfo_v2_t *waitinfo)
6130 {
6131 	thread_waitinfo_t *waitinfo_v1 = (thread_waitinfo_t *)waitinfo;
6132 	struct ipc_service_port_label *ispl = NULL;
6133 
6134 	waitinfo->waiter        = thread_tid(thread);
6135 	waitinfo->wait_type     = thread->block_hint;
6136 	waitinfo->wait_flags    = 0;
6137 
6138 	switch (waitinfo->wait_type) {
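	/*
	 * Dispatch on the block hint to the subsystem-specific kdp_*_find_owner
	 * helper, which fills in the owner (and, for ports, portlabel) details.
	 */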
6139 	case kThreadWaitKernelMutex:
6140 		kdp_lck_mtx_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6141 		break;
6142 	case kThreadWaitPortReceive:
6143 		kdp_mqueue_recv_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo, &ispl);
6144 		waitinfo->portlabel_id  = stackshot_plh_lookup(ispl, STACKSHOT_PLH_LOOKUP_RECEIVE);
6145 		break;
6146 	case kThreadWaitPortSend:
6147 		kdp_mqueue_send_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo, &ispl);
6148 		waitinfo->portlabel_id  = stackshot_plh_lookup(ispl, STACKSHOT_PLH_LOOKUP_SEND);
6149 		break;
6150 	case kThreadWaitSemaphore:
6151 		kdp_sema_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6152 		break;
6153 	case kThreadWaitUserLock:
6154 		kdp_ulock_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6155 		break;
6156 	case kThreadWaitKernelRWLockRead:
6157 	case kThreadWaitKernelRWLockWrite:
6158 	case kThreadWaitKernelRWLockUpgrade:
6159 		kdp_rwlck_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6160 		break;
6161 	case kThreadWaitPThreadMutex:
6162 	case kThreadWaitPThreadRWLockRead:
6163 	case kThreadWaitPThreadRWLockWrite:
6164 	case kThreadWaitPThreadCondVar:
6165 		kdp_pthread_find_owner(thread, waitinfo_v1);
6166 		break;
6167 	case kThreadWaitWorkloopSyncWait:
6168 		kdp_workloop_sync_wait_find_owner(thread, thread->wait_event, waitinfo_v1);
6169 		break;
6170 	case kThreadWaitOnProcess:
6171 		kdp_wait4_find_process(thread, thread->wait_event, waitinfo_v1);
6172 		break;
6173 	case kThreadWaitSleepWithInheritor:
6174 		kdp_sleep_with_inheritor_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6175 		break;
6176 	case kThreadWaitEventlink:
6177 		kdp_eventlink_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6178 		break;
6179 	case kThreadWaitCompressor:
6180 		kdp_compressor_busy_find_owner(thread->wait_event, waitinfo_v1);
6181 		break;
6182 #ifdef CONFIG_EXCLAVES
6183 	case kThreadWaitExclaveCore:
6184 	case kThreadWaitExclaveKit:
6185 		kdp_esync_find_owner(thread->waitq.wq_q, thread->wait_event, waitinfo_v1);
6186 		break;
6187 #endif /* CONFIG_EXCLAVES */
6188 	case kThreadWaitPageBusy:
6189 		kdp_vm_page_sleep_find_owner(thread->wait_event, waitinfo_v1);
6190 		break;
6191 	case kThreadWaitPagingInProgress:
6192 	case kThreadWaitPagingActivity:
6193 	case kThreadWaitPagerInit:
6194 	case kThreadWaitPagerReady:
6195 	case kThreadWaitMemoryBlocked:
6196 	case kThreadWaitPageInThrottle:
6197 		kdp_vm_object_sleep_find_owner(thread->wait_event, waitinfo->wait_type, waitinfo_v1);
6198 		break;
6199 	default:
6200 		waitinfo->owner = 0;
6201 		waitinfo->context = 0;
6202 		break;
6203 	}
6204 }
6205