/* xref: /xnu-8019.80.24/bsd/kern/kdebug_common.c (revision a325d9c4a84054e40bbe985afedcb50ab80993ea) */
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @Apple_LICENSE_HEADER_START@
5  *
6  * The contents of this file constitute Original Code as defined in and
7  * are subject to the Apple Public Source License Version 1.1 (the
8  * "License").  You may not use this file except in compliance with the
9  * License.  Please obtain a copy of the License at
10  * http://www.apple.com/publicsource and read it before using this file.
11  *
12  * This Original Code and all software distributed under the License are
13  * distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY KIND, EITHER
14  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
15  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE OR NON-INFRINGEMENT.  Please see the
17  * License for the specific language governing rights and limitations
18  * under the License.
19  *
20  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
21  */
22 
23 #include <sys/kdebug_common.h>
24 
25 LCK_GRP_DECLARE(kdebug_lck_grp, "kdebug");
26 int kdbg_debug = 0;
27 
28 extern struct kd_ctrl_page_t kd_ctrl_page_trace, kd_ctrl_page_triage;
29 
void
kdebug_lck_init(void)
{
	/*
	 * Initialize the spin locks that protect the storage-unit free lists
	 * for both kdebug clients: TRACE (kernel tracing) and TRIAGE.
	 */
	lck_spin_init(&kd_ctrl_page_trace.kds_spin_lock, &kdebug_lck_grp, LCK_ATTR_NULL);
	lck_spin_init(&kd_ctrl_page_triage.kds_spin_lock, &kdebug_lck_grp, LCK_ATTR_NULL);
}
36 
37 inline int
_storage_lock(struct kd_ctrl_page_t * kd_ctrl_page)38 _storage_lock(struct kd_ctrl_page_t *kd_ctrl_page)
39 {
40 	int intrs_en = ml_set_interrupts_enabled(false);
41 	lck_spin_lock_grp(&kd_ctrl_page->kds_spin_lock, &kdebug_lck_grp);
42 	return intrs_en;
43 }
44 
inline void
_storage_unlock(struct kd_ctrl_page_t *kd_ctrl_page, int intrs_en)
{
	/*
	 * Release the storage spin lock, then restore the interrupt state
	 * saved by the matching _storage_lock() call.  The order matters:
	 * interrupts stay masked until the lock is dropped.
	 */
	lck_spin_unlock(&kd_ctrl_page->kds_spin_lock);
	ml_set_interrupts_enabled(intrs_en);
}
51 
52 
53 extern int kernel_debug_trace_write_to_file(user_addr_t *buffer, size_t *number, size_t *count, size_t tempbuf_number, vnode_t vp, vfs_context_t ctx, uint32_t file_version);
54 
55 uint32_t
kdbg_cpu_count(bool early_trace)56 kdbg_cpu_count(bool early_trace)
57 {
58 	if (early_trace) {
59 #if defined(__x86_64__)
60 		return max_ncpus;
61 #else /* defined(__x86_64__) */
62 		return ml_get_cpu_count();
63 #endif /* !defined(__x86_64__) */
64 	}
65 
66 #if defined(__x86_64__)
67 	host_basic_info_data_t hinfo;
68 	mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
69 	host_info((host_t)1 /* BSD_HOST */, HOST_BASIC_INFO, (host_info_t)&hinfo, &count);
70 	assert(hinfo.logical_cpu_max > 0);
71 	return hinfo.logical_cpu_max;
72 #else /* defined(__x86_64__) */
73 	return ml_get_topology_info()->max_cpu_id + 1;
74 #endif /* !defined(__x86_64__) */
75 }
76 
77 /*
78  * Both kdebug_timestamp and kdebug_using_continuous_time are known
79  * to kexts. And going forward we always want to use mach_continuous_time().
80  * So we keep these 2 routines as-is to keep the TRACE mode use outside
81  * the kernel intact. TRIAGE mode will explicitly only use mach_continuous_time()
82  * for its timestamp.
83  */
84 bool
kdebug_using_continuous_time(void)85 kdebug_using_continuous_time(void)
86 {
87 	return kd_ctrl_page_trace.kdebug_flags & KDBG_CONTINUOUS_TIME;
88 }
89 
uint64_t
kdebug_timestamp(void)
{
	/*
	 * Select the clock matching the configured TRACE timestamp mode:
	 * mach_continuous_time() advances across sleep, mach_absolute_time()
	 * does not.
	 */
	return kdebug_using_continuous_time() ?
	       mach_continuous_time() : mach_absolute_time();
}
99 
100 
/*
 * Allocate and initialize the kdebug storage for one client (TRACE or
 * TRIAGE): the per-CPU bookkeeping array (kdbip), the user-copy staging
 * buffer (kdcopybuf), and the storage-unit buffers themselves, which are
 * carved into a free list threaded through kds_next pointers.
 *
 * Returns 0 on success or ENOSPC if any allocation fails; on failure every
 * partially-constructed allocation is torn down via delete_buffers().
 */
int
create_buffers(struct kd_ctrl_page_t *kd_ctrl_page, struct kd_data_page_t *kd_data_page, vm_tag_t tag)
{
	unsigned int i;
	unsigned int p_buffer_size;
	unsigned int f_buffer_size;
	unsigned int f_buffers;
	int error = 0;
	int ncpus, count_storage_units = 0;
	vm_size_t kd_buf_size;

	struct kd_bufinfo *kdbip = NULL;
	struct kd_storage_buffers *kd_bufs = NULL;
	int n_storage_units = kd_data_page->n_storage_units;

	ncpus = kd_ctrl_page->kdebug_cpus;

	/* Per-CPU bookkeeping: one kd_bufinfo per CPU. */
	if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kdbip, sizeof(struct kd_bufinfo) * ncpus, tag) != KERN_SUCCESS) {
		error = ENOSPC;
		goto out;
	}
	kdbip = kd_data_page->kdbip;

	/*
	 * Storage units are grouped into fixed-size buffers of
	 * N_STORAGE_UNITS_PER_BUFFER units each; a final partial buffer
	 * (p_buffer_size) holds the remainder, if any.
	 */
	f_buffers = n_storage_units / N_STORAGE_UNITS_PER_BUFFER;
	kd_data_page->n_storage_buffer = f_buffers;

	f_buffer_size = N_STORAGE_UNITS_PER_BUFFER * sizeof(struct kd_storage);
	p_buffer_size = (n_storage_units % N_STORAGE_UNITS_PER_BUFFER) * sizeof(struct kd_storage);

	if (p_buffer_size) {
		kd_data_page->n_storage_buffer++;
	}

	/* The copy buffer may persist from a previous setup; allocate lazily. */
	if (kd_data_page->kdcopybuf == 0) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kdcopybuf, (vm_size_t) kd_ctrl_page->kdebug_kdcopybuf_size, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}
	}

	/* Descriptor array: one kd_storage_buffers entry per storage buffer. */
	kd_buf_size = (vm_size_t) (kd_data_page->n_storage_buffer * sizeof(struct kd_storage_buffers));
	if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_data_page->kd_bufs, kd_buf_size, tag) != KERN_SUCCESS) {
		error = ENOSPC;
		goto out;
	}
	kd_bufs = kd_data_page->kd_bufs;
	bzero(kd_bufs, kd_buf_size);

	/* Allocate the full-size storage buffers. */
	for (i = 0; i < f_buffers; i++) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_bufs[i].kdsb_addr, (vm_size_t)f_buffer_size, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}
		bzero(kd_bufs[i].kdsb_addr, f_buffer_size);

		kd_bufs[i].kdsb_size = f_buffer_size;
	}
	/* Allocate the trailing partial buffer (i is left at f_buffers here). */
	if (p_buffer_size) {
		if (kmem_alloc(kernel_map, (vm_offset_t *)&kd_bufs[i].kdsb_addr, (vm_size_t)p_buffer_size, tag) != KERN_SUCCESS) {
			error = ENOSPC;
			goto out;
		}
		bzero(kd_bufs[i].kdsb_addr, p_buffer_size);

		kd_bufs[i].kdsb_size = p_buffer_size;
	}

	/*
	 * Thread every storage unit onto the control page's free list.  Each
	 * unit's kds_next records the previous list head (buffer index +
	 * offset within that buffer), then the head is advanced to the unit.
	 */
	count_storage_units = 0;
	for (i = 0; i < kd_data_page->n_storage_buffer; i++) {
		struct kd_storage *kds;
		uint16_t n_elements;
		static_assert(N_STORAGE_UNITS_PER_BUFFER <= UINT16_MAX);
		assert(kd_bufs[i].kdsb_size <= N_STORAGE_UNITS_PER_BUFFER *
		    sizeof(struct kd_storage));

		n_elements = kd_bufs[i].kdsb_size / sizeof(struct kd_storage);
		kds = kd_bufs[i].kdsb_addr;

		for (uint16_t n = 0; n < n_elements; n++) {
			kds[n].kds_next.buffer_index = kd_ctrl_page->kds_free_list.buffer_index;
			kds[n].kds_next.offset = kd_ctrl_page->kds_free_list.offset;

			kd_ctrl_page->kds_free_list.buffer_index = i;
			kd_ctrl_page->kds_free_list.offset = n;
		}
		count_storage_units += n_elements;
	}

	/* Record the actual number of units created (partial buffer included). */
	kd_data_page->n_storage_units = count_storage_units;

	bzero((char *)kdbip, sizeof(struct kd_bufinfo) * ncpus);

	/* Every CPU starts with an empty in-use list and no lost events. */
	for (i = 0; i < ncpus; i++) {
		kdbip[i].kd_list_head.raw = KDS_PTR_NULL;
		kdbip[i].kd_list_tail.raw = KDS_PTR_NULL;
		kdbip[i].kd_lostevents = false;
		kdbip[i].num_bufs = 0;
	}

	kd_ctrl_page->kdebug_flags |= KDBG_BUFINIT;

	kd_ctrl_page->kds_inuse_count = 0;
out:
	if (error) {
		/* Tears down whatever was allocated before the failure. */
		delete_buffers(kd_ctrl_page, kd_data_page);
	}

	return error;
}
210 
211 void
delete_buffers(struct kd_ctrl_page_t * kd_ctrl_page,struct kd_data_page_t * kd_data_page)212 delete_buffers(struct kd_ctrl_page_t *kd_ctrl_page, struct kd_data_page_t *kd_data_page)
213 {
214 	unsigned int i;
215 	int n_storage_buffer = kd_data_page->n_storage_buffer;
216 
217 	struct kd_bufinfo *kdbip = kd_data_page->kdbip;
218 	struct kd_storage_buffers *kd_bufs = kd_data_page->kd_bufs;
219 
220 	if (kd_bufs) {
221 		for (i = 0; i < n_storage_buffer; i++) {
222 			if (kd_bufs[i].kdsb_addr) {
223 				kmem_free(kernel_map, (vm_offset_t)kd_bufs[i].kdsb_addr, (vm_size_t)kd_bufs[i].kdsb_size);
224 			}
225 		}
226 		kmem_free(kernel_map, (vm_offset_t)kd_bufs, (vm_size_t)(n_storage_buffer * sizeof(struct kd_storage_buffers)));
227 
228 		kd_data_page->kd_bufs = NULL;
229 		kd_data_page->n_storage_buffer = 0;
230 	}
231 	if (kd_data_page->kdcopybuf) {
232 		kmem_free(kernel_map, (vm_offset_t)kd_data_page->kdcopybuf, kd_ctrl_page->kdebug_kdcopybuf_size);
233 
234 		kd_data_page->kdcopybuf = NULL;
235 	}
236 	kd_ctrl_page->kds_free_list.raw = KDS_PTR_NULL;
237 
238 	if (kdbip) {
239 		kmem_free(kernel_map, (vm_offset_t)kdbip, sizeof(struct kd_bufinfo) * kd_ctrl_page->kdebug_cpus);
240 
241 		kd_data_page->kdbip = NULL;
242 	}
243 	kd_ctrl_page->kdebug_iops = NULL;
244 	kd_ctrl_page->kdebug_cpus = 0;
245 	kd_ctrl_page->kdebug_flags &= ~KDBG_BUFINIT;
246 }
247 
248 bool
allocate_storage_unit(struct kd_ctrl_page_t * kd_ctrl_page,struct kd_data_page_t * kd_data_page,int cpu)249 allocate_storage_unit(struct kd_ctrl_page_t *kd_ctrl_page, struct kd_data_page_t *kd_data_page, int cpu)
250 {
251 	union kds_ptr kdsp;
252 	struct kd_storage *kdsp_actual, *kdsp_next_actual;
253 	struct kd_bufinfo *kdbip, *kdbp, *kdbp_vict, *kdbp_try;
254 	uint64_t oldest_ts, ts;
255 	bool retval = true;
256 	struct kd_storage_buffers *kd_bufs;
257 
258 	int intrs_en = _storage_lock(kd_ctrl_page);
259 
260 	kdbp = &kd_data_page->kdbip[cpu];
261 	kd_bufs = kd_data_page->kd_bufs;
262 	kdbip = kd_data_page->kdbip;
263 
264 	/* If someone beat us to the allocate, return success */
265 	if (kdbp->kd_list_tail.raw != KDS_PTR_NULL) {
266 		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp->kd_list_tail);
267 
268 		if (kdsp_actual->kds_bufindx < kd_ctrl_page->kdebug_events_per_storage_unit) {
269 			goto out;
270 		}
271 	}
272 
273 	if ((kdsp = kd_ctrl_page->kds_free_list).raw != KDS_PTR_NULL) {
274 		/*
275 		 * If there's a free page, grab it from the free list.
276 		 */
277 		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
278 		kd_ctrl_page->kds_free_list = kdsp_actual->kds_next;
279 
280 		kd_ctrl_page->kds_inuse_count++;
281 	} else {
282 		/*
283 		 * Otherwise, we're going to lose events and repurpose the oldest
284 		 * storage unit we can find.
285 		 */
286 		if (kd_ctrl_page->kdebug_flags & KDBG_NOWRAP) {
287 			kd_ctrl_page->kdebug_slowcheck |= SLOW_NOLOG;
288 			kdbp->kd_lostevents = true;
289 			retval = false;
290 			goto out;
291 		}
292 		kdbp_vict = NULL;
293 		oldest_ts = UINT64_MAX;
294 
295 		for (kdbp_try = &kdbip[0]; kdbp_try < &kdbip[kd_ctrl_page->kdebug_cpus]; kdbp_try++) {
296 			if (kdbp_try->kd_list_head.raw == KDS_PTR_NULL) {
297 				/*
298 				 * no storage unit to steal
299 				 */
300 				continue;
301 			}
302 
303 			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp_try->kd_list_head);
304 
305 			if (kdsp_actual->kds_bufcnt < kd_ctrl_page->kdebug_events_per_storage_unit) {
306 				/*
307 				 * make sure we don't steal the storage unit
308 				 * being actively recorded to...  need to
309 				 * move on because we don't want an out-of-order
310 				 * set of events showing up later
311 				 */
312 				continue;
313 			}
314 
315 			/*
316 			 * When wrapping, steal the storage unit with the
317 			 * earliest timestamp on its last event, instead of the
318 			 * earliest timestamp on the first event.  This allows a
319 			 * storage unit with more recent events to be preserved,
320 			 * even if the storage unit contains events that are
321 			 * older than those found in other CPUs.
322 			 */
323 			ts = kdbg_get_timestamp(&kdsp_actual->kds_records[kd_ctrl_page->kdebug_events_per_storage_unit - 1]);
324 			if (ts < oldest_ts) {
325 				oldest_ts = ts;
326 				kdbp_vict = kdbp_try;
327 			}
328 		}
329 		if (kdbp_vict == NULL && kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
330 			kdebug_enable = 0;
331 			kd_ctrl_page->enabled = 0;
332 			commpage_update_kdebug_state();
333 			retval = false;
334 			goto out;
335 		}
336 		kdsp = kdbp_vict->kd_list_head;
337 		kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
338 		kdbp_vict->kd_list_head = kdsp_actual->kds_next;
339 
340 		if (kdbp_vict->kd_list_head.raw != KDS_PTR_NULL) {
341 			kdsp_next_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdbp_vict->kd_list_head);
342 			kdsp_next_actual->kds_lostevents = true;
343 		} else {
344 			kdbp_vict->kd_lostevents = true;
345 		}
346 
347 		if (kd_ctrl_page->oldest_time < oldest_ts) {
348 			kd_ctrl_page->oldest_time = oldest_ts;
349 		}
350 		kd_ctrl_page->kdebug_flags |= KDBG_WRAPPED;
351 	}
352 
353 	if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
354 		kdsp_actual->kds_timestamp = kdebug_timestamp();
355 	} else {
356 		kdsp_actual->kds_timestamp = mach_continuous_time();
357 	}
358 
359 	kdsp_actual->kds_next.raw = KDS_PTR_NULL;
360 	kdsp_actual->kds_bufcnt   = 0;
361 	kdsp_actual->kds_readlast = 0;
362 
363 	kdsp_actual->kds_lostevents = kdbp->kd_lostevents;
364 	kdbp->kd_lostevents = false;
365 	kdsp_actual->kds_bufindx = 0;
366 
367 	if (kdbp->kd_list_head.raw == KDS_PTR_NULL) {
368 		kdbp->kd_list_head = kdsp;
369 	} else {
370 		POINTER_FROM_KDS_PTR(kd_bufs, kdbp->kd_list_tail)->kds_next = kdsp;
371 	}
372 	kdbp->kd_list_tail = kdsp;
373 out:
374 	_storage_unlock(kd_ctrl_page, intrs_en);
375 
376 	return retval;
377 }
378 
379 void
release_storage_unit(struct kd_ctrl_page_t * kd_ctrl_page,struct kd_data_page_t * kd_data_page,int cpu,uint32_t kdsp_raw)380 release_storage_unit(struct kd_ctrl_page_t *kd_ctrl_page, struct kd_data_page_t *kd_data_page, int cpu, uint32_t kdsp_raw)
381 {
382 	struct  kd_storage *kdsp_actual;
383 	struct kd_bufinfo *kdbp;
384 	union kds_ptr kdsp;
385 
386 	kdbp = &kd_data_page->kdbip[cpu];
387 
388 	kdsp.raw = kdsp_raw;
389 
390 	int intrs_en = _storage_lock(kd_ctrl_page);
391 
392 	if (kdsp.raw == kdbp->kd_list_head.raw) {
393 		/*
394 		 * it's possible for the storage unit pointed to
395 		 * by kdsp to have already been stolen... so
396 		 * check to see if it's still the head of the list
397 		 * now that we're behind the lock that protects
398 		 * adding and removing from the queue...
399 		 * since we only ever release and steal units from
400 		 * that position, if it's no longer the head
401 		 * we having nothing to do in this context
402 		 */
403 		kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kdsp);
404 		kdbp->kd_list_head = kdsp_actual->kds_next;
405 
406 		kdsp_actual->kds_next = kd_ctrl_page->kds_free_list;
407 		kd_ctrl_page->kds_free_list = kdsp;
408 
409 		kd_ctrl_page->kds_inuse_count--;
410 	}
411 
412 	_storage_unlock(kd_ctrl_page, intrs_en);
413 }
414 
415 
416 /*
417  * Disable wrapping and return true if trace wrapped, false otherwise.
418  */
419 bool
disable_wrap(struct kd_ctrl_page_t * kd_ctrl_page,uint32_t * old_slowcheck,uint32_t * old_flags)420 disable_wrap(struct kd_ctrl_page_t *kd_ctrl_page, uint32_t *old_slowcheck, uint32_t *old_flags)
421 {
422 	bool wrapped;
423 	int intrs_en = _storage_lock(kd_ctrl_page);
424 
425 	*old_slowcheck = kd_ctrl_page->kdebug_slowcheck;
426 	*old_flags = kd_ctrl_page->kdebug_flags;
427 
428 	wrapped = kd_ctrl_page->kdebug_flags & KDBG_WRAPPED;
429 	kd_ctrl_page->kdebug_flags &= ~KDBG_WRAPPED;
430 	kd_ctrl_page->kdebug_flags |= KDBG_NOWRAP;
431 
432 	_storage_unlock(kd_ctrl_page, intrs_en);
433 
434 	return wrapped;
435 }
436 
437 void
enable_wrap(struct kd_ctrl_page_t * kd_ctrl_page,uint32_t old_slowcheck)438 enable_wrap(struct kd_ctrl_page_t *kd_ctrl_page, uint32_t old_slowcheck)
439 {
440 	int intrs_en = _storage_lock(kd_ctrl_page);
441 
442 	kd_ctrl_page->kdebug_flags &= ~KDBG_NOWRAP;
443 
444 	if (!(old_slowcheck & SLOW_NOLOG)) {
445 		kd_ctrl_page->kdebug_slowcheck &= ~SLOW_NOLOG;
446 	}
447 	_storage_unlock(kd_ctrl_page, intrs_en);
448 }
449 
/*
 * Record one event into the per-CPU storage of the given kdebug client.
 *
 * Lock-free on the fast path: a slot is claimed by compare-and-swapping the
 * tail unit's kds_bufindx; allocate_storage_unit() (which takes the storage
 * lock) is only called when the tail unit is missing or full.  Runs with
 * preemption disabled so `cpu` stays meaningful throughout.
 *
 * kd_rec.cpu == -1 means "the current CPU"; kd_rec.timestamp == -1 means
 * "stamp with the current time" (IOPs pass pre-made timestamps instead).
 */
__attribute__((always_inline))
void
kernel_debug_write(struct kd_ctrl_page_t *kd_ctrl_page,
    struct kd_data_page_t *kd_data_page,
    struct kd_record      kd_rec)
{
	uint64_t now = 0;
	uint32_t bindx;
	kd_buf *kd;
	int cpu;
	struct kd_bufinfo *kdbp;
	struct kd_storage *kdsp_actual;
	union kds_ptr kds_raw;

	disable_preemption();

	if (kd_ctrl_page->enabled == 0) {
		goto out;
	}

	if (kd_rec.cpu == -1) {
		cpu = cpu_number();
	} else {
		cpu = kd_rec.cpu;
	}

	kdbp = &kd_data_page->kdbip[cpu];

	bool timestamp_is_continuous = kdbp->continuous_timestamps;

	if (kd_rec.timestamp != -1) {
		/*
		 * Convert a caller-supplied timestamp into the clock domain the
		 * buffers are using (continuous vs. absolute time).
		 */
		if (kdebug_using_continuous_time()) {
			if (!timestamp_is_continuous) {
				kd_rec.timestamp = absolutetime_to_continuoustime(kd_rec.timestamp);
			}
		} else {
			if (timestamp_is_continuous) {
				kd_rec.timestamp = continuoustime_to_absolutetime(kd_rec.timestamp);
			}
		}

		/* Drop events older than anything still retained in the buffers. */
		if (kd_rec.timestamp < kd_ctrl_page->oldest_time) {
			goto out;
		}

		kd_rec.timestamp &= KDBG_TIMESTAMP_MASK;
	}

retry_q:
	kds_raw = kdbp->kd_list_tail;

	if (kds_raw.raw != KDS_PTR_NULL) {
		kdsp_actual = POINTER_FROM_KDS_PTR(kd_data_page->kd_bufs, kds_raw);
		bindx = kdsp_actual->kds_bufindx;
	} else {
		/* No tail unit yet; force the allocation path below. */
		kdsp_actual = NULL;
		bindx = kd_ctrl_page->kdebug_events_per_storage_unit;
	}

	if (kdsp_actual == NULL || bindx >= kd_ctrl_page->kdebug_events_per_storage_unit) {
		if (allocate_storage_unit(kd_ctrl_page, kd_data_page, cpu) == false) {
			/*
			 * this can only happen if wrapping
			 * has been disabled
			 */
			goto out;
		}
		goto retry_q;
	}

	if (kd_rec.timestamp != -1) {
		/*
		 * IOP entries can be allocated before xnu allocates and inits the buffer
		 * And, Intel uses a special 0 value as a early tracing timestamp sentinel
		 * to set the start of trace-time-start-of-interest.
		 */
		if (kd_rec.timestamp < kdsp_actual->kds_timestamp) {
			kdsp_actual->kds_timestamp = kd_rec.timestamp;
		}
		now = kd_rec.timestamp;
	} else {
		/* TRACE honors the configured clock; TRIAGE is always continuous. */
		if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
			now = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
		} else {
			now = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
		}
	}

	/* Claim slot `bindx`; on contention, re-read the tail and try again. */
	if (!OSCompareAndSwap(bindx, bindx + 1, &kdsp_actual->kds_bufindx)) {
		goto retry_q;
	}

	kd = &kdsp_actual->kds_records[bindx];

	if (kd_ctrl_page->kdebug_flags & KDBG_DEBUGID_64) {
		/*DebugID has been passed in arg 4*/
		kd->debugid = 0;
	} else {
		kd->debugid = kd_rec.debugid;
	}

	kd->arg1 = kd_rec.arg1;
	kd->arg2 = kd_rec.arg2;
	kd->arg3 = kd_rec.arg3;
	kd->arg4 = kd_rec.arg4;
	kd->arg5 = kd_rec.arg5; /* thread ID */

	kdbg_set_timestamp_and_cpu(kd, now, cpu);

	/* Publish the filled slot to readers (bufcnt trails bufindx). */
	OSAddAtomic(1, &kdsp_actual->kds_bufcnt);

out:
	enable_preemption();
}
564 
565 /*
566  * This is a general purpose routine to read the kd_bufs from the storage units.
567  * So it is the responsibility of the caller to make sure they have acquired
568  * the right synchronization primitive so they can retrieve the data in the correct
569  * way.
570  */
/*
 * This is a general purpose routine to read the kd_bufs from the storage units.
 * So it is the responsibility of the caller to make sure they have acquired
 * the right synchronization primitive so they can retrieve the data in the correct
 * way.
 *
 * Merges events from every CPU's storage units in timestamp order into the
 * staging buffer (kdcopybuf), then delivers them either to a file (TRACE
 * mode, via kernel_debug_trace_write_to_file) or to a kernel buffer (TRIAGE
 * mode, direct memcpy -- note `buffer` is treated as a kernel pointer there).
 *
 * On entry *number is the requested byte count; on return it holds the
 * number of kd_buf events produced.  Returns 0 or an errno.
 */
int
kernel_debug_read(struct kd_ctrl_page_t *kd_ctrl_page, struct kd_data_page_t *kd_data_page, user_addr_t buffer, size_t *number, vnode_t vp, vfs_context_t ctx, uint32_t file_version)
{
	size_t count;
	unsigned int cpu, min_cpu;
	uint64_t barrier_min = 0, barrier_max = 0, t, earliest_time;
	int error = 0;
	kd_buf *tempbuf;
	uint32_t rcursor;
	kd_buf lostevent;
	union kds_ptr kdsp;
	bool traced_retrograde = false;
	struct kd_storage *kdsp_actual;
	struct kd_bufinfo *kdbp;
	struct kd_bufinfo *min_kdbp;
	size_t tempbuf_count;
	uint32_t tempbuf_number;
	uint32_t old_kdebug_flags;
	uint32_t old_kdebug_slowcheck;
	bool out_of_events = false;
	bool wrapped = false, set_preempt = true;

	struct kd_bufinfo *kdbip = kd_data_page->kdbip;
	struct kd_storage_buffers *kd_bufs = kd_data_page->kd_bufs;

	assert(number != NULL);
	count = *number / sizeof(kd_buf);
	*number = 0;

	/* Nothing to do without initialized buffers and a staging area. */
	if (count == 0 || !(kd_ctrl_page->kdebug_flags & KDBG_BUFINIT) || kd_data_page->kdcopybuf == 0) {
		return EINVAL;
	}

	if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
		/*
		 * A corpse can be created due to 'TASK_HAS_TOO_MANY_THREADS'
		 * and that can be handled by a callout thread that already
		 * has the eager-preemption set.
		 * So check to see if we are dealing with one such thread.
		 */
		set_preempt = !(thread_is_eager_preempt(current_thread()));
	}

	if (set_preempt) {
		thread_set_eager_preempt(current_thread());
	}

	memset(&lostevent, 0, sizeof(lostevent));
	lostevent.debugid = TRACE_LOST_EVENTS;

	/*
	 * Capture the current time.  Only sort events that have occured
	 * before now.  Since the IOPs are being flushed here, it is possible
	 * that events occur on the AP while running live tracing.
	 */
	if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
		barrier_max = kdebug_timestamp() & KDBG_TIMESTAMP_MASK;
	} else {
		barrier_max = mach_continuous_time() & KDBG_TIMESTAMP_MASK;
	}

	/*
	 * Disable wrap so storage units cannot be stolen out from underneath us
	 * while merging events.
	 *
	 * Because we hold ktrace_lock, no other control threads can be playing
	 * with kdebug_flags.  The code that emits new events could be running,
	 * but it grabs kds_spin_lock if it needs to acquire a new storage
	 * chunk, which is where it examines kdebug_flags.  If it is adding to
	 * the same chunk we're reading from, check for that below.
	 */
	wrapped = disable_wrap(kd_ctrl_page, &old_kdebug_slowcheck, &old_kdebug_flags);

	if (count > kd_data_page->nkdbufs) {
		count = kd_data_page->nkdbufs;
	}

	/* Merge at most one staging buffer's worth of events per pass. */
	if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
		tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
	}

	/*
	 * If the buffers have wrapped, do not emit additional lost events for the
	 * oldest storage units.
	 */
	if (wrapped) {
		kd_ctrl_page->kdebug_flags &= ~KDBG_WRAPPED;

		for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
			if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
				continue;
			}
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
			kdsp_actual->kds_lostevents = false;
		}
	}

	if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
		/*
		 * In TRIAGE mode we want to extract all the current
		 * records regardless of where we stopped reading last
		 * time so that we have the best shot at getting older
		 * records for threads before the buffers are wrapped.
		 * So set readlast to 0 to initiate the search from the
		 * 1st record.
		 */
		for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
			if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
				continue;
			}
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
			kdsp_actual->kds_readlast = 0;
		}
	}

	/*
	 * Capture the earliest time where there are events for all CPUs and don't
	 * emit events with timestamps prior.
	 */
	barrier_min = kd_ctrl_page->oldest_time;

	while (count) {
		tempbuf = kd_data_page->kdcopybuf;
		tempbuf_number = 0;

		if (wrapped) {
			/*
			 * Emit a lost events tracepoint to indicate that previous events
			 * were lost -- the thread map cannot be trusted.  A new one must
			 * be taken so tools can analyze the trace in a backwards-facing
			 * fashion.
			 */
			kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, 0);
			*tempbuf = lostevent;
			wrapped = false;
			goto nextevent;
		}

		/* While space left in merged events scratch buffer. */
		while (tempbuf_count) {
			bool lostevents = false;
			int lostcpu = 0;
			earliest_time = UINT64_MAX;
			min_kdbp = NULL;
			min_cpu = 0;

			/* Check each CPU's buffers for the earliest event. */
			for (cpu = 0, kdbp = &kdbip[0]; cpu < kd_ctrl_page->kdebug_cpus; cpu++, kdbp++) {
				/* Skip CPUs without data in their oldest storage unit. */
				if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
next_cpu:
					continue;
				}
				/* From CPU data to buffer header to buffer. */
				kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);

next_event:
				/* The next event to be read from this buffer. */
				rcursor = kdsp_actual->kds_readlast;

				/* Skip this buffer if there are no events left. */
				if (rcursor == kdsp_actual->kds_bufindx) {
					continue;
				}

				if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
					/*
					 * TRIAGE mode record keeping doesn't (currently)
					 * use lostevent markers. It also doesn't want to
					 * call release_storage_unit() in this read call.
					 * It expects the buffers to wrap and records reclaimed
					 * in that way solely.
					 */
					t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);
					goto skip_record_checks;
				}

				/*
				 * Check that this storage unit wasn't stolen and events were
				 * lost.  This must have happened while wrapping was disabled
				 * in this function.
				 */
				if (kdsp_actual->kds_lostevents) {
					lostevents = true;
					kdsp_actual->kds_lostevents = false;

					/*
					 * The earliest event we can trust is the first one in this
					 * stolen storage unit.
					 */
					uint64_t lost_time =
					    kdbg_get_timestamp(&kdsp_actual->kds_records[0]);
					if (kd_ctrl_page->oldest_time < lost_time) {
						/*
						 * If this is the first time we've seen lost events for
						 * this gap, record its timestamp as the oldest
						 * timestamp we're willing to merge for the lost events
						 * tracepoint.
						 */
						kd_ctrl_page->oldest_time = barrier_min = lost_time;
						lostcpu = cpu;
					}
				}

				t = kdbg_get_timestamp(&kdsp_actual->kds_records[rcursor]);

				if (t > barrier_max) {
					/* Event newer than the snapshot time; leave it for later. */
					if (kdbg_debug) {
						printf("kdebug: FUTURE EVENT: debugid %#8x: "
						    "time %lld from CPU %u "
						    "(barrier at time %lld, read %lu events)\n",
						    kdsp_actual->kds_records[rcursor].debugid,
						    t, cpu, barrier_max, *number + tempbuf_number);
					}
					goto next_cpu;
				}
				if (t < kdsp_actual->kds_timestamp) {
					/*
					 * This indicates the event emitter hasn't completed
					 * filling in the event (becuase we're looking at the
					 * buffer that the record head is using).  The max barrier
					 * timestamp should have saved us from seeing these kinds
					 * of things, but other CPUs might be slow on the up-take.
					 *
					 * Bail out so we don't get out-of-order events by
					 * continuing to read events from other CPUs' events.
					 */
					out_of_events = true;
					break;
				}

				/*
				 * Ignore events that have aged out due to wrapping or storage
				 * unit exhaustion while merging events.
				 */
				if (t < barrier_min) {
					kdsp_actual->kds_readlast++;
					if (kdbg_debug) {
						printf("kdebug: PAST EVENT: debugid %#8x: "
						    "time %lld from CPU %u "
						    "(barrier at time %lld)\n",
						    kdsp_actual->kds_records[rcursor].debugid,
						    t, cpu, barrier_min);
					}

					if (kdsp_actual->kds_readlast >= kd_ctrl_page->kdebug_events_per_storage_unit) {
						release_storage_unit(kd_ctrl_page, kd_data_page, cpu, kdsp.raw);

						if ((kdsp = kdbp->kd_list_head).raw == KDS_PTR_NULL) {
							goto next_cpu;
						}
						kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);
					}
					goto next_event;
				}

				/*
				 * Don't worry about merging any events -- just walk through
				 * the CPUs and find the latest timestamp of lost events.
				 */
				if (lostevents) {
					continue;
				}
skip_record_checks:
				/* Track the CPU holding the earliest mergeable event. */
				if (t < earliest_time) {
					earliest_time = t;
					min_kdbp = kdbp;
					min_cpu = cpu;
				}
			}
			if (lostevents) {
				/*
				 * If any lost events were hit in the buffers, emit an event
				 * with the latest timestamp.
				 */
				kdbg_set_timestamp_and_cpu(&lostevent, barrier_min, lostcpu);
				*tempbuf = lostevent;
				tempbuf->arg1 = 1;
				goto nextevent;
			}
			if (min_kdbp == NULL) {
				/* All buffers ran empty. */
				out_of_events = true;
			}
			if (out_of_events) {
				break;
			}

			kdsp = min_kdbp->kd_list_head;
			kdsp_actual = POINTER_FROM_KDS_PTR(kd_bufs, kdsp);

			/* Copy earliest event into merged events scratch buffer. */
			*tempbuf = kdsp_actual->kds_records[kdsp_actual->kds_readlast++];

			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
				if (kdsp_actual->kds_readlast == kd_ctrl_page->kdebug_events_per_storage_unit) {
					release_storage_unit(kd_ctrl_page, kd_data_page, min_cpu, kdsp.raw);
				}
			}

			/*
			 * Watch for out of order timestamps (from IOPs).
			 */
			if (earliest_time < min_kdbp->kd_prev_timebase) {
				/*
				 * If we haven't already, emit a retrograde events event.
				 * Otherwise, ignore this event.
				 */
				if (traced_retrograde) {
					continue;
				}
				if (kdbg_debug) {
					printf("kdebug: RETRO EVENT: debugid %#8x: "
					    "time %lld from CPU %u "
					    "(barrier at time %lld)\n",
					    kdsp_actual->kds_records[rcursor].debugid,
					    t, cpu, barrier_min);
				}

				/* Rewrite the copied event in place as a retrograde marker. */
				kdbg_set_timestamp_and_cpu(tempbuf, min_kdbp->kd_prev_timebase, kdbg_get_cpu(tempbuf));
				tempbuf->arg1 = tempbuf->debugid;
				tempbuf->arg2 = (kd_buf_argtype)earliest_time;
				tempbuf->arg3 = 0;
				tempbuf->arg4 = 0;
				tempbuf->debugid = TRACE_RETROGRADE_EVENTS;
				traced_retrograde = true;
			} else {
				min_kdbp->kd_prev_timebase = earliest_time;
			}
nextevent:
			tempbuf_count--;
			tempbuf_number++;
			tempbuf++;

			/* Flush early once enough has been written to the file. */
			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE && (RAW_file_written += sizeof(kd_buf)) >= RAW_FLUSH_SIZE) {
				break;
			}
		}

		if (tempbuf_number) {
			/*
			 * Remember the latest timestamp of events that we've merged so we
			 * don't think we've lost events later.
			 */
			uint64_t latest_time = kdbg_get_timestamp(tempbuf - 1);
			if (kd_ctrl_page->oldest_time < latest_time) {
				kd_ctrl_page->oldest_time = latest_time;
			}

			if (kd_ctrl_page->mode == KDEBUG_MODE_TRACE) {
				error = kernel_debug_trace_write_to_file(&buffer, number, &count, tempbuf_number, vp, ctx, file_version);
				if (error) {
					goto check_error;
				}
			} else if (kd_ctrl_page->mode == KDEBUG_MODE_TRIAGE) {
				/* TRIAGE delivers into a kernel buffer, not user space. */
				memcpy((void*)buffer, kd_data_page->kdcopybuf, tempbuf_number * sizeof(kd_buf));

				buffer += (tempbuf_number * sizeof(kd_buf));
			} else {
				panic("kernel_debug_read: Invalid Mode");
			}
check_error:
			if (error) {
				*number = 0;
				error = EINVAL;
				break;
			}
			count   -= tempbuf_number;
			*number += tempbuf_number;
		}
		if (out_of_events == true) {
			/*
			 * all trace buffers are empty
			 */
			break;
		}

		if ((tempbuf_count = count) > kd_ctrl_page->kdebug_kdcopybuf_count) {
			tempbuf_count = kd_ctrl_page->kdebug_kdcopybuf_count;
		}
	}
	if (!(old_kdebug_flags & KDBG_NOWRAP)) {
		enable_wrap(kd_ctrl_page, old_kdebug_slowcheck);
	}

	if (set_preempt) {
		thread_clear_eager_preempt(current_thread());
	}

	return error;
}
962