xref: /xnu-12377.41.6/osfmk/kperf/pet.c (revision bbb1b6f9e71b8cdde6e5cd6f4841f207dee3d828)
1 /*
2  * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 /*
30  * Profile Every Thread (PET) provides a profile of all threads on the system
31  * when a timer fires.  PET supports the "record waiting threads" mode in
32  * Instruments, and used to be called All Thread States (ATS).  New tools should
33  * adopt the lightweight PET mode, which provides the same information, but with
34  * much less overhead.
35  *
36  * When traditional (non-lightweight) PET is active, a migrating timer call
37  * causes the PET thread to wake up.  The timer handler also issues a broadcast
38  * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core
39  * samples.  This is provided for backwards-compatibility with clients that
40  * expect on-core samples, when PET's timer was based off the on-core timers.
41  * Because PET sampling can take on the order of milliseconds, the PET thread
42  * will enter a new timer deadline after it finished sampling This perturbs the
43  * timer cadence by the duration of PET sampling, but it leaves the system to
44  * work on non-profiling tasks for the duration of the timer period.
45  *
46  * Lightweight PET samples the system less-intrusively than normal PET
47  * mode.  Instead of iterating tasks and threads on each sample, it checks the
48  * current time as threads are context switched on-core.  If the thread's local
49  * generation count is older than a sampling timer would have incremented a global
50  * generation count, the thread samples itself.
51  *
52  *            |  |
53  * thread A   +--+---------|
54  *            |  |
55  * thread B   |--+---------------|
56  *            |  |
57  * thread C   |  |         |-------------------------------------
58  *            |  |         |
59  * thread D   |  |         |     |-------------------------------
60  *            |  |         |     |
61  *            +--+---------+-----+--------------------------------> time
62  *               |         |     |
63  *               |         +-----+--- threads sampled when they come on-core in
64  *               |                    kperf_pet_switch_context
65  *               |
66  *               +--- PET timer would have fired
67  */
68 
69 #include <mach/mach_types.h>
70 #include <sys/errno.h>
71 
72 #include <kperf/kperf.h>
73 #include <kperf/buffer.h>
74 #include <kperf/sample.h>
75 #include <kperf/context.h>
76 #include <kperf/action.h>
77 #include <kperf/pet.h>
78 #include <kperf/kptimer.h>
79 
80 #include <kern/task.h>
81 #include <kern/kalloc.h>
82 #include <os/atomic_private.h>
83 #if defined(__x86_64__)
84 #include <i386/mp.h>
85 #endif /* defined(__x86_64__) */
86 
/* Serializes PET configuration against the PET sampling thread. */
static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

static struct {
	/* kperf action to run when sampling; 0 means PET is inactive. */
	unsigned int g_actionid;
	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	/* Whether the PET thread has been created (see kppet_setup). */
	bool g_setup:1;
	/* Whether lightweight PET mode is selected. */
	bool g_lightweight:1;
	/* Sampling period; 0 makes kppet_current_gen report generation 0. */
	uint64_t g_period;
	/* Sample buffer used by the PET thread (traditional mode). */
	struct kperf_sample *g_sample;

	/* The dedicated PET sampling thread, started lazily. */
	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_count;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_count;
} kppet = {
	.g_actionid = 0,
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};
117 
/*
 * Non-zero while lightweight PET is active; checked on the context-switch
 * path.  Set in kppet_lightweight_active_update.
 */
uint64_t kppet_lightweight_start_time = 0;

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);
122 
/* Assert that the current thread holds the PET mutex. */
static void
kppet_lock_assert_owned(void)
{
	lck_mtx_assert(&kppet_mtx, LCK_MTX_ASSERT_OWNED);
}
128 
/* Acquire the PET mutex. */
static void
kppet_lock(void)
{
	lck_mtx_lock(&kppet_mtx);
}
134 
/* Release the PET mutex. */
static void
kppet_unlock(void)
{
	lck_mtx_unlock(&kppet_mtx);
}
140 
/*
 * Set the PET sampling period (absolute-time units per kppet_current_gen's
 * use of mach_continuous_time).  A period of 0 disables generation counting.
 */
void
kppet_set_period(uint64_t period)
{
	kppet.g_period = period;
}
146 
147 static uint32_t
kppet_current_gen(void)148 kppet_current_gen(void)
149 {
150 	/*
151 	 * Don't worry too much about the memory model here.
152 	 * The timers starting up issues a broadcast cross-call.
153 	 * And the period/start time won't change while the timers are active.
154 	 */
155 	uint64_t period = os_atomic_load(&kppet.g_period, relaxed);
156 	if (period == 0) {
157 		return 0;
158 	}
159 	uint64_t start_time = os_atomic_load(&kppet_lightweight_start_time, relaxed);
160 	return (uint32_t)((mach_continuous_time() - start_time) / period);
161 }
162 
/*
 * Record that `thread` has been sampled in the current generation, so
 * kppet_on_cpu will skip it until the next generation.
 */
void
kppet_mark_sampled(thread_t thread)
{
	thread->kperf_pet_gen = kppet_current_gen();
}
168 
/*
 * Lightweight PET hook, called from the scheduler as `thread` comes on-core
 * with interrupts disabled.  If the thread has not yet been sampled in the
 * current generation, sample it here.
 */
void
kppet_on_cpu(thread_t thread, thread_continue_t continuation,
    uintptr_t *starting_fp)
{
	assert(thread != NULL);
	assert(ml_get_interrupts_enabled() == FALSE);

	uint32_t actionid = kppet.g_actionid;
	if (actionid == 0) {
		/* PET is not configured -- nothing to do. */
		return;
	}
	uint32_t sample_gen = kppet_current_gen();

	/*
	 * Has to match exactly to skip sampling.
	 */
	if (thread->kperf_pet_gen != sample_gen) {
		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START, sample_gen, thread->kperf_pet_gen,
		    kppet_lightweight_start_time, kppet.g_period);

		task_t task = get_threadtask(thread);
		struct kperf_context ctx = {
			.cur_thread = thread,
			.cur_task = task,
			.cur_pid = task_pid(task),
			.starting_fp = starting_fp,
		};
		/*
		 * Use a per-CPU interrupt buffer, since this is only called
		 * while interrupts are disabled, from the scheduler.
		 */
		struct kperf_sample *sample = kperf_intr_sample_buffer();
		if (!sample) {
			BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1);
			return;
		}

		unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER;
		if (continuation != NULL) {
			/* Thread is resuming a continuation; note it in the sample. */
			flags |= SAMPLE_FLAG_CONTINUATION;
		}
		kperf_sample(sample, &ctx, actionid, flags);

		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
	} else {
		/* Already sampled this generation -- just leave a trace event. */
		BUF_VERB(PERF_PET_SAMPLE_THREAD, sample_gen, thread->kperf_pet_gen,
		    kppet_lightweight_start_time, kppet.g_period);
	}
}
218 
219 #pragma mark - state transitions
220 
221 /*
222  * Lazily initialize PET.  The PET thread never exits once PET has been used
223  * once.
224  */
225 static void
kppet_setup(void)226 kppet_setup(void)
227 {
228 	if (kppet.g_setup) {
229 		return;
230 	}
231 
232 	kern_return_t kr = kernel_thread_start(kppet_thread, NULL,
233 	    &kppet.g_sample_thread);
234 	if (kr != KERN_SUCCESS) {
235 		panic("kperf: failed to create PET thread %d", kr);
236 	}
237 
238 	thread_set_thread_name(kppet.g_sample_thread, "kperf-pet-sampling");
239 	kppet.g_setup = true;
240 }
241 
242 void
kppet_config(unsigned int actionid)243 kppet_config(unsigned int actionid)
244 {
245 	/*
246 	 * Resetting kperf shouldn't get the PET thread started.
247 	 */
248 	if (actionid == 0 && !kppet.g_setup) {
249 		return;
250 	}
251 
252 	kppet_setup();
253 
254 	kppet_lock();
255 
256 	kppet.g_actionid = actionid;
257 
258 	if (actionid > 0) {
259 		if (!kppet.g_sample) {
260 			kppet.g_sample = kalloc_type_tag(struct kperf_sample,
261 			    Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG);
262 			kppet.g_sample->usample.usample_min = kalloc_type_tag(
263 				struct kperf_usample_min, Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG);
264 		}
265 	} else {
266 		if (kppet.g_tasks) {
267 			assert(kppet.g_tasks_count != 0);
268 			kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
269 			kppet.g_tasks = NULL;
270 			kppet.g_tasks_count = 0;
271 			kppet.g_ntasks = 0;
272 		}
273 		if (kppet.g_threads) {
274 			assert(kppet.g_threads_count != 0);
275 			void *g_tasks = (void *)kppet.g_tasks;
276 			kfree_type(thread_t, kppet.g_threads_count, g_tasks);
277 			kppet.g_tasks = NULL;
278 			kppet.g_threads = NULL;
279 			kppet.g_threads_count = 0;
280 			kppet.g_nthreads = 0;
281 		}
282 		if (kppet.g_sample != NULL) {
283 			kfree_type(struct kperf_usample_min,
284 			    kppet.g_sample->usample.usample_min);
285 			kfree_type(struct kperf_sample, kppet.g_sample);
286 		}
287 	}
288 
289 	kppet_unlock();
290 }
291 
/*
 * Restore PET to its default, inactive state: no action, default idle rate,
 * lightweight mode off.  Deactivation must come first so the mode changes
 * see PET as idle.
 */
void
kppet_reset(void)
{
	kppet_config(0);
	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
	kppet_set_lightweight_pet(0);
}
299 
/* Wake the PET sampling thread, which sleeps on &kppet in kppet_thread. */
void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}
305 
/*
 * Body of the PET sampling thread: sleep until woken with an action
 * configured, sample all tasks, then re-arm the PET timer offset by the
 * time sampling took.  Never returns; holds the PET mutex except while
 * sleeping.
 */
__attribute__((noreturn))
static void
kppet_thread(void * __unused param, wait_result_t __unused wr)
{
	kppet_lock();

	for (;;) {
		BUF_INFO(PERF_PET_IDLE);

		/* Drops and re-takes the mutex; re-check the action on wakeup. */
		do {
			(void)lck_mtx_sleep(&kppet_mtx, LCK_SLEEP_DEFAULT, &kppet,
			    THREAD_UNINT);
		} while (kppet.g_actionid == 0);

		BUF_INFO(PERF_PET_RUN);

		uint64_t sampledur_abs = kppet_sample_tasks(kppet.g_idle_rate);

		/* Push the next deadline out past the sampling duration. */
		kptimer_pet_enter(sampledur_abs);
	}
}
327 
328 #pragma mark - sampling
329 
/*
 * Sample a single thread of `task` for the configured action, possibly with
 * an empty callstack if the thread has been idle (per `idle_rate`).
 * Called with the PET mutex held.
 */
static void
kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
{
	kppet_lock_assert_owned();

	uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS |
	    SAMPLE_FLAG_THREAD_ONLY;

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);

	struct kperf_context ctx = {
		.cur_thread = thread,
		.cur_task = task,
		.cur_pid = pid,
	};

	boolean_t thread_dirty = kperf_thread_get_dirty(thread);

	/*
	 * Clean a dirty thread and skip callstack sample if the thread was not
	 * dirty and thread had skipped less than `idle_rate` samples.
	 */
	if (thread_dirty) {
		kperf_thread_set_dirty(thread, FALSE);
	} else if ((thread->kperf_pet_cnt % idle_rate) != 0) {
		sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK;
	}
	thread->kperf_pet_cnt++;

	/* Kernel-context sample followed by the user-space portion. */
	kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid, sample_flags);
	kperf_sample_user(&kppet.g_sample->usample, &ctx, kppet.g_actionid,
	    sample_flags);

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
}
365 
/*
 * Snapshot references to all threads of `task` into kppet.g_threads,
 * growing the array as needed.  Returns KERN_FAILURE if the task is
 * inactive or has no threads, KERN_RESOURCE_SHORTAGE if allocation fails.
 * Called with the PET mutex held; takes and drops the task lock.
 */
static kern_return_t
kppet_threads_prepare(task_t task)
{
	kppet_lock_assert_owned();

	vm_size_t count_needed;

	for (;;) {
		task_lock(task);

		if (!task->active) {
			task_unlock(task);
			return KERN_FAILURE;
		}

		/*
		 * With the task locked, figure out if enough space has been allocated to
		 * contain all of the thread references.
		 */
		count_needed = task->thread_count;
		if (count_needed <= kppet.g_threads_count) {
			break;
		}

		/*
		 * Otherwise, allocate more and try again.
		 */
		task_unlock(task);

		kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads);

		assert(count_needed > 0);
		kppet.g_threads_count = count_needed;

		kppet.g_threads = kalloc_type_tag(thread_t, kppet.g_threads_count,
		    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
		if (kppet.g_threads == NULL) {
			kppet.g_threads_count = 0;
			return KERN_RESOURCE_SHORTAGE;
		}
	}

	/* Task is locked with sufficient space -- take thread references. */
	thread_t thread;
	kppet.g_nthreads = 0;
	queue_iterate(&(task->threads), thread, thread_t, task_threads) {
		thread_reference(thread);
		kppet.g_threads[kppet.g_nthreads++] = thread;
	}

	task_unlock(task);

	return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE;
}
419 
420 /*
421  * Sample a `task`, using `idle_rate` to control whether idle threads need to be
422  * re-sampled.
423  *
424  * The task must be referenced.
425  */
426 static void
kppet_sample_task(task_t task,uint32_t idle_rate)427 kppet_sample_task(task_t task, uint32_t idle_rate)
428 {
429 	kppet_lock_assert_owned();
430 	assert(task != kernel_task);
431 	if (task == kernel_task) {
432 		return;
433 	}
434 
435 	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);
436 
437 	int pid = task_pid(task);
438 	if (kperf_action_has_task(kppet.g_actionid)) {
439 		struct kperf_context ctx = {
440 			.cur_task = task,
441 			.cur_pid = pid,
442 		};
443 
444 		kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid,
445 		    SAMPLE_FLAG_TASK_ONLY);
446 	}
447 
448 	if (!kperf_action_has_thread(kppet.g_actionid)) {
449 		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
450 		return;
451 	}
452 
453 	/*
454 	 * Suspend the task to see an atomic snapshot of all its threads.  This
455 	 * is expensive and disruptive.
456 	 */
457 	kern_return_t kr = task_suspend_internal(task);
458 	if (kr != KERN_SUCCESS) {
459 		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
460 		return;
461 	}
462 
463 	kr = kppet_threads_prepare(task);
464 	if (kr != KERN_SUCCESS) {
465 		BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
466 		goto out;
467 	}
468 
469 	for (unsigned int i = 0; i < kppet.g_nthreads; i++) {
470 		thread_t thread = kppet.g_threads[i];
471 		assert(thread != THREAD_NULL);
472 
473 		kppet_sample_thread(pid, task, thread, idle_rate);
474 
475 		thread_deallocate(kppet.g_threads[i]);
476 	}
477 
478 out:
479 	task_resume_internal(task);
480 
481 	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads);
482 }
483 
484 /*
485  * Store and reference all tasks on the system, so they can be safely inspected
486  * outside the `tasks_threads_lock`.
487  */
488 static kern_return_t
kppet_tasks_prepare(void)489 kppet_tasks_prepare(void)
490 {
491 	kppet_lock_assert_owned();
492 
493 	vm_size_t count_needed = 0;
494 
495 	for (;;) {
496 		lck_mtx_lock(&tasks_threads_lock);
497 
498 		/*
499 		 * With the lock held, break out of the lock/unlock loop if
500 		 * there's enough space to store all the tasks.
501 		 */
502 		count_needed = tasks_count;
503 		if (count_needed <= kppet.g_tasks_count) {
504 			break;
505 		}
506 
507 		/*
508 		 * Otherwise, allocate more memory outside of the lock.
509 		 */
510 		lck_mtx_unlock(&tasks_threads_lock);
511 
512 		if (count_needed > kppet.g_tasks_count) {
513 			if (kppet.g_tasks_count != 0) {
514 				kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
515 			}
516 
517 			assert(count_needed > 0);
518 			kppet.g_tasks_count = count_needed;
519 
520 			kppet.g_tasks = kalloc_type_tag(task_t, kppet.g_tasks_count,
521 			    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
522 			if (!kppet.g_tasks) {
523 				kppet.g_tasks_count = 0;
524 				return KERN_RESOURCE_SHORTAGE;
525 			}
526 		}
527 	}
528 
529 	task_t task = TASK_NULL;
530 	kppet.g_ntasks = 0;
531 	queue_iterate(&tasks, task, task_t, tasks) {
532 		bool eligible_task = task != kernel_task;
533 		if (eligible_task) {
534 			task_reference(task);
535 			kppet.g_tasks[kppet.g_ntasks++] = task;
536 		}
537 	}
538 
539 	lck_mtx_unlock(&tasks_threads_lock);
540 
541 	return KERN_SUCCESS;
542 }
543 
/*
 * Sample every referenced task on the system, then return how long the whole
 * sweep took in absolute-time units (used to offset the next PET deadline).
 * Called with the PET mutex held and an action configured.
 */
static uint64_t
kppet_sample_tasks(uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(kppet.g_actionid > 0);

	uint64_t start_abs = mach_absolute_time();

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START);

	kern_return_t kr = kppet_tasks_prepare();
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr);
		BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END);
		return mach_absolute_time() - start_abs;
	}

	for (unsigned int i = 0; i < kppet.g_ntasks; i++) {
		task_t task = kppet.g_tasks[i];
		assert(task != TASK_NULL);
		kppet_sample_task(task, idle_rate);
		/* Drop the reference taken in kppet_tasks_prepare. */
		task_deallocate(task);
		kppet.g_tasks[i] = TASK_NULL;
	}

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks);
	kppet.g_ntasks = 0;
	return mach_absolute_time() - start_abs;
}
573 
574 #pragma mark - sysctl accessors
575 
576 int
kppet_get_idle_rate(void)577 kppet_get_idle_rate(void)
578 {
579 	return kppet.g_idle_rate;
580 }
581 
582 int
kppet_set_idle_rate(int new_idle_rate)583 kppet_set_idle_rate(int new_idle_rate)
584 {
585 	kppet.g_idle_rate = new_idle_rate;
586 	return 0;
587 }
588 
/*
 * Recompute whether lightweight PET is active: records a start timestamp
 * when sampling is on and lightweight mode is selected, 0 otherwise, then
 * refreshes the on-CPU callback state.
 */
void
kppet_lightweight_active_update(void)
{
	kppet_lightweight_start_time = (kperf_is_sampling() && kppet.g_lightweight) ? mach_continuous_time() : 0;
	kperf_on_cpu_update();
}
595 
596 int
kppet_get_lightweight_pet(void)597 kppet_get_lightweight_pet(void)
598 {
599 	return kppet.g_lightweight;
600 }
601 
602 int
kppet_set_lightweight_pet(int on)603 kppet_set_lightweight_pet(int on)
604 {
605 	if (kperf_is_sampling()) {
606 		return EBUSY;
607 	}
608 
609 	kppet.g_lightweight = (on == 1);
610 	kppet_lightweight_active_update();
611 	return 0;
612 }
613