/*
 * Copyright (c) 2011-2018 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * Profile Every Thread (PET) provides a profile of all threads on the system
 * when a timer fires.  PET supports the "record waiting threads" mode in
 * Instruments, and used to be called All Thread States (ATS).  New tools should
 * adopt the lightweight PET mode, which provides the same information, but with
 * much less overhead.
 *
 * When traditional (non-lightweight) PET is active, a migrating timer call
 * causes the PET thread to wake up.  The timer handler also issues a broadcast
 * IPI to the other CPUs, to provide a (somewhat) synchronized set of on-core
 * samples.  This is provided for backwards-compatibility with clients that
 * expect on-core samples, when PET's timer was based off the on-core timers.
 * Because PET sampling can take on the order of milliseconds, the PET thread
 * will enter a new timer deadline only after it finishes sampling.  This
 * perturbs the timer cadence by the duration of PET sampling, but it leaves
 * the system to work on non-profiling tasks for the duration of the timer
 * period.
 *
 * Lightweight PET samples the system less-intrusively than normal PET
 * mode.  Instead of iterating tasks and threads on each sample, it increments
 * a global generation count, `kppet_gencount`, which is checked as threads are
 * context switched on-core.  If the thread's local generation count is older
 * than the global generation, the thread samples itself.
 *
 *            |  |
 * thread A   +--+---------|
 *            |  |
 * thread B   |--+---------------|
 *            |  |
 * thread C   |  |         |-------------------------------------
 *            |  |         |
 * thread D   |  |         |     |-------------------------------
 *            |  |         |     |
 *            +--+---------+-----+--------------------------------> time
 *               |         |     |
 *               |         +-----+--- threads sampled when they come on-core in
 *               |                    kperf_pet_switch_context
 *               |
 *               +--- PET timer fire, sample on-core threads A and B,
 *                    increment kppet_gencount
 */
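
/*
 * A minimal sketch of the lightweight check made as a thread comes on-core
 * (the full logic is in kppet_on_cpu, below); the thread's `kperf_pet_gen` is
 * brought up to date by the sampling path:
 *
 *	if (thread->kperf_pet_gen != kppet_gencount) {
 *		kperf_sample(sample, &ctx, actionid, flags);
 *	}
 */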

#include <mach/mach_types.h>
#include <sys/errno.h>

#include <kperf/kperf.h>
#include <kperf/buffer.h>
#include <kperf/sample.h>
#include <kperf/context.h>
#include <kperf/action.h>
#include <kperf/pet.h>
#include <kperf/kptimer.h>

#include <kern/task.h>
#include <kern/kalloc.h>
#if defined(__x86_64__)
#include <i386/mp.h>
#endif /* defined(__x86_64__) */

static LCK_MTX_DECLARE(kppet_mtx, &kperf_lck_grp);

static struct {
	unsigned int g_actionid;
	/*
	 * The idle rate controls how many sampling periods to skip if a thread
	 * is idle.
	 */
	uint32_t g_idle_rate;
	bool g_setup:1;
	bool g_lightweight:1;
	struct kperf_sample *g_sample;

	thread_t g_sample_thread;

	/*
	 * Used by the PET thread to manage which threads and tasks to sample.
	 */
	thread_t *g_threads;
	unsigned int g_nthreads;
	size_t g_threads_count;

	task_t *g_tasks;
	unsigned int g_ntasks;
	size_t g_tasks_count;
} kppet = {
	.g_actionid = 0,
	.g_idle_rate = KPERF_PET_DEFAULT_IDLE_RATE,
};

bool kppet_lightweight_active = false;
_Atomic uint32_t kppet_gencount = 0;

static uint64_t kppet_sample_tasks(uint32_t idle_rate);
static void kppet_thread(void * param, wait_result_t wr);

static void
kppet_lock_assert_owned(void)
{
	lck_mtx_assert(&kppet_mtx, LCK_MTX_ASSERT_OWNED);
}

static void
kppet_lock(void)
{
	lck_mtx_lock(&kppet_mtx);
}

static void
kppet_unlock(void)
{
	lck_mtx_unlock(&kppet_mtx);
}

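/*
 * Called by the scheduler while interrupts are disabled, as `thread` switches
 * on-core.  Samples the thread if it hasn't already been sampled in the
 * current `kppet_gencount` generation.
 */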
void
kppet_on_cpu(thread_t thread, thread_continue_t continuation,
    uintptr_t *starting_fp)
{
	assert(thread != NULL);
	assert(ml_get_interrupts_enabled() == FALSE);

	uint32_t actionid = kppet.g_actionid;
	if (actionid == 0) {
		return;
	}

	if (thread->kperf_pet_gen != os_atomic_load(&kppet_gencount, relaxed)) {
		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);

		task_t task = get_threadtask(thread);
		struct kperf_context ctx = {
			.cur_thread = thread,
			.cur_task = task,
			.cur_pid = task_pid(task),
			.starting_fp = starting_fp,
		};
		/*
		 * Use a per-CPU interrupt buffer, since this is only called
		 * while interrupts are disabled, from the scheduler.
		 */
		struct kperf_sample *sample = kperf_intr_sample_buffer();
		if (!sample) {
			BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END, 1);
			return;
		}

		unsigned int flags = SAMPLE_FLAG_NON_INTERRUPT | SAMPLE_FLAG_PEND_USER;
		if (continuation != NULL) {
			flags |= SAMPLE_FLAG_CONTINUATION;
		}
		kperf_sample(sample, &ctx, actionid, flags);

		BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
	} else {
		BUF_VERB(PERF_PET_SAMPLE_THREAD,
		    os_atomic_load(&kppet_gencount, relaxed), thread->kperf_pet_gen);
	}
}

#pragma mark - state transitions

/*
 * Lazily initialize PET.  The PET thread never exits once PET has been used
 * once.
 */
static void
kppet_setup(void)
{
	if (kppet.g_setup) {
		return;
	}

	kern_return_t kr = kernel_thread_start(kppet_thread, NULL,
	    &kppet.g_sample_thread);
	if (kr != KERN_SUCCESS) {
		panic("kperf: failed to create PET thread %d", kr);
	}

	thread_set_thread_name(kppet.g_sample_thread, "kperf-pet-sampling");
	kppet.g_setup = true;
}

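/*
 * Configure PET to sample with `actionid`, allocating the sample buffers on
 * first use.  An `actionid` of 0 tears PET down, freeing the buffers and any
 * scratch space used to hold task and thread references.
 */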
void
kppet_config(unsigned int actionid)
{
	/*
	 * Resetting kperf shouldn't get the PET thread started.
	 */
	if (actionid == 0 && !kppet.g_setup) {
		return;
	}

	kppet_setup();

	kppet_lock();

	kppet.g_actionid = actionid;

	if (actionid > 0) {
		if (!kppet.g_sample) {
			kppet.g_sample = kalloc_type_tag(struct kperf_sample,
			    Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG);
			kppet.g_sample->usample.usample_min = kalloc_type_tag(
				struct kperf_usample_min, Z_WAITOK | Z_NOFAIL, VM_KERN_MEMORY_DIAG);
		}
	} else {
		if (kppet.g_tasks) {
			assert(kppet.g_tasks_count != 0);
			kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
			kppet.g_tasks = NULL;
			kppet.g_tasks_count = 0;
			kppet.g_ntasks = 0;
		}
		if (kppet.g_threads) {
			assert(kppet.g_threads_count != 0);
			kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads);
			kppet.g_threads = NULL;
			kppet.g_threads_count = 0;
			kppet.g_nthreads = 0;
		}
		if (kppet.g_sample != NULL) {
			kfree_type(struct kperf_usample_min,
			    kppet.g_sample->usample.usample_min);
			kfree_type(struct kperf_sample, kppet.g_sample);
		}
	}

	kppet_unlock();
}

void
kppet_reset(void)
{
	kppet_config(0);
	kppet_set_idle_rate(KPERF_PET_DEFAULT_IDLE_RATE);
	kppet_set_lightweight_pet(0);
}

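/*
 * Kick off a round of sampling, called when the PET timer fires.
 */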
void
kppet_wake_thread(void)
{
	thread_wakeup(&kppet);
}

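/*
 * The main loop of the PET thread: sleep until an action is configured and
 * the timer fires, sample all tasks, and re-enter the timer with the duration
 * of sampling, keeping the cadence described at the top of this file.
 */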
__attribute__((noreturn))
static void
kppet_thread(void * __unused param, wait_result_t __unused wr)
{
	kppet_lock();

	for (;;) {
		BUF_INFO(PERF_PET_IDLE);

		do {
			(void)lck_mtx_sleep(&kppet_mtx, LCK_SLEEP_DEFAULT, &kppet,
			    THREAD_UNINT);
		} while (kppet.g_actionid == 0);

		BUF_INFO(PERF_PET_RUN);

		uint64_t sampledur_abs = kppet_sample_tasks(kppet.g_idle_rate);

		kptimer_pet_enter(sampledur_abs);
	}
}

#pragma mark - sampling

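/*
 * Sample a single thread of `task`, which must be suspended.  Idle threads --
 * those that haven't run since they were last sampled -- only have their
 * callstacks recorded once every `idle_rate` samples.
 */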
static void
kppet_sample_thread(int pid, task_t task, thread_t thread, uint32_t idle_rate)
{
	kppet_lock_assert_owned();

	uint32_t sample_flags = SAMPLE_FLAG_IDLE_THREADS |
	    SAMPLE_FLAG_THREAD_ONLY;

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_START);

	struct kperf_context ctx = {
		.cur_thread = thread,
		.cur_task = task,
		.cur_pid = pid,
	};

	boolean_t thread_dirty = kperf_thread_get_dirty(thread);

	/*
	 * Clean a dirty thread, and skip the callstack sample if the thread
	 * was not dirty and has skipped fewer than `idle_rate` samples.
	 */
	if (thread_dirty) {
		kperf_thread_set_dirty(thread, FALSE);
	} else if ((thread->kperf_pet_cnt % idle_rate) != 0) {
		sample_flags |= SAMPLE_FLAG_EMPTY_CALLSTACK;
	}
	thread->kperf_pet_cnt++;

	kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid, sample_flags);
	kperf_sample_user(&kppet.g_sample->usample, &ctx, kppet.g_actionid,
	    sample_flags);

	BUF_VERB(PERF_PET_SAMPLE_THREAD | DBG_FUNC_END);
}

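/*
 * Store and reference all threads of `task` in `kppet.g_threads`, growing the
 * buffer and retrying until it can hold them all.
 */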
static kern_return_t
kppet_threads_prepare(task_t task)
{
	kppet_lock_assert_owned();

	vm_size_t count_needed;

	for (;;) {
		task_lock(task);

		if (!task->active) {
			task_unlock(task);
			return KERN_FAILURE;
		}

		/*
		 * With the task locked, figure out if enough space has been allocated to
		 * contain all of the thread references.
		 */
		count_needed = task->thread_count;
		if (count_needed <= kppet.g_threads_count) {
			break;
		}

		/*
		 * Otherwise, allocate more and try again.
		 */
		task_unlock(task);

		kfree_type(thread_t, kppet.g_threads_count, kppet.g_threads);

		assert(count_needed > 0);
		kppet.g_threads_count = count_needed;

		kppet.g_threads = kalloc_type_tag(thread_t, kppet.g_threads_count,
		    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
		if (kppet.g_threads == NULL) {
			kppet.g_threads_count = 0;
			return KERN_RESOURCE_SHORTAGE;
		}
	}

	thread_t thread;
	kppet.g_nthreads = 0;
	queue_iterate(&(task->threads), thread, thread_t, task_threads) {
		thread_reference(thread);
		kppet.g_threads[kppet.g_nthreads++] = thread;
	}

	task_unlock(task);

	return (kppet.g_nthreads > 0) ? KERN_SUCCESS : KERN_FAILURE;
}

/*
 * Sample a `task`, using `idle_rate` to control whether idle threads need to be
 * re-sampled.
 *
 * The task must be referenced.
 */
static void
kppet_sample_task(task_t task, uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(task != kernel_task);
	if (task == kernel_task) {
		return;
	}

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_START);

	int pid = task_pid(task);
	if (kperf_action_has_task(kppet.g_actionid)) {
		struct kperf_context ctx = {
			.cur_task = task,
			.cur_pid = pid,
		};

		kperf_sample(kppet.g_sample, &ctx, kppet.g_actionid,
		    SAMPLE_FLAG_TASK_ONLY);
	}

	if (!kperf_action_has_thread(kppet.g_actionid)) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END);
		return;
	}

	/*
	 * Suspend the task to see an atomic snapshot of all its threads.  This
	 * is expensive and disruptive.
	 */
	kern_return_t kr = task_suspend_internal(task);
	if (kr != KERN_SUCCESS) {
		BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, 1);
		return;
	}

	kr = kppet_threads_prepare(task);
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_THREAD, kr);
		goto out;
	}

	for (unsigned int i = 0; i < kppet.g_nthreads; i++) {
		thread_t thread = kppet.g_threads[i];
		assert(thread != THREAD_NULL);

		kppet_sample_thread(pid, task, thread, idle_rate);

		thread_deallocate(kppet.g_threads[i]);
	}

out:
	task_resume_internal(task);

	BUF_VERB(PERF_PET_SAMPLE_TASK | DBG_FUNC_END, kppet.g_nthreads);
}

/*
 * Store and reference all tasks on the system, so they can be safely inspected
 * outside the `tasks_threads_lock`.
 */
static kern_return_t
kppet_tasks_prepare(void)
{
	kppet_lock_assert_owned();

	vm_size_t count_needed = 0;

	for (;;) {
		lck_mtx_lock(&tasks_threads_lock);

		/*
		 * With the lock held, break out of the lock/unlock loop if
		 * there's enough space to store all the tasks.
		 */
		count_needed = tasks_count;
		if (count_needed <= kppet.g_tasks_count) {
			break;
		}

		/*
		 * Otherwise, allocate more memory outside of the lock.
		 */
		lck_mtx_unlock(&tasks_threads_lock);

		if (count_needed > kppet.g_tasks_count) {
			if (kppet.g_tasks_count != 0) {
				kfree_type(task_t, kppet.g_tasks_count, kppet.g_tasks);
			}

			assert(count_needed > 0);
			kppet.g_tasks_count = count_needed;

			kppet.g_tasks = kalloc_type_tag(task_t, kppet.g_tasks_count,
			    Z_WAITOK | Z_ZERO, VM_KERN_MEMORY_DIAG);
			if (!kppet.g_tasks) {
				kppet.g_tasks_count = 0;
				return KERN_RESOURCE_SHORTAGE;
			}
		}
	}

	task_t task = TASK_NULL;
	kppet.g_ntasks = 0;
	queue_iterate(&tasks, task, task_t, tasks) {
		bool eligible_task = task != kernel_task;
		if (eligible_task) {
			task_reference(task);
			kppet.g_tasks[kppet.g_ntasks++] = task;
		}
	}

	lck_mtx_unlock(&tasks_threads_lock);

	return KERN_SUCCESS;
}

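/*
 * Sample every eligible task on the system, returning the duration of the
 * sampling in Mach absolute time units, so the PET timer can be re-armed
 * relative to the end of sampling.
 */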
static uint64_t
kppet_sample_tasks(uint32_t idle_rate)
{
	kppet_lock_assert_owned();
	assert(kppet.g_actionid > 0);

	uint64_t start_abs = mach_absolute_time();

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_START);

	kern_return_t kr = kppet_tasks_prepare();
	if (kr != KERN_SUCCESS) {
		BUF_INFO(PERF_PET_ERROR, ERR_TASK, kr);
		BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END);
		return mach_absolute_time() - start_abs;
	}

	for (unsigned int i = 0; i < kppet.g_ntasks; i++) {
		task_t task = kppet.g_tasks[i];
		assert(task != TASK_NULL);
		kppet_sample_task(task, idle_rate);
		task_deallocate(task);
		kppet.g_tasks[i] = TASK_NULL;
	}

	BUF_INFO(PERF_PET_SAMPLE | DBG_FUNC_END, kppet.g_ntasks);
	kppet.g_ntasks = 0;
	return mach_absolute_time() - start_abs;
}

#pragma mark - sysctl accessors

int
kppet_get_idle_rate(void)
{
	return kppet.g_idle_rate;
}

int
kppet_set_idle_rate(int new_idle_rate)
{
	kppet.g_idle_rate = new_idle_rate;
	return 0;
}

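/*
 * Keep `kppet_lightweight_active` -- and with it, the context switch hook --
 * in sync with whether kperf is sampling and lightweight PET is requested.
 */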
void
kppet_lightweight_active_update(void)
{
	kppet_lightweight_active = (kperf_is_sampling() && kppet.g_lightweight);
	kperf_on_cpu_update();
}

int
kppet_get_lightweight_pet(void)
{
	return kppet.g_lightweight;
}

int
kppet_set_lightweight_pet(int on)
{
	if (kperf_is_sampling()) {
		return EBUSY;
	}

	kppet.g_lightweight = (on == 1);
	kppet_lightweight_active_update();
	return 0;
}