1 #include <darwintest.h>
2
3 #include <assert.h>
4 #include <mach/clock_types.h>
5 #include <unistd.h>
6 #include <stdlib.h>
7 #include <stdio.h>
8 #include <errno.h>
9 #include <err.h>
10 #include <sys/time.h>
11 #include <mach/mach.h>
12 #include <mach/mach_time.h>
13 #include <pthread.h>
14 #include <perfdata/perfdata.h>
15 #include <sys/sysctl.h>
16 #include <sys/stat.h>
17 #include <sys/mount.h>
18 #include <stdbool.h>
19 #include <signal.h>
20 #include <sys/resource.h>
21 #include <sys/resource_private.h>
22 #include <time.h>
23 #include <os/atomic_private.h>
24 #include <libproc.h>
25 #include <TargetConditionals.h>
26 #include <sys/times.h>
27 #include "sched_test_utils.h"
28
29 #if __has_include(<mach/mach_time_private.h>)
30 #include <mach/mach_time_private.h>
31 #else
32 kern_return_t mach_get_times(uint64_t* absolute_time,
33 uint64_t* continuous_time,
34 struct timespec *tp);
35 #endif
36
37 /*
38 * This test program creates up to 8 worker threads performing
39 * mixed workloads of system calls (which contribute to both
40 * user and system time), as well as spins in userspace (which
41 * only contribute to user time).
42 *
43 * setitimer(2) is used to program timers that fire signals
44 * after various thresholds. The signal handler detects
45 * which thread the signal was delivered on by matching the
46 * stack pointer to ranges for each thread.
47 *
48 * After the test scenario is complete, the distribution of
49 * threads which received interrupts is evaluated to match
50 * expected heuristics.
51 */
52
53 T_GLOBAL_META(
54 T_META_RUN_CONCURRENTLY(false),
55 T_META_CHECK_LEAKS(false),
56 T_META_ALL_VALID_ARCHS(true),
57 T_META_RADAR_COMPONENT_NAME("xnu"),
58 T_META_RADAR_COMPONENT_VERSION("scheduler"),
59 T_META_OWNER("chimene"),
60 T_META_ENABLED(TARGET_OS_OSX),
61 T_META_TAG_VM_NOT_ELIGIBLE,
62 T_META_ASROOT(true) // for trace recording
63 );
64
65 static void *stat_thread(void *arg);
66 static void *statfs_thread(void *arg);
67
68 static void alrm_handler(int, struct __siginfo *, void *);
69
70 static semaphore_t gMainWaitForWorkers;
71 static semaphore_t gWorkersStart;
72
73 static pthread_mutex_t gShouldExitMutex = PTHREAD_MUTEX_INITIALIZER;
74 static pthread_cond_t gShouldExitCondition = PTHREAD_COND_INITIALIZER;
75
76 static _Atomic bool gShouldExit = false;
77
78 static const uint32_t max_threads = 9;
79
80 static struct threadentry {
81 pthread_t thread;
82 uint64_t tid;
83 void* stack_addr;
84 size_t stack_size;
85 bool expect_cpu_usage;
86 uint32_t alrm_count;
87 uint32_t vtalrm_count;
88 uint32_t prof_count;
89 uint32_t xcpu_count;
90 struct thsc_time_cpi self_stats;
91 } __attribute__((aligned(128))) gThreadList[max_threads];
92
93 static uint32_t nworkers;
94 static uint32_t nthreads;
95
96 static double offcore_time_percent_threshold = 75.0;
97
98 static bool is_rosetta = false;
99
100 static mach_timebase_info_data_t timebase_info;
101
102 /* Some statistics APIs return host abstime instead of Rosetta-translated abstime */
103 static uint64_t
abs_to_nanos_host(uint64_t abstime)104 abs_to_nanos_host(uint64_t abstime)
105 {
106 if (is_rosetta) {
107 return abstime * 125 / 3;
108 } else {
109 return abs_to_nanos(abstime);
110 }
111 }
112
/*
 * Ask the kernel whether this process is running under Rosetta
 * translation.  Returns 1 if translated, 0 if native (or if the sysctl
 * does not exist, i.e. pre-Rosetta kernels), and -1 on any other error.
 */
static int
processIsTranslated(void)
{
	int translated = 0;
	size_t len = sizeof(translated);

	if (sysctlbyname("sysctl.proc_translated", &translated, &len, NULL, 0) != 0) {
		/* ENOENT means the kernel has no notion of translation: native. */
		return (errno == ENOENT) ? 0 : -1;
	}
	return translated;
}
127
128 static void
fill_thread_stats(uint32_t i)129 fill_thread_stats(uint32_t i)
130 {
131 struct threadentry *entry = &gThreadList[i];
132
133 int rv = thread_selfcounts(THSC_TIME_CPI, &entry->self_stats, sizeof(entry->self_stats));
134 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "thread_selfcounts(THSC_TIME_CPI)");
135 }
136
137 T_DECL(setitimer,
138 "Test various setitimer delivered signals to CPU-burning threads")
139 {
140 T_SETUPBEGIN;
141
142 int rv;
143 kern_return_t kr;
144 uint32_t ncpu;
145 size_t ncpu_size = sizeof(ncpu);
146
147 trace_handle_t trace = NULL;
148
149 if (geteuid() == 0) {
150 trace = begin_collect_trace_fmt(COLLECT_TRACE_FLAG_DISABLE_SYSCALLS,
151 argc, argv, "test_setitimer");
152 }
153
154 struct sched_param self_param = {.sched_priority = 47};
155
156 rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &self_param);
157 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_setschedparam");
158
159 kr = mach_timebase_info(&timebase_info);
160 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
161
162 long ticks = sysconf(_SC_CLK_TCK);
163 T_LOG("sysconf(_SC_CLK_TCK) = %ld\n", ticks);
164
165 is_rosetta = processIsTranslated();
166
167 rv = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
168 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
169
170 if (ncpu < 2) {
171 T_SKIP("%d CPUs not supported for test, returning success", ncpu);
172 }
173
174 nworkers = MIN(max_threads - 1, ncpu);
175 nthreads = nworkers + 1;
176
177 T_LOG("rosetta = %d\n", is_rosetta);
178 T_LOG("hw.ncpu = %d\n", ncpu);
179 T_LOG("nworkers = %d\n", nworkers);
180 T_LOG("nthreads = %d\n", nthreads);
181
182 kr = semaphore_create(mach_task_self(), &gMainWaitForWorkers, SYNC_POLICY_FIFO, 0);
183 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
184
185 kr = semaphore_create(mach_task_self(), &gWorkersStart, SYNC_POLICY_FIFO, 0);
186 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
187
188 pthread_attr_t attr;
189
190 rv = pthread_attr_init(&attr);
191 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
192
193 struct sched_param child_param = {.sched_priority = 37};
194
195 rv = pthread_attr_setschedparam(&attr, &child_param);
196 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_set_qos_class_np");
197
198 for (uint32_t i = 0; i < nthreads; i++) {
199 if (i == 0) {
200 gThreadList[i].thread = pthread_self();
201 } else {
202 rv = pthread_create(&gThreadList[i].thread, &attr,
203 i % 2 ? stat_thread : statfs_thread,
204 (void *)(uintptr_t)i);
205 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
206 gThreadList[i].expect_cpu_usage = i % 2 == 0 ? true : false;
207 }
208
209 rv = pthread_threadid_np(gThreadList[i].thread, &gThreadList[i].tid);
210 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_threadid_np");
211
212 gThreadList[i].stack_addr = pthread_get_stackaddr_np(gThreadList[i].thread);
213 gThreadList[i].stack_size = pthread_get_stacksize_np(gThreadList[i].thread);
214 }
215
216 rv = pthread_attr_destroy(&attr);
217 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
218
219 for (uint32_t i = 1; i < nthreads; i++) {
220 kr = semaphore_wait(gMainWaitForWorkers);
221 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait()");
222 }
223
224 for (uint32_t i = 0; i < nthreads; i++) {
225 T_LOG("Thread %p (0x%llx) checked in, stack %p/%p\n",
226 (void*)gThreadList[i].thread,
227 gThreadList[i].tid,
228 gThreadList[i].stack_addr,
229 (void *)gThreadList[i].stack_size);
230 }
231
232 check_recommended_core_mask(NULL);
233
234 wait_for_quiescence_default(argc, argv);
235
236 T_SETUPEND;
237
238 T_LOG("Finished wait_for_quiescence_default, starting test\n");
239
240 sigset_t sigmk;
241 sigemptyset(&sigmk);
242
243 struct sigaction sigact = {
244 .sa_sigaction = alrm_handler,
245 .sa_mask = sigmk,
246 .sa_flags = SA_SIGINFO,
247 };
248
249 rv = sigaction(SIGALRM, &sigact, NULL);
250 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGALRM)");
251
252 rv = sigaction(SIGVTALRM, &sigact, NULL);
253 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGVTALRM)");
254
255 rv = sigaction(SIGPROF, &sigact, NULL);
256 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGPROF)");
257
258 rv = sigaction(SIGXCPU, &sigact, NULL);
259 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGXCPU)");
260
261 struct itimerval itime = {
262 .it_interval.tv_sec = 0,
263 .it_interval.tv_usec = 10000,
264 .it_value.tv_sec = 0,
265 .it_value.tv_usec = 10, /* immediately */
266 };
267
268 rv = setitimer(ITIMER_REAL, &itime, NULL);
269 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
270
271 rv = setitimer(ITIMER_VIRTUAL, &itime, NULL);
272 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
273
274 rv = setitimer(ITIMER_PROF, &itime, NULL);
275 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
276
277 struct rlimit rlim = {};
278
279 rv = getrlimit(RLIMIT_CPU, &rlim);
280 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrlimit(RLIMIT_CPU)");
281
282 rlim.rlim_cur = 1;
283 rv = setrlimit(RLIMIT_CPU, &rlim);
284 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)");
285
286 rv = pthread_mutex_lock(&gShouldExitMutex);
287 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
288
289 kr = semaphore_signal_all(gWorkersStart);
290 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all()");
291
292 struct timespec timenow = {};
293 uint64_t time_start;
294
295 clock_t st_time, en_time;
296 struct tms st_cpu, en_cpu;
297 st_time = times(&st_cpu);
298
299 struct rusage ru_start, ru_end;
300 rv = getrusage(RUSAGE_SELF, &ru_start);
301 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrusage(RUSAGE_SELF, &ru_start)");
302
303 kr = mach_get_times(&time_start, NULL, &timenow);
304 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_get_times()");
305
306 struct timespec timeout = {
307 .tv_sec = timenow.tv_sec + 10,
308 .tv_nsec = timenow.tv_nsec,
309 };
310
311 uint64_t time_end = 0;
312
313 do {
314 assert(os_atomic_load(&gShouldExit, relaxed) == false);
315
316 rv = pthread_cond_timedwait(&gShouldExitCondition, &gShouldExitMutex, &timeout);
317 if (rv == ETIMEDOUT) {
318 os_atomic_store(&gShouldExit, true, relaxed);
319
320 time_end = mach_absolute_time();
321
322 struct itimerval itime_stop = {
323 .it_interval.tv_sec = 0,
324 .it_interval.tv_usec = 0,
325 .it_value.tv_sec = 0,
326 .it_value.tv_usec = 0, /* stop immediately */
327 };
328
329 rv = setitimer(ITIMER_REAL, &itime_stop, NULL);
330 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
331
332 rv = setitimer(ITIMER_VIRTUAL, &itime_stop, NULL);
333 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
334
335 rv = setitimer(ITIMER_PROF, &itime_stop, NULL);
336 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
337
338 rlim.rlim_cur = RLIM_INFINITY;
339 rv = setrlimit(RLIMIT_CPU, &rlim);
340 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)");
341
342 en_time = times(&en_cpu);
343 rv = getrusage(RUSAGE_SELF, &ru_end);
344 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrusage(RUSAGE_SELF, &ru_end)");
345
346 break;
347 } else {
348 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_cond_timedwait(&gShouldExitCondition, ...)");
349 }
350 } while (true);
351
352 rv = pthread_mutex_unlock(&gShouldExitMutex);
353 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
354
355 for (uint32_t i = 1; i < nthreads; i++) {
356 rv = pthread_join(gThreadList[i].thread, NULL);
357 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join");
358 }
359
360 fill_thread_stats(0);
361
362 struct rusage_info_v6 ru = {};
363 rv = proc_pid_rusage(getpid(), RUSAGE_INFO_V6, (rusage_info_t *)&ru);
364 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "proc_pid_rusage");
365
366 uint64_t test_duration = time_end - time_start;
367 uint64_t test_duration_ns = abs_to_nanos(test_duration);
368
369 double elapsed_secs = (double) test_duration_ns / (uint64_t)NSEC_PER_SEC;
370
371 T_LOG("test duration %3.3f seconds (%lld)\n", elapsed_secs, test_duration_ns);
372
373 uintmax_t real_ticks = en_time - st_time;
374 uintmax_t user_ticks = en_cpu.tms_utime - st_cpu.tms_utime;
375 uintmax_t system_ticks = en_cpu.tms_stime - st_cpu.tms_stime;
376
377 T_LOG("times(): Real Time: %jd (%3.3f seconds), User Time %jd (%3.3f seconds), System Time %jd (%3.3f seconds)\n",
378 real_ticks, (double)real_ticks / (double)ticks,
379 user_ticks, (double)user_ticks / (double)ticks,
380 system_ticks, (double)system_ticks / (double)ticks);
381
382 struct timeval ru_udelta, ru_sdelta;
383
384 timersub(&ru_end.ru_utime, &ru_start.ru_utime, &ru_udelta);
385 timersub(&ru_end.ru_stime, &ru_start.ru_stime, &ru_sdelta);
386
387 T_LOG("rusage(): User Time %ld.%06d, System Time %ld.%06d\n",
388 ru_udelta.tv_sec, ru_udelta.tv_usec,
389 ru_sdelta.tv_sec, ru_sdelta.tv_usec);
390
391 uint64_t total_user_time_ns = abs_to_nanos_host(ru.ri_user_time);
392 double total_user_time_s = (double)total_user_time_ns / (uint64_t)NSEC_PER_SEC;
393
394 uint64_t total_system_time_ns = abs_to_nanos_host(ru.ri_system_time);
395 double total_system_time_s = (double)total_system_time_ns / (uint64_t)NSEC_PER_SEC;
396
397 uint64_t total_time_ns = (total_user_time_ns + total_system_time_ns);
398 double total_time_s = (double)total_time_ns / (uint64_t)NSEC_PER_SEC;
399
400 uint64_t total_runnable_time_ns = abs_to_nanos_host(ru.ri_runnable_time);
401 double total_runnable_time_s = (double)total_runnable_time_ns / (uint64_t)NSEC_PER_SEC;
402
403 uint64_t total_pending_time_ns = total_runnable_time_ns - (total_time_ns);
404 double total_pending_time_s = (double)total_pending_time_ns / (uint64_t)NSEC_PER_SEC;
405
406 uint64_t total_p_time_ns = abs_to_nanos_host(ru.ri_user_ptime + ru.ri_system_ptime);
407 double total_p_time_s = (double)total_p_time_ns / (uint64_t)NSEC_PER_SEC;
408
409 T_LOG("total usage: time: %3.3f user: %3.3f kernel: %3.3f runnable: %3.3f pending: %3.3f pcore: %3.3f\n",
410 total_time_s, total_user_time_s, total_system_time_s,
411 total_runnable_time_s, total_pending_time_s,
412 total_p_time_s);
413
414 /*
415 * "Good" data looks like:
416 *
417 * total usage: time: 77.696 user: 16.570 kernel: 61.126 runnable: 79.951 pending: 2.255 pcore: 72.719
418 * Thread ALRM VTALRM PROF XCPU inst cycle user kernel offcore type
419 * 0x16f78f000 0 251 811 0 27680301973 28913501188 3706622958 ( 38.14%) 6012631083 ( 61.86%) 2.81% statfs
420 * 0x16f81b000 0 2 889 0 27962710058 28780576123 439297291 ( 4.53%) 9259942583 ( 95.47%) 3.01% stat
421 * 0x16f8a7000 0 251 836 0 27558331077 28889228535 3699010000 ( 38.08%) 6016015083 ( 61.92%) 2.85% statfs
422 * 0x16f933000 0 0 939 0 28078084696 28880195679 443067500 ( 4.56%) 9269807666 ( 95.44%) 2.87% stat
423 * 0x16f9bf000 0 283 874 0 27691851016 28969873070 3710916750 ( 38.16%) 6012783541 ( 61.84%) 2.76% statfs
424 * 0x16fa4b000 0 2 908 1 27945063330 28769971396 438583000 ( 4.53%) 9252694291 ( 95.47%) 3.09% stat
425 * 0x16fad7000 0 262 889 0 27328496429 28772748055 3689245375 ( 38.03%) 6011061458 ( 61.97%) 3.00% statfs
426 * 0x16fb63000 0 0 914 0 27942195343 28757254100 439690166 ( 4.53%) 9256659500 ( 95.47%) 3.04% stat
427 * 0x1fe2bb400 1001 0 3 0 72144372 102339334 3532125 ( 9.35%) 34249208 ( 90.65%) 99.62% main
428 */
429 uint32_t total_alrm = 0;
430 uint32_t total_vtalrm = 0;
431 uint32_t total_prof = 0;
432 uint32_t total_xcpu = 0;
433 uint32_t total_vtalrm_in_cpubound = 0;
434
435 uint32_t total_threads_not_finding_cpus = 0;
436
437 T_LOG("Thread ALRM VTALRM PROF XCPU "
438 "inst cycle user kernel "
439 "offcore type\n");
440
441 for (uint32_t i = 0; i < nthreads; i++) {
442 uint64_t user_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_user_time_mach);
443 uint64_t system_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_system_time_mach);
444
445
446 uint64_t total_time = user_time + system_time;
447
448 double percentage_user = (double)user_time / (double) total_time * 100;
449 double percentage_system = (double)system_time / (double) total_time * 100;
450
451 uint64_t not_running_time = test_duration_ns - total_time;
452 /*
453 * The worker threads spend a little bit of time waking up before the test
454 * start time is captured, so they can occasionally have a larger total_time
455 * than the test duration, leading to underflow in not_running_time and
456 * test failures.
457 * rdar://106147865
458 */
459 if (total_time > test_duration_ns) {
460 not_running_time = 0;
461 }
462 double percentage_not_running = (double)(not_running_time) / (double) test_duration_ns * 100;
463
464 char* thread_type_str = "";
465 char* warning_str = "";
466
467 if (i == 0) {
468 thread_type_str = "main ";
469 } else {
470 thread_type_str = i % 2 ? "stat " : "statfs ";
471
472 if (percentage_not_running > offcore_time_percent_threshold) {
473 total_threads_not_finding_cpus++;
474 warning_str = "** too much offcore time **";
475 }
476 }
477
478 T_LOG("0x%010llx %6d %6d %6d %6d %12lld %12lld %12lld (%7.2f%%) %12lld (%7.2f%%) %7.2f%% %s%s\n",
479 gThreadList[i].tid,
480 gThreadList[i].alrm_count,
481 gThreadList[i].vtalrm_count,
482 gThreadList[i].prof_count,
483 gThreadList[i].xcpu_count,
484 gThreadList[i].self_stats.ttci_instructions,
485 gThreadList[i].self_stats.ttci_cycles,
486 user_time, percentage_user,
487 system_time, percentage_system,
488 percentage_not_running,
489 thread_type_str, warning_str);
490
491 total_alrm += gThreadList[i].alrm_count;
492 total_vtalrm += gThreadList[i].vtalrm_count;
493 total_prof += gThreadList[i].prof_count;
494 total_xcpu += gThreadList[i].xcpu_count;
495
496 if (gThreadList[i].expect_cpu_usage) {
497 total_vtalrm_in_cpubound += gThreadList[i].vtalrm_count;
498 }
499 }
500
501 /*
502 * We expect all SIGALRM to go to the main thread, because it is the
503 * first thread in the process with the signal unmasked, and we
504 * never expect the signal handler itself to take >10ms
505 *
506 * This can happen if the main thread is preempted for the entire 10ms duration, though.
507 * Being high priority, it shouldn't be delayed for more than 10ms too often.
508 * Allow up to 10% to deliver to other threads.
509 */
510 if ((double)gThreadList[0].alrm_count * 100 / total_alrm < 90.0) {
511 T_FAIL("SIGALRM delivered to non-main thread more than 10%% of the time (%d of %d)",
512 gThreadList[0].alrm_count,
513 total_alrm);
514 }
515
516 /* We expect all worker threads to find CPUs of their own for most of the test */
517 if (total_threads_not_finding_cpus != 0) {
518 T_FAIL("%d worker threads spent more than %2.0f%% of time off-core",
519 total_threads_not_finding_cpus, offcore_time_percent_threshold);
520 }
521
522 /*
523 * SIGVTALRM is delivered based on user time, and we expect the busy
524 * threads to have an advantage and account for 80% (non-scientific) of events,
525 * since the other threads will spend more time in kernel mode.
526 */
527 if (total_vtalrm_in_cpubound * 100 / total_vtalrm < 80) {
528 T_FAIL("SIGVTALRM delivered to threads without extra userspace spin (only %d of %d)",
529 total_vtalrm_in_cpubound, total_vtalrm);
530 }
531
532 /*
533 * SIGPROF is delivered based on user+system time, and we expect it to be distributed
534 * among non-blocked threads (so not the main thread, which only handles SIGALRM).
535 */
536 if (gThreadList[0].prof_count * 100 / total_prof > 1) {
537 T_FAIL("SIGPROF delivered to main thread more than 1%% (%d of %d)",
538 gThreadList[0].prof_count,
539 total_prof);
540 }
541
542 /*
543 * SIGXCPU should be delivered at least once.
544 */
545 if (total_xcpu == 0) {
546 T_FAIL("SIGXCPU delivered %d times (expected at least once)", total_xcpu);
547 }
548
549 if (trace) {
550 end_collect_trace(trace);
551 }
552 }
553
554 static void *
stat_thread(void * arg)555 stat_thread(void *arg)
556 {
557 kern_return_t kr;
558 int rv;
559
560 /* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
561 kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
562 if (kr != KERN_ABORTED) {
563 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
564 }
565
566 rv = pthread_mutex_lock(&gShouldExitMutex);
567 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
568 rv = pthread_mutex_unlock(&gShouldExitMutex);
569 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
570
571 do {
572 struct stat sb;
573
574 rv = stat("/", &sb);
575 if (rv != 0) {
576 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "stat");
577 }
578 } while (os_atomic_load(&gShouldExit, relaxed) == false);
579
580 fill_thread_stats((uint32_t)(uintptr_t)arg);
581
582 return NULL;
583 }
584
585 static void *
statfs_thread(void * arg)586 statfs_thread(void *arg)
587 {
588 kern_return_t kr;
589 uint64_t previous_spin_timestamp;
590 int iteration = 0;
591 int rv;
592
593 /* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
594 kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
595 if (kr != KERN_ABORTED) {
596 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
597 }
598
599 rv = pthread_mutex_lock(&gShouldExitMutex);
600 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
601 rv = pthread_mutex_unlock(&gShouldExitMutex);
602 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
603
604 previous_spin_timestamp = mach_absolute_time();
605
606 do {
607 struct statfs sf;
608
609 /*
610 * Every so many system calls, inject a spin in userspace
611 * proportional to how much time was spent performing the
612 * system calls.
613 */
614 #define SYSCALL_ITERATIONS_BETWEEN_SPINS (10000)
615 if (++iteration % SYSCALL_ITERATIONS_BETWEEN_SPINS == 0) {
616 uint64_t now = mach_absolute_time();
617 uint64_t spin_deadline = now + (now - previous_spin_timestamp) / 2;
618
619 while (mach_absolute_time() < spin_deadline) {
620 ;
621 }
622
623 previous_spin_timestamp = mach_absolute_time();
624 }
625
626 rv = statfs("/", &sf);
627 if (rv != 0) {
628 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "statfs");
629 }
630 } while (os_atomic_load(&gShouldExit, relaxed) == false);
631
632 fill_thread_stats((uint32_t)(uintptr_t)arg);
633
634 return NULL;
635 }
636
/*
 * Shared handler for SIGALRM / SIGVTALRM / SIGPROF / SIGXCPU.
 *
 * Attributes the signal to a thread by matching the interrupted
 * context's stack pointer against each registered thread's stack range,
 * then bumps that thread's per-signal counter.
 * pthread_get_stackaddr_np returns the stack *top* and stacks grow
 * down, so a thread's valid SP range is [stack_addr - stack_size,
 * stack_addr).
 */
static void
alrm_handler(int signum, struct __siginfo *info __unused, void *uap)
{
	ucontext_t *context = (ucontext_t *)uap;
	struct threadentry *entry = NULL;
	void *sp;

	/* Pull the interrupted stack pointer out of the saved thread state. */
#if defined(__arm64__)
	sp = (void *)__darwin_arm_thread_state64_get_sp((context->uc_mcontext)->__ss);
#elif defined(__i386__)
	sp = (void *)(context->uc_mcontext)->__ss.__esp;
#elif defined(__x86_64__)
	sp = (void *)(context->uc_mcontext)->__ss.__rsp;
#else
#error Unrecognized architecture
#endif

	/* Linear search: nworkers + 1 covers the workers plus the main thread. */
	for (uint32_t i = 0; i < nworkers + 1; i++) {
		struct threadentry *t = &gThreadList[i];
		if (((uintptr_t)sp >= ((uintptr_t)t->stack_addr - t->stack_size) &&
		    ((uintptr_t)sp < (uintptr_t)t->stack_addr))) {
			entry = t;
			break;
		}
	}

	if (entry == NULL) {
		T_ASSERT_FAIL("Signal %d delivered to unknown thread, SP=%p", signum, sp);
	}

	/*
	 * Counters are incremented atomically because the handler can run
	 * concurrently on multiple threads (each signal lands on whichever
	 * thread the kernel chooses).
	 */
	switch (signum) {
	case SIGALRM:
		os_atomic_inc(&entry->alrm_count, relaxed);
		break;
	case SIGVTALRM:
		os_atomic_inc(&entry->vtalrm_count, relaxed);
		break;
	case SIGPROF:
		os_atomic_inc(&entry->prof_count, relaxed);
		break;
	case SIGXCPU:
		os_atomic_inc(&entry->xcpu_count, relaxed);
		break;
	}
}
682
683 // When the SIGPROF signal was received.
684 static uint64_t sigprof_received_ns = 0;
685
686 static const uint64_t ITIMER_PROF_SECS = 2;
687 #define PROF_EXTRA_THREAD_COUNT (1)
688 // Include the main thread as participating in consuming CPU.
689 static const uint64_t EXPECTED_PROF_DURATION_SECS =
690 ITIMER_PROF_SECS / (PROF_EXTRA_THREAD_COUNT + 1);
691 static const uint64_t EXTRA_TIMEOUT_SECS = 1;
692
693 struct cpu_spin_args {
694 bool spin_while_true;
695 };
696
/*
 * Spinner thread body: burn CPU until the flag pointed to by arg is
 * cleared by the spawning thread.
 *
 * The flag is read through a volatile-qualified pointer so the compiler
 * must reload it on every iteration; with a plain bool * the load may
 * legally be hoisted out of the loop (the loop body has no side
 * effects), turning this into an infinite spin once optimized.
 * NOTE(review): an _Atomic flag would also eliminate the data race
 * itself, but would require changing the callers' variable type.
 */
static void *
cpu_thread_main(void *arg)
{
	volatile bool *spin = arg;
	while (*spin) {
	}
	return NULL;
}
705
/*
 * SIGPROF handler: stamp the monotonic-clock arrival time into
 * sigprof_received_ns.  The test's poll loop treats any nonzero value
 * as "signal received", so the timestamp doubles as the flag.
 */
static void
sigprof_received(int __unused sig)
{
	sigprof_received_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
}
711
T_DECL(setitimer_prof,
    "ensure a single-threaded process doesn't receive early signals",
    T_META_TAG_VM_PREFERRED)
{
	T_SETUPBEGIN;
	/* Handler records delivery time in sigprof_received_ns (nonzero == received). */
	(void)signal(SIGPROF, sigprof_received);

	/*
	 * ITIMER_PROF counts CPU time (user + system).  With a single thread
	 * spinning, CPU time accrues at ~1x wall time, so the signal should
	 * arrive no earlier than ITIMER_PROF_SECS of wall time, and within
	 * EXTRA_TIMEOUT_SECS of slack after that.
	 */
	uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
	uint64_t expected_end_ns = start_ns + (ITIMER_PROF_SECS * NSEC_PER_SEC);
	uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);

	struct itimerval prof_timer = {
		.it_value.tv_sec = ITIMER_PROF_SECS,
	};

	int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
	T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
	    ITIMER_PROF_SECS);
	T_SETUPEND;

	/* Busy-wait: the spinning itself is what accumulates the CPU time. */
	uint64_t last_ns = 0;
	while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
		if (sigprof_received_ns) {
			break;
		}
	}

	/* Not too late... */
	T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
	    (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
	T_LOG("SIGPROF was delivered %+.6gms after expected duration",
	    (double)(last_ns - expected_end_ns) / 1e6);
	/* ...and, crucially for this test, not too early. */
	T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
	    (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
}
746
747 T_DECL(setitimer_prof_multi_threaded,
748 "ensure a multi-threaded process doesn't receive early signals",
749 T_META_TAG_VM_PREFERRED)
750 {
751 T_SETUPBEGIN;
752 (void)signal(SIGPROF, sigprof_received);
753
754 uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
755 uint64_t expected_end_ns = start_ns + (EXPECTED_PROF_DURATION_SECS * NSEC_PER_SEC);
756 uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);
757
758 struct itimerval prof_timer = {
759 .it_value.tv_sec = ITIMER_PROF_SECS,
760 };
761
762 int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
763 T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
764 ITIMER_PROF_SECS);
765
766 /*
767 * If we start the extra thread before capturing start_ns,
768 * then it immediately starts accumulating time.
769 * Additionally, setitimer does a lazy timestamp capture in task_vtimer_set,
770 * which won't see that previously on-core accumulated time.
771 * These together could cause SIGPROF to deliver slightly before 1s.
772 * Create the threads after the timer starts to ensure that only CPU
773 * work done after the timer starts contributes to SIGPROF.
774 *
775 * rdar://134811651
776 */
777
778 pthread_t cpu_threads[PROF_EXTRA_THREAD_COUNT] = { 0 };
779 bool spin_while_true = true;
780 for (unsigned int i = 0; i < PROF_EXTRA_THREAD_COUNT; i++) {
781 int error = pthread_create(&cpu_threads[i], NULL, cpu_thread_main, &spin_while_true);
782 T_QUIET; T_ASSERT_POSIX_ZERO(error, "create thread %d", i);
783 }
784
785 T_LOG("spinning %d threads on CPU", PROF_EXTRA_THREAD_COUNT + 1);
786
787 T_SETUPEND;
788
789 uint64_t last_ns = 0;
790 while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
791 if (sigprof_received_ns) {
792 break;
793 }
794 }
795
796 spin_while_true = false;
797 T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
798 (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
799 T_LOG("SIGPROF was delivered %+.6gms after expected duration",
800 (double)(last_ns - expected_end_ns) / 1e6);
801 T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
802 (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
803 }
804
805 static uint64_t sigprof_received_usage_mach = 0;
806 extern uint64_t __thread_selfusage(void);
807 static int const ITERATION_COUNT = 50;
808 static uint64_t const ITIMER_PERF_PROF_US = 50000;
809
/*
 * SIGPROF handler for the latency benchmark: stamp the thread's current
 * CPU usage (via the __thread_selfusage trap) at delivery time.  The
 * benchmark loop treats any nonzero value as "signal received".
 */
static void
sigprof_received_usage(int __unused sig)
{
	sigprof_received_usage_mach = __thread_selfusage();
}
815
816 T_DECL(setitimer_prof_latency,
817 "measure the latency of delivering SIGPROF",
818 T_META_TAG_PERF, T_META_TAG_VM_NOT_ELIGIBLE,
819 T_META_TIMEOUT(10))
820 {
821 T_SETUPBEGIN;
822 (void)signal(SIGPROF, sigprof_received_usage);
823
824 struct itimerval prof_timer = {
825 .it_value.tv_usec = ITIMER_PERF_PROF_US,
826 };
827 mach_timebase_info_data_t timebase = { 0 };
828 int ret = mach_timebase_info(&timebase);
829 T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "mach_timebase_info");
830
831 char pd_path[PATH_MAX] = "";
832 pdwriter_t pd = pdwriter_open_tmp("xnu", "setitimer_sigprof_latency", 1, 0, pd_path, sizeof(pd_path));
833 T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(pd, "opened perfdata file");
834 T_SETUPEND;
835
836 T_LOG("running %d iterations, %lluus each", ITERATION_COUNT, ITIMER_PERF_PROF_US);
837 for (int i = 0; i < ITERATION_COUNT; i++) {
838 sigprof_received_usage_mach = 0;
839
840 uint64_t const start_usage_mach = __thread_selfusage();
841
842 ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
843 T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
844 ITIMER_PROF_SECS);
845
846 while (sigprof_received_usage_mach == 0) {
847 // XXX ITIMER_PROF only re-evaluates the timers at user/kernel boundaries.
848 // Issue a trivial syscall (getpid has a fast path without making a syscall)
849 // to ensure the system checks the timer.
850 (void)getppid();
851 }
852
853 uint64_t const end_usage_mach = start_usage_mach +
854 (ITIMER_PERF_PROF_US * NSEC_PER_USEC) * timebase.denom / timebase.numer;
855 uint64_t const latency_mach = sigprof_received_usage_mach - end_usage_mach;
856 uint64_t const latency_ns = latency_mach * timebase.denom / timebase.numer;
857
858 pdwriter_new_value(pd, "sigprof_latency", pdunit_ns, latency_ns);
859 }
860
861 pdwriter_close(pd);
862 T_LOG("wrote perfdata to %s", pd_path);
863 }
864