/* xref: /xnu-12377.81.4/tests/sched/setitimer.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796) */
1 #include <darwintest.h>
2 
3 #include <assert.h>
4 #include <mach/clock_types.h>
5 #include <unistd.h>
6 #include <stdlib.h>
7 #include <stdio.h>
8 #include <errno.h>
9 #include <err.h>
10 #include <sys/time.h>
11 #include <mach/mach.h>
12 #include <mach/mach_time.h>
13 #include <pthread.h>
14 #include <perfdata/perfdata.h>
15 #include <sys/sysctl.h>
16 #include <sys/stat.h>
17 #include <sys/mount.h>
18 #include <stdbool.h>
19 #include <signal.h>
20 #include <sys/resource.h>
21 #include <sys/resource_private.h>
22 #include <time.h>
23 #include <os/atomic_private.h>
24 #include <libproc.h>
25 #include <TargetConditionals.h>
26 #include <sys/times.h>
27 #include "sched_test_utils.h"
28 
29 #if __has_include(<mach/mach_time_private.h>)
30 #include <mach/mach_time_private.h>
31 #else
32 kern_return_t           mach_get_times(uint64_t* absolute_time,
33     uint64_t* continuous_time,
34     struct timespec *tp);
35 #endif
36 
37 /*
38  * This test program creates up to 8 worker threads performing
39  * mixed workloads of system calls (which contribute to both
40  * user and system time), as well as spins in userspace (which
41  * only contribute to user time).
42  *
43  * setitimer(2) is used to program timers that fire signals
44  * after various thresholds. The signal handler detects
45  * which thread the signal was delivered on by matching the
46  * stack pointer to ranges for each thread.
47  *
48  * After the test scenario is complete, the distribution of
49  * threads which received interrupts is evaluated to match
50  * expected heuristics.
51  */
52 
/*
 * Global darwintest metadata: scheduler-owned test that must run alone
 * (timing-sensitive), on all architectures, macOS only, and as root so
 * that a trace can be recorded.
 */
T_GLOBAL_META(
	T_META_RUN_CONCURRENTLY(false),
	T_META_CHECK_LEAKS(false),
	T_META_ALL_VALID_ARCHS(true),
	T_META_RADAR_COMPONENT_NAME("xnu"),
	T_META_RADAR_COMPONENT_VERSION("scheduler"),
	T_META_OWNER("chimene"),
	T_META_ENABLED(TARGET_OS_OSX),
	T_META_TAG_VM_NOT_ELIGIBLE,
	T_META_ASROOT(true)  // for trace recording
	);
64 
/* Worker bodies: odd gThreadList slots run stat_thread, even slots statfs_thread. */
static void *stat_thread(void *arg);
static void *statfs_thread(void *arg);

/* Shared handler for SIGALRM, SIGVTALRM, SIGPROF and SIGXCPU. */
static void alrm_handler(int, struct __siginfo *, void *);

/* Workers signal this once, as they check in at startup. */
static semaphore_t gMainWaitForWorkers;
/* Main signals this to release all workers into their work loops. */
static semaphore_t gWorkersStart;

/* Serialize the start/stop handshake between main and the workers. */
static pthread_mutex_t gShouldExitMutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  gShouldExitCondition = PTHREAD_COND_INITIALIZER;

/* Set by main once the test duration has elapsed; polled by workers. */
static _Atomic bool gShouldExit = false;

/* Upper bound on tracked threads: up to 8 workers plus the main thread. */
static const uint32_t max_threads = 9;

/*
 * Per-thread bookkeeping. The stack range lets the signal handler map a
 * delivered signal back to the thread it interrupted; the signal counters
 * are incremented from that handler. 128-byte aligned so entries do not
 * share cache lines.
 */
static struct threadentry {
	pthread_t thread;
	uint64_t tid;
	void* stack_addr;	/* high end of the stack; stacks grow down */
	size_t stack_size;
	bool expect_cpu_usage;	/* true for the userspace-spinning (statfs) workers */
	uint32_t alrm_count;
	uint32_t vtalrm_count;
	uint32_t prof_count;
	uint32_t xcpu_count;
	struct thsc_time_cpi self_stats;	/* thread_selfcounts() snapshot at exit */
} __attribute__((aligned(128))) gThreadList[max_threads];

static uint32_t nworkers;	/* number of worker threads (excludes main) */
static uint32_t nthreads;	/* nworkers + 1; slot 0 is the main thread */

/* A worker spending more than this percentage of the test off-core fails it. */
static double offcore_time_percent_threshold = 75.0;

static bool is_rosetta = false;

static mach_timebase_info_data_t timebase_info;
101 
102 /* Some statistics APIs return host abstime instead of Rosetta-translated abstime */
103 static uint64_t
abs_to_nanos_host(uint64_t abstime)104 abs_to_nanos_host(uint64_t abstime)
105 {
106 	if (is_rosetta) {
107 		return abstime * 125 / 3;
108 	} else {
109 		return abs_to_nanos(abstime);
110 	}
111 }
112 
/*
 * Returns 1 if this process is running under Rosetta translation,
 * 0 if it is not, and -1 on unexpected sysctl failure.
 * ENOENT simply means the kernel has no such sysctl, i.e. no translation.
 */
static int
processIsTranslated(void)
{
	int translated = 0;
	size_t len = sizeof(translated);

	if (sysctlbyname("sysctl.proc_translated", &translated, &len, NULL, 0) != 0) {
		return (errno == ENOENT) ? 0 : -1;
	}

	return translated;
}
127 
/*
 * Snapshot the calling thread's time/CPI counters into gThreadList[i]
 * for post-test reporting. thread_selfcounts() reads the current
 * thread, so each thread calls this with its own slot index.
 */
static void
fill_thread_stats(uint32_t i)
{
	struct threadentry *entry = &gThreadList[i];

	int rv = thread_selfcounts(THSC_TIME_CPI, &entry->self_stats, sizeof(entry->self_stats));
	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "thread_selfcounts(THSC_TIME_CPI)");
}
136 
137 T_DECL(setitimer,
138     "Test various setitimer delivered signals to CPU-burning threads")
139 {
140 	T_SETUPBEGIN;
141 
142 	int rv;
143 	kern_return_t kr;
144 	uint32_t ncpu;
145 	size_t ncpu_size = sizeof(ncpu);
146 
147 	trace_handle_t trace = NULL;
148 
149 	if (geteuid() == 0) {
150 		trace = begin_collect_trace_fmt(COLLECT_TRACE_FLAG_DISABLE_SYSCALLS,
151 		    argc, argv, "test_setitimer");
152 	}
153 
154 	struct sched_param self_param = {.sched_priority = 47};
155 
156 	rv = pthread_setschedparam(pthread_self(), SCHED_FIFO, &self_param);
157 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_setschedparam");
158 
159 	kr = mach_timebase_info(&timebase_info);
160 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_timebase_info");
161 
162 	long ticks = sysconf(_SC_CLK_TCK);
163 	T_LOG("sysconf(_SC_CLK_TCK) = %ld\n", ticks);
164 
165 	is_rosetta = processIsTranslated();
166 
167 	rv = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
168 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sysctlbyname(hw.ncpu)");
169 
170 	if (ncpu < 2) {
171 		T_SKIP("%d CPUs not supported for test, returning success", ncpu);
172 	}
173 
174 	nworkers = MIN(max_threads - 1, ncpu);
175 	nthreads = nworkers + 1;
176 
177 	T_LOG("rosetta = %d\n", is_rosetta);
178 	T_LOG("hw.ncpu = %d\n", ncpu);
179 	T_LOG("nworkers = %d\n", nworkers);
180 	T_LOG("nthreads = %d\n", nthreads);
181 
182 	kr = semaphore_create(mach_task_self(), &gMainWaitForWorkers, SYNC_POLICY_FIFO, 0);
183 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
184 
185 	kr = semaphore_create(mach_task_self(), &gWorkersStart, SYNC_POLICY_FIFO, 0);
186 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create()");
187 
188 	pthread_attr_t attr;
189 
190 	rv = pthread_attr_init(&attr);
191 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_init");
192 
193 	struct sched_param child_param = {.sched_priority = 37};
194 
195 	rv = pthread_attr_setschedparam(&attr, &child_param);
196 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_set_qos_class_np");
197 
198 	for (uint32_t i = 0; i < nthreads; i++) {
199 		if (i == 0) {
200 			gThreadList[i].thread = pthread_self();
201 		} else {
202 			rv = pthread_create(&gThreadList[i].thread, &attr,
203 			    i % 2 ? stat_thread : statfs_thread,
204 			    (void *)(uintptr_t)i);
205 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_create");
206 			gThreadList[i].expect_cpu_usage = i % 2 == 0 ? true : false;
207 		}
208 
209 		rv = pthread_threadid_np(gThreadList[i].thread, &gThreadList[i].tid);
210 		T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_threadid_np");
211 
212 		gThreadList[i].stack_addr = pthread_get_stackaddr_np(gThreadList[i].thread);
213 		gThreadList[i].stack_size = pthread_get_stacksize_np(gThreadList[i].thread);
214 	}
215 
216 	rv = pthread_attr_destroy(&attr);
217 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_attr_destroy");
218 
219 	for (uint32_t i = 1; i < nthreads; i++) {
220 		kr = semaphore_wait(gMainWaitForWorkers);
221 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait()");
222 	}
223 
224 	for (uint32_t i = 0; i < nthreads; i++) {
225 		T_LOG("Thread %p (0x%llx) checked in, stack %p/%p\n",
226 		    (void*)gThreadList[i].thread,
227 		    gThreadList[i].tid,
228 		    gThreadList[i].stack_addr,
229 		    (void *)gThreadList[i].stack_size);
230 	}
231 
232 	check_recommended_core_mask(NULL);
233 
234 	wait_for_quiescence_default(argc, argv);
235 
236 	T_SETUPEND;
237 
238 	T_LOG("Finished wait_for_quiescence_default, starting test\n");
239 
240 	sigset_t sigmk;
241 	sigemptyset(&sigmk);
242 
243 	struct sigaction sigact = {
244 		.sa_sigaction = alrm_handler,
245 		.sa_mask = sigmk,
246 		.sa_flags = SA_SIGINFO,
247 	};
248 
249 	rv = sigaction(SIGALRM, &sigact, NULL);
250 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGALRM)");
251 
252 	rv = sigaction(SIGVTALRM, &sigact, NULL);
253 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGVTALRM)");
254 
255 	rv = sigaction(SIGPROF, &sigact, NULL);
256 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGPROF)");
257 
258 	rv = sigaction(SIGXCPU, &sigact, NULL);
259 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "sigaction(SIGXCPU)");
260 
261 	struct itimerval itime = {
262 		.it_interval.tv_sec = 0,
263 		.it_interval.tv_usec = 10000,
264 		.it_value.tv_sec = 0,
265 		.it_value.tv_usec = 10,  /* immediately */
266 	};
267 
268 	rv = setitimer(ITIMER_REAL, &itime, NULL);
269 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
270 
271 	rv = setitimer(ITIMER_VIRTUAL, &itime, NULL);
272 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
273 
274 	rv = setitimer(ITIMER_PROF, &itime, NULL);
275 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
276 
277 	struct rlimit rlim = {};
278 
279 	rv = getrlimit(RLIMIT_CPU, &rlim);
280 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrlimit(RLIMIT_CPU)");
281 
282 	rlim.rlim_cur = 1;
283 	rv = setrlimit(RLIMIT_CPU, &rlim);
284 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)");
285 
286 	rv = pthread_mutex_lock(&gShouldExitMutex);
287 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
288 
289 	kr = semaphore_signal_all(gWorkersStart);
290 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all()");
291 
292 	struct timespec timenow = {};
293 	uint64_t time_start;
294 
295 	clock_t st_time, en_time;
296 	struct tms st_cpu, en_cpu;
297 	st_time = times(&st_cpu);
298 
299 	struct rusage ru_start, ru_end;
300 	rv = getrusage(RUSAGE_SELF, &ru_start);
301 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrusage(RUSAGE_SELF, &ru_start)");
302 
303 	kr = mach_get_times(&time_start, NULL, &timenow);
304 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "mach_get_times()");
305 
306 	struct timespec timeout = {
307 		.tv_sec = timenow.tv_sec + 10,
308 		.tv_nsec = timenow.tv_nsec,
309 	};
310 
311 	uint64_t time_end = 0;
312 
313 	do {
314 		assert(os_atomic_load(&gShouldExit, relaxed) == false);
315 
316 		rv = pthread_cond_timedwait(&gShouldExitCondition, &gShouldExitMutex, &timeout);
317 		if (rv == ETIMEDOUT) {
318 			os_atomic_store(&gShouldExit, true, relaxed);
319 
320 			time_end = mach_absolute_time();
321 
322 			struct itimerval itime_stop = {
323 				.it_interval.tv_sec = 0,
324 				.it_interval.tv_usec = 0,
325 				.it_value.tv_sec = 0,
326 				.it_value.tv_usec = 0,  /* stop immediately */
327 			};
328 
329 			rv = setitimer(ITIMER_REAL, &itime_stop, NULL);
330 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_REAL)");
331 
332 			rv = setitimer(ITIMER_VIRTUAL, &itime_stop, NULL);
333 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_VIRTUAL)");
334 
335 			rv = setitimer(ITIMER_PROF, &itime_stop, NULL);
336 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setitimer(ITIMER_PROF)");
337 
338 			rlim.rlim_cur = RLIM_INFINITY;
339 			rv = setrlimit(RLIMIT_CPU, &rlim);
340 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "setrlimit(RLIMIT_CPU)");
341 
342 			en_time = times(&en_cpu);
343 			rv = getrusage(RUSAGE_SELF, &ru_end);
344 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "getrusage(RUSAGE_SELF, &ru_end)");
345 
346 			break;
347 		} else {
348 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_cond_timedwait(&gShouldExitCondition, ...)");
349 		}
350 	} while (true);
351 
352 	rv = pthread_mutex_unlock(&gShouldExitMutex);
353 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
354 
355 	for (uint32_t i = 1; i < nthreads; i++) {
356 		rv = pthread_join(gThreadList[i].thread, NULL);
357 		T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_join");
358 	}
359 
360 	fill_thread_stats(0);
361 
362 	struct rusage_info_v6 ru = {};
363 	rv = proc_pid_rusage(getpid(), RUSAGE_INFO_V6, (rusage_info_t *)&ru);
364 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "proc_pid_rusage");
365 
366 	uint64_t test_duration = time_end - time_start;
367 	uint64_t test_duration_ns = abs_to_nanos(test_duration);
368 
369 	double elapsed_secs = (double) test_duration_ns / (uint64_t)NSEC_PER_SEC;
370 
371 	T_LOG("test duration %3.3f seconds (%lld)\n", elapsed_secs, test_duration_ns);
372 
373 	uintmax_t real_ticks = en_time - st_time;
374 	uintmax_t user_ticks = en_cpu.tms_utime - st_cpu.tms_utime;
375 	uintmax_t system_ticks = en_cpu.tms_stime - st_cpu.tms_stime;
376 
377 	T_LOG("times(): Real Time: %jd (%3.3f seconds), User Time %jd (%3.3f seconds), System Time %jd (%3.3f seconds)\n",
378 	    real_ticks, (double)real_ticks / (double)ticks,
379 	    user_ticks, (double)user_ticks / (double)ticks,
380 	    system_ticks, (double)system_ticks / (double)ticks);
381 
382 	struct timeval ru_udelta, ru_sdelta;
383 
384 	timersub(&ru_end.ru_utime, &ru_start.ru_utime, &ru_udelta);
385 	timersub(&ru_end.ru_stime, &ru_start.ru_stime, &ru_sdelta);
386 
387 	T_LOG("rusage(): User Time %ld.%06d, System Time %ld.%06d\n",
388 	    ru_udelta.tv_sec, ru_udelta.tv_usec,
389 	    ru_sdelta.tv_sec, ru_sdelta.tv_usec);
390 
391 	uint64_t total_user_time_ns = abs_to_nanos_host(ru.ri_user_time);
392 	double total_user_time_s = (double)total_user_time_ns / (uint64_t)NSEC_PER_SEC;
393 
394 	uint64_t total_system_time_ns = abs_to_nanos_host(ru.ri_system_time);
395 	double total_system_time_s = (double)total_system_time_ns / (uint64_t)NSEC_PER_SEC;
396 
397 	uint64_t total_time_ns = (total_user_time_ns + total_system_time_ns);
398 	double total_time_s = (double)total_time_ns / (uint64_t)NSEC_PER_SEC;
399 
400 	uint64_t total_runnable_time_ns = abs_to_nanos_host(ru.ri_runnable_time);
401 	double total_runnable_time_s = (double)total_runnable_time_ns / (uint64_t)NSEC_PER_SEC;
402 
403 	uint64_t total_pending_time_ns = total_runnable_time_ns - (total_time_ns);
404 	double total_pending_time_s = (double)total_pending_time_ns / (uint64_t)NSEC_PER_SEC;
405 
406 	uint64_t total_p_time_ns = abs_to_nanos_host(ru.ri_user_ptime + ru.ri_system_ptime);
407 	double total_p_time_s = (double)total_p_time_ns / (uint64_t)NSEC_PER_SEC;
408 
409 	T_LOG("total usage: time: %3.3f user: %3.3f kernel: %3.3f runnable: %3.3f pending: %3.3f pcore: %3.3f\n",
410 	    total_time_s, total_user_time_s, total_system_time_s,
411 	    total_runnable_time_s, total_pending_time_s,
412 	    total_p_time_s);
413 
414 	/*
415 	 * "Good" data looks like:
416 	 *
417 	 * total usage: time: 77.696 user: 16.570 kernel: 61.126 runnable: 79.951 pending: 2.255 pcore: 72.719
418 	 * Thread        ALRM VTALRM   PROF   XCPU      inst        cycle               user                  kernel          offcore  type
419 	 * 0x16f78f000      0    251    811      0  27680301973  28913501188   3706622958 (  38.14%)   6012631083 (  61.86%)    2.81%  statfs
420 	 * 0x16f81b000      0      2    889      0  27962710058  28780576123    439297291 (   4.53%)   9259942583 (  95.47%)    3.01%  stat
421 	 * 0x16f8a7000      0    251    836      0  27558331077  28889228535   3699010000 (  38.08%)   6016015083 (  61.92%)    2.85%  statfs
422 	 * 0x16f933000      0      0    939      0  28078084696  28880195679    443067500 (   4.56%)   9269807666 (  95.44%)    2.87%  stat
423 	 * 0x16f9bf000      0    283    874      0  27691851016  28969873070   3710916750 (  38.16%)   6012783541 (  61.84%)    2.76%  statfs
424 	 * 0x16fa4b000      0      2    908      1  27945063330  28769971396    438583000 (   4.53%)   9252694291 (  95.47%)    3.09%  stat
425 	 * 0x16fad7000      0    262    889      0  27328496429  28772748055   3689245375 (  38.03%)   6011061458 (  61.97%)    3.00%  statfs
426 	 * 0x16fb63000      0      0    914      0  27942195343  28757254100    439690166 (   4.53%)   9256659500 (  95.47%)    3.04%  stat
427 	 * 0x1fe2bb400   1001      0      3      0     72144372    102339334      3532125 (   9.35%)     34249208 (  90.65%)   99.62%  main
428 	 */
429 	uint32_t total_alrm = 0;
430 	uint32_t total_vtalrm = 0;
431 	uint32_t total_prof = 0;
432 	uint32_t total_xcpu = 0;
433 	uint32_t total_vtalrm_in_cpubound = 0;
434 
435 	uint32_t total_threads_not_finding_cpus = 0;
436 
437 	T_LOG("Thread         ALRM VTALRM   PROF   XCPU      "
438 	    "inst        cycle               user                  kernel          "
439 	    "offcore type\n");
440 
441 	for (uint32_t i = 0; i < nthreads; i++) {
442 		uint64_t user_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_user_time_mach);
443 		uint64_t system_time = abs_to_nanos_host(gThreadList[i].self_stats.ttci_system_time_mach);
444 
445 
446 		uint64_t total_time = user_time + system_time;
447 
448 		double percentage_user = (double)user_time / (double) total_time * 100;
449 		double percentage_system = (double)system_time / (double) total_time * 100;
450 
451 		uint64_t not_running_time = test_duration_ns - total_time;
452 		/*
453 		 * The worker threads spend a little bit of time waking up before the test
454 		 * start time is captured, so they can occasionally have a larger total_time
455 		 * than the test duration, leading to underflow in not_running_time and
456 		 * test failures.
457 		 * rdar://106147865
458 		 */
459 		if (total_time > test_duration_ns) {
460 			not_running_time = 0;
461 		}
462 		double percentage_not_running = (double)(not_running_time) / (double) test_duration_ns * 100;
463 
464 		char* thread_type_str = "";
465 		char* warning_str = "";
466 
467 		if (i == 0) {
468 			thread_type_str = "main ";
469 		} else {
470 			thread_type_str = i % 2 ? "stat   " : "statfs ";
471 
472 			if (percentage_not_running > offcore_time_percent_threshold) {
473 				total_threads_not_finding_cpus++;
474 				warning_str = "** too much offcore time **";
475 			}
476 		}
477 
478 		T_LOG("0x%010llx %6d %6d %6d %6d %12lld %12lld %12lld (%7.2f%%) %12lld (%7.2f%%) %7.2f%% %s%s\n",
479 		    gThreadList[i].tid,
480 		    gThreadList[i].alrm_count,
481 		    gThreadList[i].vtalrm_count,
482 		    gThreadList[i].prof_count,
483 		    gThreadList[i].xcpu_count,
484 		    gThreadList[i].self_stats.ttci_instructions,
485 		    gThreadList[i].self_stats.ttci_cycles,
486 		    user_time, percentage_user,
487 		    system_time, percentage_system,
488 		    percentage_not_running,
489 		    thread_type_str, warning_str);
490 
491 		total_alrm += gThreadList[i].alrm_count;
492 		total_vtalrm += gThreadList[i].vtalrm_count;
493 		total_prof += gThreadList[i].prof_count;
494 		total_xcpu += gThreadList[i].xcpu_count;
495 
496 		if (gThreadList[i].expect_cpu_usage) {
497 			total_vtalrm_in_cpubound += gThreadList[i].vtalrm_count;
498 		}
499 	}
500 
501 	/*
502 	 * We expect all SIGALRM to go to the main thread, because it is the
503 	 * first thread in the process with the signal unmasked, and we
504 	 * never expect the signal handler itself to take >10ms
505 	 *
506 	 * This can happen if the main thread is preempted for the entire 10ms duration, though.
507 	 * Being high priority, it shouldn't be delayed for more than 10ms too often.
508 	 * Allow up to 10% to deliver to other threads.
509 	 */
510 	if ((double)gThreadList[0].alrm_count * 100 / total_alrm < 90.0) {
511 		T_FAIL("SIGALRM delivered to non-main thread more than 10%% of the time (%d of %d)",
512 		    gThreadList[0].alrm_count,
513 		    total_alrm);
514 	}
515 
516 	/* We expect all worker threads to find CPUs of their own for most of the test */
517 	if (total_threads_not_finding_cpus != 0) {
518 		T_FAIL("%d worker threads spent more than %2.0f%% of time off-core",
519 		    total_threads_not_finding_cpus, offcore_time_percent_threshold);
520 	}
521 
522 	/*
523 	 * SIGVTALRM is delivered based on user time, and we expect the busy
524 	 * threads to have an advantage and account for 80% (non-scientific) of events,
525 	 * since the other threads will spend more time in kernel mode.
526 	 */
527 	if (total_vtalrm_in_cpubound * 100 / total_vtalrm < 80) {
528 		T_FAIL("SIGVTALRM delivered to threads without extra userspace spin (only %d of %d)",
529 		    total_vtalrm_in_cpubound, total_vtalrm);
530 	}
531 
532 	/*
533 	 * SIGPROF is delivered based on user+system time, and we expect it to be distributed
534 	 * among non-blocked threads (so not the main thread, which only handles SIGALRM).
535 	 */
536 	if (gThreadList[0].prof_count * 100 / total_prof > 1) {
537 		T_FAIL("SIGPROF delivered to main thread more than 1%% (%d of %d)",
538 		    gThreadList[0].prof_count,
539 		    total_prof);
540 	}
541 
542 	/*
543 	 * SIGXCPU should be delivered at least once.
544 	 */
545 	if (total_xcpu == 0) {
546 		T_FAIL("SIGXCPU delivered %d times (expected at least once)", total_xcpu);
547 	}
548 
549 	if (trace) {
550 		end_collect_trace(trace);
551 	}
552 }
553 
554 static void *
stat_thread(void * arg)555 stat_thread(void *arg)
556 {
557 	kern_return_t kr;
558 	int rv;
559 
560 	/* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
561 	kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
562 	if (kr != KERN_ABORTED) {
563 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
564 	}
565 
566 	rv = pthread_mutex_lock(&gShouldExitMutex);
567 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
568 	rv = pthread_mutex_unlock(&gShouldExitMutex);
569 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
570 
571 	do {
572 		struct stat sb;
573 
574 		rv = stat("/", &sb);
575 		if (rv != 0) {
576 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "stat");
577 		}
578 	} while (os_atomic_load(&gShouldExit, relaxed) == false);
579 
580 	fill_thread_stats((uint32_t)(uintptr_t)arg);
581 
582 	return NULL;
583 }
584 
585 static void *
statfs_thread(void * arg)586 statfs_thread(void *arg)
587 {
588 	kern_return_t kr;
589 	uint64_t previous_spin_timestamp;
590 	int iteration = 0;
591 	int rv;
592 
593 	/* This wait can be aborted by one of the signals, so we make sure to wait for the first iteration of main */
594 	kr = semaphore_wait_signal(gWorkersStart, gMainWaitForWorkers);
595 	if (kr != KERN_ABORTED) {
596 		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal()");
597 	}
598 
599 	rv = pthread_mutex_lock(&gShouldExitMutex);
600 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_lock(&gShouldExitMutex)");
601 	rv = pthread_mutex_unlock(&gShouldExitMutex);
602 	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "pthread_mutex_unlock(&gShouldExitMutex)");
603 
604 	previous_spin_timestamp = mach_absolute_time();
605 
606 	do {
607 		struct statfs sf;
608 
609 		/*
610 		 * Every so many system calls, inject a spin in userspace
611 		 * proportional to how much time was spent performing the
612 		 * system calls.
613 		 */
614 #define SYSCALL_ITERATIONS_BETWEEN_SPINS (10000)
615 		if (++iteration % SYSCALL_ITERATIONS_BETWEEN_SPINS == 0) {
616 			uint64_t now = mach_absolute_time();
617 			uint64_t spin_deadline = now + (now - previous_spin_timestamp) / 2;
618 
619 			while (mach_absolute_time() < spin_deadline) {
620 				;
621 			}
622 
623 			previous_spin_timestamp = mach_absolute_time();
624 		}
625 
626 		rv = statfs("/", &sf);
627 		if (rv != 0) {
628 			T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "statfs");
629 		}
630 	} while (os_atomic_load(&gShouldExit, relaxed) == false);
631 
632 	fill_thread_stats((uint32_t)(uintptr_t)arg);
633 
634 	return NULL;
635 }
636 
637 static void
alrm_handler(int signum,struct __siginfo * info __unused,void * uap)638 alrm_handler(int signum, struct __siginfo *info __unused, void *uap)
639 {
640 	ucontext_t *context = (ucontext_t *)uap;
641 	struct threadentry *entry = NULL;
642 	void *sp;
643 
644 #if defined(__arm64__)
645 	sp = (void *)__darwin_arm_thread_state64_get_sp((context->uc_mcontext)->__ss);
646 #elif defined(__i386__)
647 	sp = (void *)(context->uc_mcontext)->__ss.__esp;
648 #elif defined(__x86_64__)
649 	sp = (void *)(context->uc_mcontext)->__ss.__rsp;
650 #else
651 #error Unrecognized architecture
652 #endif
653 
654 	for (uint32_t i = 0; i < nworkers + 1; i++) {
655 		struct threadentry *t = &gThreadList[i];
656 		if (((uintptr_t)sp >= ((uintptr_t)t->stack_addr - t->stack_size) &&
657 		    ((uintptr_t)sp < (uintptr_t)t->stack_addr))) {
658 			entry = t;
659 			break;
660 		}
661 	}
662 
663 	if (entry == NULL) {
664 		T_ASSERT_FAIL("Signal %d delivered to unknown thread, SP=%p", signum, sp);
665 	}
666 
667 	switch (signum) {
668 	case SIGALRM:
669 		os_atomic_inc(&entry->alrm_count, relaxed);
670 		break;
671 	case SIGVTALRM:
672 		os_atomic_inc(&entry->vtalrm_count, relaxed);
673 		break;
674 	case SIGPROF:
675 		os_atomic_inc(&entry->prof_count, relaxed);
676 		break;
677 	case SIGXCPU:
678 		os_atomic_inc(&entry->xcpu_count, relaxed);
679 		break;
680 	}
681 }
682 
// When the SIGPROF signal was received.
static uint64_t sigprof_received_ns = 0;

// Amount of process CPU time the ITIMER_PROF timer is armed for.
static const uint64_t ITIMER_PROF_SECS = 2;
#define PROF_EXTRA_THREAD_COUNT (1)
// Include the main thread as participating in consuming CPU.
// With (PROF_EXTRA_THREAD_COUNT + 1) threads burning CPU, process CPU time
// accrues that many times faster than wall time, so SIGPROF should arrive
// after roughly this many wall-clock seconds.
static const uint64_t EXPECTED_PROF_DURATION_SECS =
    ITIMER_PROF_SECS / (PROF_EXTRA_THREAD_COUNT + 1);
// Grace period past the expected delivery before declaring a timeout.
static const uint64_t EXTRA_TIMEOUT_SECS = 1;

// NOTE(review): this struct is not referenced anywhere in this file —
// cpu_thread_main takes a bare bool * instead. Candidate for removal.
struct cpu_spin_args {
	bool spin_while_true;
};
696 
/*
 * Thread body that burns CPU until the bool pointed to by arg goes false.
 *
 * The flag is written by the main thread without synchronization, so read
 * it through a volatile-qualified pointer: otherwise the compiler may
 * legally hoist the load out of the (otherwise empty) loop and spin
 * forever even after the flag is cleared.
 */
static void *
cpu_thread_main(void *arg)
{
	volatile bool *spin = arg;

	while (*spin) {
		/* burn CPU */
	}
	return NULL;
}
705 
/* SIGPROF handler: record the wall-clock (monotonic) time of delivery. */
static void
sigprof_received(int __unused sig)
{
	sigprof_received_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
}
711 
/*
 * With a single runnable thread, ITIMER_PROF can accumulate CPU time at
 * (at most) wall-clock rate, so a 2s profiling timer must not fire before
 * roughly 2s of wall time has elapsed. Spin on the CPU and verify that
 * SIGPROF arrives neither early nor later than the timeout.
 */
T_DECL(setitimer_prof,
    "ensure a single-threaded process doesn't receive early signals",
    T_META_TAG_VM_PREFERRED)
{
	T_SETUPBEGIN;
	(void)signal(SIGPROF, sigprof_received);

	uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
	uint64_t expected_end_ns = start_ns + (ITIMER_PROF_SECS * NSEC_PER_SEC);
	uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);

	/* One-shot timer: no it_interval, fires once after ITIMER_PROF_SECS of CPU time. */
	struct itimerval prof_timer = {
		.it_value.tv_sec = ITIMER_PROF_SECS,
	};

	int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
	T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
	    ITIMER_PROF_SECS);
	T_SETUPEND;

	/* Busy-wait (accumulating CPU time) until the handler fires or we time out. */
	uint64_t last_ns = 0;
	while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
		if (sigprof_received_ns) {
			break;
		}
	}

	T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
	    (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
	T_LOG("SIGPROF was delivered %+.6gms after expected duration",
	    (double)(last_ns - expected_end_ns) / 1e6);
	T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
	    (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
}
746 
747 T_DECL(setitimer_prof_multi_threaded,
748     "ensure a multi-threaded process doesn't receive early signals",
749     T_META_TAG_VM_PREFERRED)
750 {
751 	T_SETUPBEGIN;
752 	(void)signal(SIGPROF, sigprof_received);
753 
754 	uint64_t start_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC);
755 	uint64_t expected_end_ns = start_ns + (EXPECTED_PROF_DURATION_SECS * NSEC_PER_SEC);
756 	uint64_t end_timeout_ns = expected_end_ns + (EXTRA_TIMEOUT_SECS * NSEC_PER_SEC);
757 
758 	struct itimerval prof_timer = {
759 		.it_value.tv_sec = ITIMER_PROF_SECS,
760 	};
761 
762 	int ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
763 	T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
764 	    ITIMER_PROF_SECS);
765 
766 	/*
767 	 * If we start the extra thread before capturing start_ns,
768 	 * then it immediately starts accumulating time.
769 	 * Additionally, setitimer does a lazy timestamp capture in task_vtimer_set,
770 	 * which won't see that previously on-core accumulated time.
771 	 * These together could cause SIGPROF to deliver slightly before 1s.
772 	 * Create the threads after the timer starts to ensure that only CPU
773 	 * work done after the timer starts contributes to SIGPROF.
774 	 *
775 	 * rdar://134811651
776 	 */
777 
778 	pthread_t cpu_threads[PROF_EXTRA_THREAD_COUNT] = { 0 };
779 	bool spin_while_true = true;
780 	for (unsigned int i = 0; i < PROF_EXTRA_THREAD_COUNT; i++) {
781 		int error = pthread_create(&cpu_threads[i], NULL, cpu_thread_main, &spin_while_true);
782 		T_QUIET; T_ASSERT_POSIX_ZERO(error, "create thread %d", i);
783 	}
784 
785 	T_LOG("spinning %d threads on CPU", PROF_EXTRA_THREAD_COUNT + 1);
786 
787 	T_SETUPEND;
788 
789 	uint64_t last_ns = 0;
790 	while ((last_ns = clock_gettime_nsec_np(CLOCK_MONOTONIC)) < end_timeout_ns) {
791 		if (sigprof_received_ns) {
792 			break;
793 		}
794 	}
795 
796 	spin_while_true = false;
797 	T_EXPECT_LE(last_ns, end_timeout_ns, "received SIGPROF within the timeout (%.9gs < %.9gs)",
798 	    (double)(last_ns - start_ns) / 1e9, (double)(end_timeout_ns - start_ns) / 1e9);
799 	T_LOG("SIGPROF was delivered %+.6gms after expected duration",
800 	    (double)(last_ns - expected_end_ns) / 1e6);
801 	T_EXPECT_GE(last_ns, expected_end_ns, "received SIGPROF after enough time (%.9gs > %.9gs)",
802 	    (double)(last_ns - start_ns) / 1e9, (double)(expected_end_ns - start_ns) / 1e9);
803 }
804 
// Thread CPU usage sampled by the handler at the moment SIGPROF arrived
// (in mach time units, per the timebase conversions in the latency test).
static uint64_t sigprof_received_usage_mach = 0;
// Private libsyscall entry point returning the current thread's CPU usage.
extern uint64_t __thread_selfusage(void);
// Number of timer arm/fire cycles to measure.
static int const ITERATION_COUNT = 50;
// CPU time each ITIMER_PROF arming waits for before firing.
static uint64_t const ITIMER_PERF_PROF_US = 50000;

// SIGPROF handler for the latency benchmark: record CPU usage at delivery.
static void
sigprof_received_usage(int __unused sig)
{
	sigprof_received_usage_mach = __thread_selfusage();
}
815 
816 T_DECL(setitimer_prof_latency,
817     "measure the latency of delivering SIGPROF",
818     T_META_TAG_PERF, T_META_TAG_VM_NOT_ELIGIBLE,
819     T_META_TIMEOUT(10))
820 {
821 	T_SETUPBEGIN;
822 	(void)signal(SIGPROF, sigprof_received_usage);
823 
824 	struct itimerval prof_timer = {
825 		.it_value.tv_usec = ITIMER_PERF_PROF_US,
826 	};
827 	mach_timebase_info_data_t timebase = { 0 };
828 	int ret = mach_timebase_info(&timebase);
829 	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "mach_timebase_info");
830 
831 	char pd_path[PATH_MAX] = "";
832 	pdwriter_t pd = pdwriter_open_tmp("xnu", "setitimer_sigprof_latency", 1, 0, pd_path, sizeof(pd_path));
833 	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(pd, "opened perfdata file");
834 	T_SETUPEND;
835 
836 	T_LOG("running %d iterations, %lluus each", ITERATION_COUNT, ITIMER_PERF_PROF_US);
837 	for (int i = 0; i < ITERATION_COUNT; i++) {
838 		sigprof_received_usage_mach = 0;
839 
840 		uint64_t const start_usage_mach = __thread_selfusage();
841 
842 		ret = setitimer(ITIMER_PROF, &prof_timer, NULL);
843 		T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "setitimer(ITIMER_PROF, %llus)",
844 		    ITIMER_PROF_SECS);
845 
846 		while (sigprof_received_usage_mach == 0) {
847 			// XXX ITIMER_PROF only re-evaluates the timers at user/kernel boundaries.
848 			// Issue a trivial syscall (getpid has a fast path without making a syscall)
849 			// to ensure the system checks the timer.
850 			(void)getppid();
851 		}
852 
853 		uint64_t const end_usage_mach = start_usage_mach +
854 		    (ITIMER_PERF_PROF_US * NSEC_PER_USEC) * timebase.denom / timebase.numer;
855 		uint64_t const latency_mach = sigprof_received_usage_mach - end_usage_mach;
856 		uint64_t const latency_ns = latency_mach * timebase.denom / timebase.numer;
857 
858 		pdwriter_new_value(pd, "sigprof_latency", pdunit_ns, latency_ns);
859 	}
860 
861 	pdwriter_close(pd);
862 	T_LOG("wrote perfdata to %s", pd_path);
863 }
864