xref: /xnu-11215.41.3/tests/sched/yield_aggressor.c (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1 #include <unistd.h>
2 #include <stdlib.h>
3 #include <pthread.h>
4 #include <spawn.h>
5 #include <string.h>
6 #include <mach/mach.h>
7 #include <mach/mach_time.h>
8 #include <TargetConditionals.h>
9 #include <sys/work_interval.h>
10 #include <sys/stat.h>
11 #include <sys/sysctl.h>
12 #include <sys/time.h>
13 #include <stdatomic.h>
14 #include <time.h>
15 
16 #include <darwintest.h>
17 #include <darwintest_utils.h>
18 #include <perfdata/perfdata.h>
19 #include "sched_test_utils.h"
20 
/* Darwintest metadata applied to every test in this file (perf test, xnu scheduler component) */
T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
    T_META_RADAR_COMPONENT_NAME("xnu"),
    T_META_RADAR_COMPONENT_VERSION("scheduler"),
    T_META_TAG_PERF,
    T_META_TAG_VM_NOT_ELIGIBLE);
26 
27 /* Code and logic taken from Daniel Chimene's yield-aggressor.c test (rdar://47327537) */
28 
/* Maximum length of the perfdata (.pdj) output file path buffer */
static const size_t MAX_PDJ_PATH_LEN = 256;
30 
/* Call the POSIX sched_yield() primitive `iterations` times in a tight loop */
static void
sched_yield_loop(uint64_t iterations)
{
	uint64_t remaining = iterations;

	while (remaining > 0) {
		sched_yield();
		remaining--;
	}
}
38 
/* Call the Mach swtch() trap `iterations` times in a tight loop */
static void
swtch_loop(uint64_t iterations)
{
	for (uint64_t count = iterations; count > 0; count--) {
		swtch();
	}
}
46 
/* Call the Mach swtch_pri() trap (priority 0) `iterations` times in a tight loop */
static void
swtch_pri_loop(uint64_t iterations)
{
	uint64_t done = 0;

	while (done < iterations) {
		swtch_pri(0);
		done++;
	}
}
54 
55 static void
thread_switch_loop(uint64_t iterations)56 thread_switch_loop(uint64_t iterations)
57 {
58 	for (uint64_t i = 0; i < iterations; i++) {
59 		thread_switch(MACH_PORT_NULL, SWITCH_OPTION_NONE, MACH_MSG_TIMEOUT_NONE);
60 	}
61 }
62 
63 static void
thread_switch_wait_loop(uint64_t iterations)64 thread_switch_wait_loop(uint64_t iterations)
65 {
66 	for (uint64_t i = 0; i < iterations; i++) {
67 		thread_switch(MACH_PORT_NULL, SWITCH_OPTION_WAIT, MACH_MSG_TIMEOUT_NONE);
68 	}
69 }
70 
71 static void
thread_switch_depress_loop(uint64_t iterations)72 thread_switch_depress_loop(uint64_t iterations)
73 {
74 	for (uint64_t i = 0; i < iterations; i++) {
75 		thread_switch(MACH_PORT_NULL, SWITCH_OPTION_DEPRESS, MACH_MSG_TIMEOUT_NONE);
76 	}
77 }
78 
/*
 * The flavor of yield primitive exercised by one test run.  Values are
 * dense and start at 0 so they double as indices into name_table and
 * fn_table below.
 */
typedef enum yield_type {
	SCHED_YIELD = 0,
	SWTCH = 1,
	SWTCH_PRI = 2,
	THREAD_SWITCH = 3,
	THREAD_SWITCH_WAIT = 4,
	THREAD_SWITCH_DEPRESS = 5
} yield_type_t;

/* Number of yield_type_t values; must stay in sync with the enum above */
static const int NUM_YIELD_TYPES = 6;
89 
/* Human-readable name for each yield variant, indexed by yield_type_t */
static char* name_table[NUM_YIELD_TYPES] = {
	[SCHED_YIELD]           = "sched_yield",
	[SWTCH]                 = "swtch",
	[SWTCH_PRI]             = "swtch_pri",
	[THREAD_SWITCH]         = "thread_switch(none)",
	[THREAD_SWITCH_WAIT]    = "thread_switch(wait)",
	[THREAD_SWITCH_DEPRESS] = "thread_switch(depress)",
};
98 
/* Yield-loop function for each variant, indexed by yield_type_t */
static void (*fn_table[NUM_YIELD_TYPES])(uint64_t) = {
	[SCHED_YIELD]           = sched_yield_loop,
	[SWTCH]                 = swtch_loop,
	[SWTCH_PRI]             = swtch_pri_loop,
	[THREAD_SWITCH]         = thread_switch_loop,
	[THREAD_SWITCH_WAIT]    = thread_switch_wait_loop,
	[THREAD_SWITCH_DEPRESS] = thread_switch_depress_loop,
};
107 
/*
 * Rendezvous state shared between run_yielding_test() (the main thread)
 * and the worker threads.  ready_sem: workers -> main ("I'm ready" /
 * "all done"); go_sem: main -> workers ("start yielding").
 */
static semaphore_t ready_sem, go_sem;
/* Per-run parameters, published by run_yielding_test() before workers start */
static unsigned int num_iterations, num_threads;
/* Count of workers that have completed their yield loop this run */
static _Atomic unsigned int done_threads;
/* Which yield variant the workers should run this round */
static yield_type_t curr_yield_type;
112 
/*
 * Worker thread body.  Atomically signals ready_sem (telling the main
 * thread this worker is up) while blocking on go_sem; once the main thread
 * releases everyone via semaphore_signal_all(go_sem), runs the selected
 * yield loop.  The last worker to finish (previous done_threads value ==
 * num_threads - 1) wakes the main thread through ready_sem; all workers
 * then block on go_sem.
 *
 * NOTE(review): no final go_sem signal is ever sent, so workers appear to
 * stay parked here for the remainder of the process lifetime -- presumably
 * intentional, since they are created detached and the process exits after
 * the test; confirm before reusing this pattern elsewhere.
 */
static void *
thread_fn(__unused void *arg)
{
	kern_return_t kr;

	/* Signal readiness and wait for the start gun in one atomic operation */
	kr = semaphore_wait_signal(go_sem, ready_sem);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");

	fn_table[curr_yield_type](num_iterations);

	if (atomic_fetch_add(&done_threads, 1) == num_threads - 1) {
		/* Last worker done: wake the main thread, then park */
		kr = semaphore_wait_signal(go_sem, ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
	} else {
		kr = semaphore_wait(go_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
	}
	return NULL;
}
132 
133 static void
start_threads(pthread_t * threads,void * (* start_routine)(void *),int priority)134 start_threads(pthread_t *threads, void *(*start_routine)(void *), int priority)
135 {
136 	int rv;
137 	pthread_attr_t attr;
138 
139 	rv = pthread_attr_init(&attr);
140 	T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_init");
141 
142 	for (unsigned int i = 0; i < num_threads; i++) {
143 		struct sched_param param = { .sched_priority = (int)priority };
144 
145 		rv = pthread_attr_setschedparam(&attr, &param);
146 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setschedparam");
147 
148 		rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
149 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setdetachstate");
150 
151 		rv = pthread_create(&threads[i], &attr, start_routine, NULL);
152 		T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_create");
153 	}
154 
155 	rv = pthread_attr_destroy(&attr);
156 	T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_destroy");
157 }
158 
/* Snapshot of host-wide CPU tick counters from host_statistics() */
struct cpu_time {
	natural_t sys;   /* CPU_STATE_SYSTEM ticks */
	natural_t user;  /* CPU_STATE_USER + CPU_STATE_NICE ticks */
	natural_t idle;  /* CPU_STATE_IDLE ticks */
};
164 
165 static void
record_cpu_time(struct cpu_time * cpu_time)166 record_cpu_time(struct cpu_time *cpu_time)
167 {
168 	host_cpu_load_info_data_t load;
169 	kern_return_t kr;
170 	mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT;
171 
172 	kr = host_statistics(mach_host_self(), HOST_CPU_LOAD_INFO, (int *)&load, &count);
173 	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_statistics");
174 
175 	cpu_time->sys = load.cpu_ticks[CPU_STATE_SYSTEM];
176 	cpu_time->user = load.cpu_ticks[CPU_STATE_USER] + load.cpu_ticks[CPU_STATE_NICE];
177 	cpu_time->idle = load.cpu_ticks[CPU_STATE_IDLE];
178 }
179 
/*
 * Record the independent variables (yield variant, iteration count, thread
 * count) against the most recently opened perfdata value, so each metric
 * can be sliced by test configuration.
 */
static void
write_independent_variables(pdwriter_t writer)
{
	pdwriter_record_variable_str(writer, "yield_variant", name_table[curr_yield_type]);
	pdwriter_record_variable_dbl(writer, "num_iterations", num_iterations);
	pdwriter_record_variable_dbl(writer, "num_threads", num_threads);
}
187 
/* Milliseconds per host_statistics() CPU tick -- assumes a 100Hz tick rate; TODO confirm against the platform clock rate */
static const double MS_PER_CPU_TICK = 10.0;
189 
/*
 * Emit all measured metrics for one test run into the perfdata writer:
 * system/user/idle CPU time (converted from ticks to ms), wall-clock time,
 * and the idle-time ratio.  Each value is tagged with the independent
 * variables immediately after being recorded, as the perfdata API requires
 * variables to follow the value they describe.
 */
static void
write_time_values(pdwriter_t writer, struct cpu_time *delta_times, uint64_t elapsed_usecs, double idle_ratio)
{
	pdwriter_new_value(writer, "system_time", pdunit_milliseconds_cpu, delta_times->sys * MS_PER_CPU_TICK);
	write_independent_variables(writer);

	pdwriter_new_value(writer, "user_time", pdunit_milliseconds_cpu, delta_times->user * MS_PER_CPU_TICK);
	write_independent_variables(writer);

	pdwriter_new_value(writer, "idle_time", pdunit_milliseconds_cpu, delta_times->idle * MS_PER_CPU_TICK);
	write_independent_variables(writer);

	pdwriter_new_value(writer, "wall_clock_time", pdunit_microseconds, elapsed_usecs);
	write_independent_variables(writer);

	/* Main metric of note, with a threshold in perfmeta to guard against regression */
	pdwriter_new_value(writer, "idle_time_ratio", pdunit_percent_cpus, idle_ratio);
	write_independent_variables(writer);
}
209 
/*
 * Run one measurement round: spin up thread_count workers at thread_pri,
 * have them each call the chosen yield primitive num_iters times, and
 * record the CPU time split (system/user/idle) plus wall-clock time into
 * the perfdata writer.  A high idle ratio would indicate cores going idle
 * despite runnable yielding threads -- the regression this test guards
 * against.
 *
 * NOTE(review): the two semaphores created here are never destroyed and
 * the detached workers remain parked on go_sem after the run (see
 * thread_fn).  Destroying the semaphores would wake those waiters with
 * KERN_TERMINATED and trip their asserts, so the leak appears deliberate;
 * each of the 6 rounds therefore creates a fresh semaphore pair.
 */
static void
run_yielding_test(yield_type_t yield_type, unsigned int num_iters, unsigned int thread_count,
    int thread_pri, pdwriter_t writer)
{
	T_SETUPBEGIN;

	T_LOG("===== Yield Variety: %s", name_table[yield_type]);

	wait_for_quiescence_default();

	kern_return_t kr;

	/* Publish this round's parameters for thread_fn before starting workers */
	num_iterations = num_iters;
	num_threads = thread_count;
	curr_yield_type = yield_type;
	done_threads = 0;

	kr = semaphore_create(mach_task_self(), &ready_sem, SYNC_POLICY_FIFO, 0);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
	kr = semaphore_create(mach_task_self(), &go_sem, SYNC_POLICY_FIFO, 0);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");

	pthread_t threads[num_threads];
	start_threads(threads, &thread_fn, thread_pri);

	/* Wait until every worker has signaled ready_sem and is blocked on go_sem */
	for (uint32_t i = 0; i < num_threads; i++) {
		kr = semaphore_wait(ready_sem);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
	}

	T_SETUPEND;

	struct cpu_time start_times, finish_times, delta_times;
	uint64_t before_nsec, after_nsec;

	record_cpu_time(&start_times);
	before_nsec = clock_gettime_nsec_np(CLOCK_REALTIME);

	/* Signal threads to begin yielding "work" */
	kr = semaphore_signal_all(go_sem);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");

	/* The last worker to finish signals ready_sem (see thread_fn) */
	kr = semaphore_wait(ready_sem);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");

	/* Capture cpu stats after yielding "work" has finished */
	after_nsec = clock_gettime_nsec_np(CLOCK_REALTIME);
	record_cpu_time(&finish_times);

	uint64_t elapsed_usecs = (after_nsec - before_nsec) / 1000;
	T_LOG("All %u threads finished yielding %u times each", num_threads, num_iterations);
	T_LOG("Elapsed Runtime: %f seconds", ((double) elapsed_usecs) / USEC_PER_SEC);

	delta_times.sys = finish_times.sys - start_times.sys;
	delta_times.user = finish_times.user - start_times.user;
	delta_times.idle = finish_times.idle - start_times.idle;
	T_LOG("System CPU ticks: %d", delta_times.sys);
	T_LOG("User CPU ticks: %d", delta_times.user);
	T_LOG("Idle CPU ticks: %d", delta_times.idle);

	/* host_statistics() is rate-limited without amfi_get_out_of_my_way=1; a zero delta means we hit the limit */
	natural_t total_ticks = delta_times.sys + delta_times.user + delta_times.idle;
	T_QUIET; T_ASSERT_GT(total_ticks, 0, "CPU load stats failed to update, likely due to host_statistics() rate limit");

	double cpu_idle_ratio = delta_times.idle * 1.0 / total_ticks;
	T_LOG("*** Ratio of Idle CPU time: %f\n\n", cpu_idle_ratio);

	write_time_values(writer, &delta_times, elapsed_usecs, cpu_idle_ratio);
}
278 
/* Default fixed priority for worker threads (priority 31 per start_threads) */
static const int DEFAULT_THREAD_PRI = 31;
/* Default number of yield calls each worker performs per round */
static const int DEFAULT_NUM_ITERS = 100000;

/* Buffer for the kern.bootargs sysctl string, checked for amfi_get_out_of_my_way=1 */
#define KERNEL_BOOTARGS_MAX_SIZE 1024
static char kernel_bootargs[KERNEL_BOOTARGS_MAX_SIZE];
284 
/*
 * Top-level test: for every yield variant, run the yielding workload and
 * record CPU/idle metrics to a perfdata file.  Guards against regressions
 * where CPUs go idle while many runnable threads are yielding in a loop.
 */
T_DECL(yield_aggressor,
    "Ensure that CPUs do not go idle when there are many threads all yielding "
    "in a loop (for different varieties of yield)",
    /* Required to get around the rate limit for host_statistics() */
    T_META_BOOTARGS_SET("amfi_get_out_of_my_way=1"),
    T_META_ASROOT(true))
{
	/* Warn if amfi_get_out_of_my_way is not set and fail later on if we actually run into the rate limit */
	size_t kernel_bootargs_size = sizeof(kernel_bootargs);
	int rv = sysctlbyname("kern.bootargs", kernel_bootargs, &kernel_bootargs_size, NULL, 0);
	T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.bootargs");
	if (strstr(kernel_bootargs, "amfi_get_out_of_my_way=1") == NULL) {
		T_LOG("WARNING: amfi_get_out_of_my_way=1 boot-arg is missing, required to reliably capture CPU load data");
	}

	char pdj_path[MAX_PDJ_PATH_LEN];
	pdwriter_t writer = pdwriter_open_tmp("xnu", "scheduler.yield_aggressor", 0, 0, pdj_path, MAX_PDJ_PATH_LEN);
	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(writer, "pdwriter_open_tmp");

	/*
	 * Thread count is NCPU * 3 in order to ensure that there are enough yielding threads
	 * to keep all of the cores busy context-switching between them. NCPU * 1 threads would
	 * not be sufficient to guarantee this, because a core temporarily keeps two threads
	 * off of the run-queues at a time while performing a context-switch (rather than only
	 * the one thread it is running during normal execution). Lastly, we choose NCPU * 3
	 * rather than NCPU * 2 because doing so empirically reduces the variance of values
	 * betweens runs.
	 */
	unsigned int thread_count = (unsigned int) dt_ncpu() * 3;

	/* Exercise every yield variant in enum order (indices 0..5) */
	for (yield_type_t yield_type = SCHED_YIELD; yield_type <= THREAD_SWITCH_DEPRESS; yield_type++) {
		run_yielding_test(yield_type, DEFAULT_NUM_ITERS, thread_count, DEFAULT_THREAD_PRI, writer);
	}

	T_LOG("Perfdata file written to: %s", pdj_path);
	pdwriter_close(writer);

	T_END;
}
324