1 #include <unistd.h>
2 #include <stdlib.h>
3 #include <pthread.h>
4 #include <spawn.h>
5 #include <string.h>
6 #include <mach/mach.h>
7 #include <mach/mach_time.h>
8 #include <TargetConditionals.h>
9 #include <sys/work_interval.h>
10 #include <sys/stat.h>
11 #include <sys/sysctl.h>
12 #include <sys/time.h>
13 #include <stdatomic.h>
14 #include <time.h>
15
16 #include <darwintest.h>
17 #include <darwintest_utils.h>
18 #include <perfdata/perfdata.h>
19
20 T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
21 T_META_RADAR_COMPONENT_NAME("xnu"),
22 T_META_RADAR_COMPONENT_VERSION("scheduler"),
23 T_META_TAG_PERF);
24
25 /* Code and logic taken from Daniel Chimene's yield-aggressor.c test (rdar://47327537) */
26
/*
 * Capacity, in bytes, of the buffer that receives the path of the perfdata
 * output file.
 *
 * This is an enumeration constant rather than a `static const` object so that
 * it is a true integer constant expression: an array sized with it is an
 * ordinary fixed-size array instead of a variable-length array.
 */
enum { MAX_PDJ_PATH_LEN = 256 };
28
/*
 * Call sched_yield() `iterations` times back to back.
 *
 * iterations: number of calls to make; zero makes no call at all.
 */
static void
sched_yield_loop(uint64_t iterations)
{
	uint64_t remaining = iterations;

	while (remaining > 0) {
		sched_yield();
		remaining--;
	}
}
36
/*
 * Call swtch() `iterations` times back to back, ignoring its result.
 *
 * iterations: number of calls to make; zero makes no call at all.
 */
static void
swtch_loop(uint64_t iterations)
{
	uint64_t remaining = iterations;

	while (remaining > 0) {
		swtch();
		remaining--;
	}
}
44
/*
 * Call swtch_pri(0) `iterations` times back to back, ignoring its result.
 *
 * iterations: number of calls to make; zero makes no call at all.
 */
static void
swtch_pri_loop(uint64_t iterations)
{
	uint64_t remaining = iterations;

	while (remaining > 0) {
		swtch_pri(0);
		remaining--;
	}
}
52
53 static void
thread_switch_loop(uint64_t iterations)54 thread_switch_loop(uint64_t iterations)
55 {
56 for (uint64_t i = 0; i < iterations; i++) {
57 thread_switch(MACH_PORT_NULL, SWITCH_OPTION_NONE, MACH_MSG_TIMEOUT_NONE);
58 }
59 }
60
61 static void
thread_switch_wait_loop(uint64_t iterations)62 thread_switch_wait_loop(uint64_t iterations)
63 {
64 for (uint64_t i = 0; i < iterations; i++) {
65 thread_switch(MACH_PORT_NULL, SWITCH_OPTION_WAIT, MACH_MSG_TIMEOUT_NONE);
66 }
67 }
68
69 static void
thread_switch_depress_loop(uint64_t iterations)70 thread_switch_depress_loop(uint64_t iterations)
71 {
72 for (uint64_t i = 0; i < iterations; i++) {
73 thread_switch(MACH_PORT_NULL, SWITCH_OPTION_DEPRESS, MACH_MSG_TIMEOUT_NONE);
74 }
75 }
76
/*
 * The yield primitives exercised by this test.
 *
 * The values are used directly as indices into name_table and fn_table, so
 * they must remain dense and start at zero.
 */
typedef enum yield_type {
	SCHED_YIELD           = 0,
	SWTCH                 = 1,
	SWTCH_PRI             = 2,
	THREAD_SWITCH         = 3,
	THREAD_SWITCH_WAIT    = 4,
	THREAD_SWITCH_DEPRESS = 5
} yield_type_t;

/*
 * Number of yield_type_t values, derived from the last enumerator so the two
 * cannot drift apart.
 *
 * This is an enumeration constant rather than a `static const int` because it
 * is used as the bound of file-scope arrays that carry initializers, which
 * requires an integer constant expression in ISO C.
 */
enum { NUM_YIELD_TYPES = THREAD_SWITCH_DEPRESS + 1 };
87
88 static char* name_table[NUM_YIELD_TYPES] = {
89 [SCHED_YIELD] = "sched_yield",
90 [SWTCH] = "swtch",
91 [SWTCH_PRI] = "swtch_pri",
92 [THREAD_SWITCH] = "thread_switch(none)",
93 [THREAD_SWITCH_WAIT] = "thread_switch(wait)",
94 [THREAD_SWITCH_DEPRESS] = "thread_switch(depress)",
95 };
96
97 static void (*fn_table[NUM_YIELD_TYPES])(uint64_t) = {
98 [SCHED_YIELD] = sched_yield_loop,
99 [SWTCH] = swtch_loop,
100 [SWTCH_PRI] = swtch_pri_loop,
101 [THREAD_SWITCH] = thread_switch_loop,
102 [THREAD_SWITCH_WAIT] = thread_switch_wait_loop,
103 [THREAD_SWITCH_DEPRESS] = thread_switch_depress_loop,
104 };
105
106 static semaphore_t ready_sem, go_sem;
107 static unsigned int num_iterations, num_threads;
108 static _Atomic unsigned int done_threads;
109 static yield_type_t curr_yield_type;
110
111 static void *
thread_fn(__unused void * arg)112 thread_fn(__unused void *arg)
113 {
114 kern_return_t kr;
115
116 kr = semaphore_wait_signal(go_sem, ready_sem);
117 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
118
119 fn_table[curr_yield_type](num_iterations);
120
121 if (atomic_fetch_add(&done_threads, 1) == num_threads - 1) {
122 kr = semaphore_wait_signal(go_sem, ready_sem);
123 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait_signal");
124 } else {
125 kr = semaphore_wait(go_sem);
126 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
127 }
128 return NULL;
129 }
130
131 static void
start_threads(pthread_t * threads,void * (* start_routine)(void *),int priority)132 start_threads(pthread_t *threads, void *(*start_routine)(void *), int priority)
133 {
134 int rv;
135 pthread_attr_t attr;
136
137 rv = pthread_attr_init(&attr);
138 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_init");
139
140 for (unsigned int i = 0; i < num_threads; i++) {
141 struct sched_param param = { .sched_priority = (int)priority };
142
143 rv = pthread_attr_setschedparam(&attr, ¶m);
144 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setschedparam");
145
146 rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
147 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setdetachstate");
148
149 rv = pthread_create(&threads[i], &attr, start_routine, NULL);
150 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_create");
151 }
152
153 rv = pthread_attr_destroy(&attr);
154 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_destroy");
155 }
156
157 struct cpu_time {
158 natural_t sys;
159 natural_t user;
160 natural_t idle;
161 };
162
163 static void
record_cpu_time(struct cpu_time * cpu_time)164 record_cpu_time(struct cpu_time *cpu_time)
165 {
166 host_cpu_load_info_data_t load;
167 kern_return_t kr;
168 mach_msg_type_number_t count = HOST_CPU_LOAD_INFO_COUNT;
169
170 kr = host_statistics(mach_host_self(), HOST_CPU_LOAD_INFO, (int *)&load, &count);
171 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "host_statistics");
172
173 cpu_time->sys = load.cpu_ticks[CPU_STATE_SYSTEM];
174 cpu_time->user = load.cpu_ticks[CPU_STATE_USER] + load.cpu_ticks[CPU_STATE_NICE];
175 cpu_time->idle = load.cpu_ticks[CPU_STATE_IDLE];
176 }
177
178 static void
write_independent_variables(pdwriter_t writer)179 write_independent_variables(pdwriter_t writer)
180 {
181 pdwriter_record_variable_str(writer, "yield_variant", name_table[curr_yield_type]);
182 pdwriter_record_variable_dbl(writer, "num_iterations", num_iterations);
183 pdwriter_record_variable_dbl(writer, "num_threads", num_threads);
184 }
185
186 static const double MS_PER_CPU_TICK = 10.0;
187
188 static void
write_time_values(pdwriter_t writer,struct cpu_time * delta_times,uint64_t elapsed_usecs,double idle_ratio)189 write_time_values(pdwriter_t writer, struct cpu_time *delta_times, uint64_t elapsed_usecs, double idle_ratio)
190 {
191 pdwriter_new_value(writer, "system_time", pdunit_milliseconds_cpu, delta_times->sys * MS_PER_CPU_TICK);
192 write_independent_variables(writer);
193
194 pdwriter_new_value(writer, "user_time", pdunit_milliseconds_cpu, delta_times->user * MS_PER_CPU_TICK);
195 write_independent_variables(writer);
196
197 pdwriter_new_value(writer, "idle_time", pdunit_milliseconds_cpu, delta_times->idle * MS_PER_CPU_TICK);
198 write_independent_variables(writer);
199
200 pdwriter_new_value(writer, "wall_clock_time", pdunit_microseconds, elapsed_usecs);
201 write_independent_variables(writer);
202
203 /* Main metric of note, with a threshold in perfmeta to guard against regression */
204 pdwriter_new_value(writer, "idle_time_ratio", pdunit_percent_cpus, idle_ratio);
205 write_independent_variables(writer);
206 }
207
208 static void
run_yielding_test(yield_type_t yield_type,unsigned int num_iters,unsigned int thread_count,int thread_pri,pdwriter_t writer)209 run_yielding_test(yield_type_t yield_type, unsigned int num_iters, unsigned int thread_count,
210 int thread_pri, pdwriter_t writer)
211 {
212 T_SETUPBEGIN;
213
214 T_LOG("===== Yield Variety: %s", name_table[yield_type]);
215
216 kern_return_t kr;
217
218 num_iterations = num_iters;
219 num_threads = thread_count;
220 curr_yield_type = yield_type;
221 done_threads = 0;
222
223 kr = semaphore_create(mach_task_self(), &ready_sem, SYNC_POLICY_FIFO, 0);
224 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
225 kr = semaphore_create(mach_task_self(), &go_sem, SYNC_POLICY_FIFO, 0);
226 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_create");
227
228 pthread_t threads[num_threads];
229 start_threads(threads, &thread_fn, thread_pri);
230
231 for (uint32_t i = 0; i < num_threads; i++) {
232 kr = semaphore_wait(ready_sem);
233 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
234 }
235
236 /* Wait 100ms for the system to settle down */
237 usleep(100000);
238
239 T_SETUPEND;
240
241 struct cpu_time start_times, finish_times, delta_times;
242 uint64_t before_nsec, after_nsec;
243
244 record_cpu_time(&start_times);
245 before_nsec = clock_gettime_nsec_np(CLOCK_REALTIME);
246
247 /* Signal threads to begin yielding "work" */
248 kr = semaphore_signal_all(go_sem);
249 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_signal_all");
250
251 kr = semaphore_wait(ready_sem);
252 T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "semaphore_wait");
253
254 /* Capture cpu stats after yielding "work" has finished */
255 after_nsec = clock_gettime_nsec_np(CLOCK_REALTIME);
256 record_cpu_time(&finish_times);
257
258 uint64_t elapsed_usecs = (after_nsec - before_nsec) / 1000;
259 T_LOG("All %u threads finished yielding %u times each", num_threads, num_iterations);
260 T_LOG("Elapsed Runtime: %f seconds", ((double) elapsed_usecs) / USEC_PER_SEC);
261
262 delta_times.sys = finish_times.sys - start_times.sys;
263 delta_times.user = finish_times.user - start_times.user;
264 delta_times.idle = finish_times.idle - start_times.idle;
265 T_LOG("System CPU ticks: %d", delta_times.sys);
266 T_LOG("User CPU ticks: %d", delta_times.user);
267 T_LOG("Idle CPU ticks: %d", delta_times.idle);
268
269 natural_t total_ticks = delta_times.sys + delta_times.user + delta_times.idle;
270 T_QUIET; T_ASSERT_GT(total_ticks, 0, "CPU load stats failed to update, likely due to host_statistics() rate limit");
271
272 double cpu_idle_ratio = delta_times.idle * 1.0 / total_ticks;
273 T_LOG("*** Ratio of Idle CPU time: %f\n\n", cpu_idle_ratio);
274
275 write_time_values(writer, &delta_times, elapsed_usecs, cpu_idle_ratio);
276 }
277
/* Size of, and storage for, the string read from the kern.bootargs sysctl. */
#define KERNEL_BOOTARGS_MAX_SIZE 1024
static char kernel_bootargs[KERNEL_BOOTARGS_MAX_SIZE];

/* Scheduling priority requested for every yielding thread. */
static const int DEFAULT_THREAD_PRI = 31;

/* Number of yield calls each thread performs for every variant. */
static const int DEFAULT_NUM_ITERS = 100000;
283
284 T_DECL(yield_aggressor,
285 "Ensure that CPUs do not go idle when there are many threads all yielding "
286 "in a loop (for different varieties of yield)",
287 /* Required to get around the rate limit for host_statistics() */
288 T_META_BOOTARGS_SET("amfi_get_out_of_my_way=1"),
289 T_META_ASROOT(true))
290 {
291 /* Warn if amfi_get_out_of_my_way is not set and fail later on if we actually run into the rate limit */
292 size_t kernel_bootargs_size = sizeof(kernel_bootargs);
293 int rv = sysctlbyname("kern.bootargs", kernel_bootargs, &kernel_bootargs_size, NULL, 0);
294 T_QUIET; T_ASSERT_POSIX_SUCCESS(rv, "kern.bootargs");
295 if (strstr(kernel_bootargs, "amfi_get_out_of_my_way=1") == NULL) {
296 T_LOG("WARNING: amfi_get_out_of_my_way=1 boot-arg is missing, required to reliably capture CPU load data");
297 }
298
299 char pdj_path[MAX_PDJ_PATH_LEN];
300 pdwriter_t writer = pdwriter_open_tmp("xnu", "scheduler.yield_aggressor", 0, 0, pdj_path, MAX_PDJ_PATH_LEN);
301 T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(writer, "pdwriter_open_tmp");
302
303 /*
304 * Thread count is NCPU * 3 in order to ensure that there are enough yielding threads
305 * to keep all of the cores busy context-switching between them. NCPU * 1 threads would
306 * not be sufficient to guarantee this, because a core temporarily keeps two threads
307 * off of the run-queues at a time while performing a context-switch (rather than only
308 * the one thread it is running during normal execution). Lastly, we choose NCPU * 3
309 * rather than NCPU * 2 because doing so empirically reduces the variance of values
310 * betweens runs.
311 */
312 unsigned int thread_count = (unsigned int) dt_ncpu() * 3;
313
314 for (yield_type_t yield_type = SCHED_YIELD; yield_type <= THREAD_SWITCH_DEPRESS; yield_type++) {
315 run_yielding_test(yield_type, DEFAULT_NUM_ITERS, thread_count, DEFAULT_THREAD_PRI, writer);
316 }
317
318 T_LOG("Perfdata file written to: %s", pdj_path);
319 pdwriter_close(writer);
320
321 T_END;
322 }
323