1 // Copyright (c) 2023 Apple Inc. All rights reserved.
2
3 #include <unistd.h>
4 #include <stdlib.h>
5 #include <stdio.h>
6 #include <pthread.h>
7 #include <string.h>
8 #include <mach/mach.h>
9 #include <mach/mach_time.h>
10 #include <sys/stat.h>
11 #include <sys/sysctl.h>
12 #include <stdatomic.h>
13 #include <sys/work_interval.h>
14 #include <ktrace.h>
15 #include <sys/kdebug.h>
16
17 #include <darwintest.h>
18 #include <darwintest_utils.h>
19 #include "test_utils.h"
20
/*
 * darwintest metadata: report all tests in this file under the
 * xnu.scheduler namespace and the "xnu"/"scheduler" Radar component.
 */
T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
    T_META_RADAR_COMPONENT_NAME("xnu"),
    T_META_RADAR_COMPONENT_VERSION("scheduler"));

/* Host timebase ratio; (re)queried by nanos_to_abs()/abs_to_nanos() below */
static mach_timebase_info_data_t timebase_info;
26
/*
 * Convert a duration in nanoseconds into Mach absolute-time units,
 * per the host timebase ratio (refreshed on every call).
 */
static uint64_t
nanos_to_abs(uint64_t nanos)
{
	mach_timebase_info(&timebase_info);
	uint64_t abs_units = (nanos * timebase_info.denom) / timebase_info.numer;
	return abs_units;
}
33
/*
 * Convert a duration in Mach absolute-time units into nanoseconds,
 * per the host timebase ratio (refreshed on every call).
 */
static uint64_t
abs_to_nanos(uint64_t abs)
{
	mach_timebase_info(&timebase_info);
	uint64_t nanos = (abs * timebase_info.numer) / timebase_info.denom;
	return nanos;
}
40
/* Test parameters; assigned concrete values at the top of the T_DECL below */
static int BACKGROUND_PRI;                    /* sched priority for the spinner threads */
static int NUM_THREADS;                       /* number of spinner threads (one TG each) */
static uint64_t SLEEP_SECONDS;                /* overall test duration */
static uint64_t INTERRUPT_DISABLE_TIMEOUT_NS; /* interrupt-masked threshold under test */

/* mach_absolute_time() sampled just before thread creation; spinner deadlines are relative to this */
static uint64_t start_timestamp = 0ULL;
static volatile int sum = 0; // Keeps the spin-loop below from compiling-out
48
/*
 * Thread routine: create and join a unique thread group (via a GROUP work
 * interval), then stay runnable long enough for the scheduler's maintenance
 * scan to see all of the thread groups at once.
 *
 * Timeline, relative to start_timestamp (set by the main thread):
 *   [0, 1s):                 yield so sibling threads can get on-core
 *   [1s, SLEEP_SECONDS - 2): spin, keeping this TG runnable
 *   then tear down and exit with ~2s of margin (see comment at the end).
 */
static void *
make_tg_and_spin(__unused void *arg)
{
	int ret;
	/*
	 * The spin deadline below computes SLEEP_SECONDS - 2; use the file's
	 * darwintest assertion style (<assert.h> is not included here) to fail
	 * loudly rather than underflow.
	 */
	T_QUIET; T_ASSERT_GT(SLEEP_SECONDS, 2ULL, "SLEEP_SECONDS must exceed 2");

	/* Create and join a new thread group (TG) */
	work_interval_t wi_handle;
	ret = work_interval_create(&wi_handle, WORK_INTERVAL_FLAG_GROUP);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_create");

	/* Allow other threads a chance to get on-core and create/join their own TGs */
	uint64_t yield_deadline = start_timestamp + nanos_to_abs(1 * NSEC_PER_SEC);
	while (mach_absolute_time() < yield_deadline) {
		sched_yield();
	}

	/*
	 * Remain runnable long enough for the sched_maintenance_thread to scan the
	 * many created TGs all at the same time in one scheduler tick.
	 */
	uint64_t spin_deadline = start_timestamp + nanos_to_abs((SLEEP_SECONDS - 2) * NSEC_PER_SEC);
	while (mach_absolute_time() < spin_deadline) {
		sum++;
	}

	/*
	 * Terminate with about a second to spare of SLEEP_SECONDS, so that we have
	 * time to bring down the number of runnable thread groups before the test
	 * case reenables the previous kern.interrupt_masked_debug_mode value.
	 * Otherwise, a system failing this test could panic.
	 * Release the work interval so the TG reference does not leak.
	 */
	ret = work_interval_destroy(wi_handle);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_destroy");
	return NULL;
}
83
84 static void
start_threads(pthread_t * threads,void * (* start_routine)(void *),int priority,int num_threads)85 start_threads(pthread_t *threads, void *(*start_routine)(void *), int priority, int num_threads)
86 {
87 int rv;
88 pthread_attr_t attr;
89
90 rv = pthread_attr_init(&attr);
91 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_init");
92
93 for (int i = 0; i < num_threads; i++) {
94 struct sched_param param = { .sched_priority = priority };
95
96 rv = pthread_attr_setschedparam(&attr, ¶m);
97 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setschedparam");
98
99 rv = pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
100 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setdetachstate");
101
102 /* Make the thread stacks smaller, so pthread will let us make more */
103 rv = pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
104 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_setstacksize");
105
106 rv = pthread_create(&threads[i], &attr, start_routine, NULL);
107 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_create");
108 }
109
110 rv = pthread_attr_destroy(&attr);
111 T_QUIET; T_ASSERT_POSIX_ZERO(rv, "pthread_attr_destroy");
112 }
113
/* Previous kern.sched_preemption_disable_debug_mode value, saved in the T_DECL */
static uint64_t old_preemption_debug_mode = 0;
static size_t old_preemption_debug_mode_size = sizeof(old_preemption_debug_mode);

/*
 * T_ATEND handler: write the saved preemption-disable debug mode back
 * through sysctl so the system leaves the test in its original state.
 */
static void
restore_preemption_disable_debug_mode(void)
{
	int rc;

	rc = sysctlbyname("kern.sched_preemption_disable_debug_mode", NULL, NULL,
	    &old_preemption_debug_mode, old_preemption_debug_mode_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.sched_preemption_disable_debug_mode");
	T_LOG("kern.sched_preemption_disable_debug_mode restored to previous value: %llu", old_preemption_debug_mode);
}
125
/* Previous kern.interrupt_masked_debug_mode value, saved in the T_DECL */
static uint64_t old_interrupt_debug_mode = 0;
static size_t old_interrupt_debug_mode_size = sizeof(old_interrupt_debug_mode);

/*
 * T_ATEND handler: write the saved interrupt-masked debug mode back
 * through sysctl so the system leaves the test in its original state.
 */
static void
restore_interrupt_disable_debug_mode(void)
{
	int rc;

	rc = sysctlbyname("kern.interrupt_masked_debug_mode", NULL, NULL,
	    &old_interrupt_debug_mode, old_interrupt_debug_mode_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.interrupt_masked_debug_mode");
	T_LOG("kern.interrupt_masked_debug_mode restored to previous value: %llu", old_interrupt_debug_mode);
}
137
/* Previous kern.interrupt_masked_threshold_mt value, saved in the T_DECL */
static uint64_t old_interrupt_disable_timeout = 0;
static size_t old_interrupt_disable_timeout_size = sizeof(old_interrupt_disable_timeout);

/*
 * T_ATEND handler: write the saved interrupt-masked threshold back
 * through sysctl so the system leaves the test in its original state.
 */
static void
restore_interrupt_disable_timeout(void)
{
	int rc;

	rc = sysctlbyname("kern.interrupt_masked_threshold_mt", NULL, NULL,
	    &old_interrupt_disable_timeout, old_interrupt_disable_timeout_size);
	T_QUIET; T_ASSERT_POSIX_ZERO(rc, "kern.interrupt_masked_threshold_mt");
	T_LOG("kern.interrupt_masked_threshold_mt restored to previous value: %llu", old_interrupt_disable_timeout);
}
149
150 static const char *ktrace_file_short_name = "overload_runqueue_with_thread_groups.ktrace";
151
/*
 * Compress the recorded ktrace file with tar and register the resulting
 * archive with darwintest for collection, so it is uploaded with the test
 * results on failure.
 *
 * trace_path: path of the recorded .ktrace file; not removed or freed here.
 */
static void
save_collected_ktrace(char *trace_path)
{
	int ret;

	T_LOG("ktrace file saved at \"%s\"", trace_path);
	/* World-accessible so collection tooling can read it regardless of uid */
	ret = chmod(trace_path, 0777);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "chmod");

	char compressed_path[MAXPATHLEN];
	snprintf(compressed_path, MAXPATHLEN, "%s.tar.gz", ktrace_file_short_name);
	/* NOTE(review): dt_resultfile appears to rewrite compressed_path in-place
	 * to the final collection location -- confirm against darwintest_utils */
	ret = dt_resultfile(compressed_path, sizeof(compressed_path));
	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_resultfile marking \"%s\" for collection", compressed_path);
	T_LOG("\"%s\" marked for upload", compressed_path);

	/* Run tar synchronously, discarding stdout and logging stderr lines */
	char *tar_args[] = {"/usr/bin/tar", "-czvf", compressed_path, trace_path, NULL};
	pid_t tar_pid = dt_launch_tool_pipe(tar_args, false, NULL,
	    ^bool (__unused char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
		return true;
	},
	    ^bool (char *data, __unused size_t data_size, __unused dt_pipe_data_handler_context_t *context) {
		T_LOG("[tar] Error msg: %s", data);
		return true;
	},
	    BUFFER_PATTERN_LINE, NULL);

	T_QUIET; T_ASSERT_TRUE(tar_pid, "[tar] pid %d", tar_pid);
	ret = dt_waitpid(tar_pid, NULL, NULL, 0);
	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "dt_waitpid");

	/* Make the archive itself world-accessible as well */
	ret = chmod(compressed_path, 0777);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "chmod");
}
185
186 /*
187 * Parse the recorded ktrace file to see if we crossed the set interrupt-disabled
188 * timeout and thus failed the test.
189 */
190 static void
search_for_interrupt_disable_timeout_tracepoint(char * trace_path)191 search_for_interrupt_disable_timeout_tracepoint(char *trace_path)
192 {
193 __block int ret;
194 ktrace_session_t read_session = ktrace_session_create();
195 ret = ktrace_set_file(read_session, trace_path);
196 T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_set_file");
197 __block const char *offending_thread = NULL;
198
199 ktrace_events_single(read_session, MACHDBG_CODE(DBG_MACH_SCHED, MACH_INT_MASKED_EXPIRED), ^(ktrace_event_t e) {
200 if (offending_thread == NULL) {
201 T_LOG("Interrupts were held disabled for %llu ns, crossing the %llu ns threshold:", abs_to_nanos(e->arg1), INTERRUPT_DISABLE_TIMEOUT_NS);
202 ret = ktrace_print_trace_point(stdout, read_session, e, KTP_KIND_CSV,
203 KTP_FLAG_WALLTIME | KTP_FLAG_THREADNAME | KTP_FLAG_PID | KTP_FLAG_EVENTNAME | KTP_FLAG_EXECNAME);
204 T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "ktrace_print_trace_point output");
205 printf("\n"); // Flush output from ktrace_print_trace_point
206 offending_thread = ktrace_get_name_for_thread(read_session, e->threadid);
207 ktrace_end(read_session, 0);
208 }
209 });
210
211 ktrace_set_completion_handler(read_session, ^{
212 if (offending_thread == NULL) {
213 T_PASS("Scheduler survived %d simulatenously runnable thread groups without disabling interrupts for more than %llu ns!", NUM_THREADS, INTERRUPT_DISABLE_TIMEOUT_NS);
214 } else {
215 save_collected_ktrace(trace_path);
216 T_FAIL("Interrupts held disabled for more than %llu ns by thread \"%s\"", INTERRUPT_DISABLE_TIMEOUT_NS, offending_thread);
217 }
218 T_END;
219 });
220
221 ret = ktrace_start(read_session, dispatch_get_main_queue());
222 T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start");
223 }
224
T_DECL(overload_runqueue_with_thread_groups,
    "Overload the runqueue with distinct thread groups to verify that the scheduler "
    "does not trip an interrupts-disabled timeout whenever it scans the runqueue",
    T_META_ASROOT(true), XNU_T_META_SOC_SPECIFIC, T_META_ENABLED(TARGET_OS_IOS))
{
	BACKGROUND_PRI = 4;
	NUM_THREADS = 1000;
	SLEEP_SECONDS = 20;
	/* Matches DEFAULT_INTERRUPT_MASKED_TIMEOUT value in XNU */
	INTERRUPT_DISABLE_TIMEOUT_NS = 500 * NSEC_PER_USEC; // 500 microseconds

	__block int ret;

	/* Configure interrupts-disabled timeout to drop a ktracepoint */
	uint64_t emit_tracepoint_mode = 1;
	ret = sysctlbyname("kern.interrupt_masked_debug_mode",
	    &old_interrupt_debug_mode, &old_interrupt_debug_mode_size,
	    &emit_tracepoint_mode, sizeof(emit_tracepoint_mode));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "kern.interrupt_masked_debug_mode");
	T_ATEND(restore_interrupt_disable_debug_mode);
	/* Configure the preemption-disabled debug mode as well, to avoid panicking if the test fails */
	ret = sysctlbyname("kern.sched_preemption_disable_debug_mode",
	    &old_preemption_debug_mode, &old_preemption_debug_mode_size,
	    &emit_tracepoint_mode, sizeof(emit_tracepoint_mode));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "kern.sched_preemption_disable_debug_mode");
	T_ATEND(restore_preemption_disable_debug_mode);

	/* Set interrupts-disabled timeout threshold */
	uint64_t disable_timeout = nanos_to_abs(INTERRUPT_DISABLE_TIMEOUT_NS);
	ret = sysctlbyname("kern.interrupt_masked_threshold_mt",
	    &old_interrupt_disable_timeout, &old_interrupt_disable_timeout_size,
	    &disable_timeout, sizeof(disable_timeout));
	T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_ZERO(ret, "kern.interrupt_masked_threshold_mt");
	T_ATEND(restore_interrupt_disable_timeout);

	/* Use ktrace to observe if the interrupt-disable timeout drops a tracepoint */
	ktrace_session_t session = ktrace_session_create();
	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(session, "ktrace_session_create");
	char filepath_arr[MAXPATHLEN] = "";
	const char *tmp_dir = dt_tmpdir();
	strlcpy(filepath_arr, tmp_dir, sizeof(filepath_arr));
	strlcat(filepath_arr, "/", sizeof(filepath_arr));
	strlcat(filepath_arr, ktrace_file_short_name, sizeof(filepath_arr));
	/* Clear out any stale trace file from a previous run */
	ret = remove(filepath_arr);
	T_QUIET; T_WITH_ERRNO; T_ASSERT_TRUE((ret == 0) || (errno == ENOENT), "remove");
	char *filepath = filepath_arr;
	ret = ktrace_events_filter(session, "C0x01", ^(__unused ktrace_event_t event){}); // records scheduler tracepoints
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_events_filter");
	ktrace_set_completion_handler(session, ^{
		search_for_interrupt_disable_timeout_tracepoint(filepath);
	});
	ret = ktrace_start_writing_path(session, filepath, 0);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "ktrace_start_writing_path");

	/* Spin up lots of threads, each creating and joining its own thread group */
	T_LOG("Creating %d threads at pri %d, each with a unique thread group", NUM_THREADS, BACKGROUND_PRI);
	start_timestamp = mach_absolute_time();
	pthread_t *bg_threads = malloc(sizeof(pthread_t) * (size_t)NUM_THREADS);
	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(bg_threads, "malloc bg_threads");
	start_threads(bg_threads, make_tg_and_spin, BACKGROUND_PRI, NUM_THREADS);

	T_LOG("Waiting %llu seconds to see if the scheduler can handle it...", SLEEP_SECONDS);
	dispatch_after(dispatch_time(DISPATCH_TIME_NOW, (int64_t)(SLEEP_SECONDS * NSEC_PER_SEC)), dispatch_get_main_queue(), ^{
		ktrace_end(session, 0);
	});
	/* Never returns; T_END is reached via the ktrace completion handler */
	dispatch_main();
}
291