xref: /xnu-12377.81.4/tests/sched/thread_group_fairness.c (revision 043036a2b3718f7f0be807e2870f8f47d3fa0796)
#include <unistd.h>
#include <stdlib.h>
#include <errno.h>
#include <pthread.h>
#include <spawn.h>
#include <mach/mach.h>
#include <mach/mach_time.h>
#include <TargetConditionals.h>
#include <sys/work_interval.h>
#include <sys/stat.h>
#include <sys/sysctl.h>
#include <os/atomic_private.h>

#include <darwintest.h>
#include <darwintest_utils.h>
#include <perfdata/perfdata.h>
#include "test_utils.h"
#include "sched_test_utils.h"

#include "thread_group_fairness_workload_config.h"

T_GLOBAL_META(T_META_NAMESPACE("xnu.scheduler"),
    T_META_RADAR_COMPONENT_NAME("xnu"),
    T_META_RADAR_COMPONENT_VERSION("scheduler"),
    T_META_TAG_PERF,
    T_META_TAG_VM_NOT_ELIGIBLE);

static const size_t MAX_PDJ_PATH_LEN = 256;
static unsigned int num_cores;

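/*
 * Install the test's workload configuration by passing the embedded plist
 * as the new value of the kern.workload_config sysctl. The sysctl only
 * exists on development/debug kernels, so ENOENT is treated as a skip
 * rather than a failure.
 */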
static void
workload_config_load(void)
{
	int ret;
	size_t len = 0;
	ret = sysctlbyname("kern.workload_config", NULL, &len,
	    sched_thread_group_fairness_workload_config_plist,
	    sched_thread_group_fairness_workload_config_plist_len);
	if (ret == -1 && errno == ENOENT) {
		T_SKIP("kern.workload_config failed");
	}
	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "kern.workload_config");
}

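/* Undo workload_config_load() by writing an empty configuration back to kern.workload_config */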
static void
workload_config_cleanup(void)
{
	size_t len = 0;
	sysctlbyname("kern.workload_config", NULL, &len, "", 1);
}

static void
environment_init(void)
{
	num_cores = (unsigned int) dt_ncpu();

	if (platform_is_amp()) {
		/*
		 * Derecommend all clusters except the E cores, to ensure that thread groups
		 * compete over the same cores irrespective of CLPC's cluster recommendations
		 */
		char *clpcctrl_args[] = {"-C", "e", NULL};
		execute_clpcctrl(clpcctrl_args, false);
	}

	/*
	 * Load a test workload plist containing a Workload ID with
	 * WorkloadClass == DISCRETIONARY, in order to mark the thread group
	 * for that workload as THREAD_GROUP_FLAGS_EFFICIENT
	 */
	T_ATEND(workload_config_cleanup);
	workload_config_load();
}

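/*
 * Tag the work interval with a workload ID. The ID name corresponds to an
 * entry in the workload configuration loaded above, so that the backing
 * thread group inherits that entry's WorkloadClass (DISCRETIONARY here).
 */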
static void
set_work_interval_id(work_interval_t *handle, uint32_t work_interval_flags)
{
	int ret;
	mach_port_t port = MACH_PORT_NULL;

	ret = work_interval_copy_port(*handle, &port);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_copy_port");

	struct work_interval_workload_id_params wlid_params = {
		.wlidp_flags = WORK_INTERVAL_WORKLOAD_ID_HAS_ID,
		.wlidp_wicreate_flags = work_interval_flags,
		.wlidp_name = (uintptr_t)"com.test.myapp.discretionary",
	};

	ret = __work_interval_ctl(WORK_INTERVAL_OPERATION_SET_WORKLOAD_ID, port, &wlid_params, sizeof(wlid_params));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "WORK_INTERVAL_OPERATION_SET_WORKLOAD_ID");
}

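/*
 * Create a joinable, grouped work interval (so threads that join it share a
 * thread group) and, when requested, tag it with the discretionary workload ID.
 */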
static uint32_t
make_work_interval(work_interval_t *handle, uint32_t work_type_flags)
{
	int ret;
	uint32_t work_interval_flags = WORK_INTERVAL_FLAG_JOINABLE | WORK_INTERVAL_FLAG_GROUP | work_type_flags;
	ret = work_interval_create(handle, work_interval_flags);
	T_QUIET; T_ASSERT_POSIX_SUCCESS(ret, "work_interval_create");

	if (work_type_flags & WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID) {
		set_work_interval_id(handle, work_interval_flags);
	}
	return work_interval_flags;
}

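/* Per-thread arguments handed to spin_thread_fn */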
struct thread_data {
	work_interval_t *handle;
	uint32_t work_interval_flags;
};

static void *
spin_thread_fn(void *arg)
{
	struct thread_data *info = (struct thread_data *)arg;
	int ret;

	/* Join the thread group associated with the work interval handle */
	ret = work_interval_join(*(info->handle));
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_join");

	/* Spin indefinitely */
	volatile uint64_t spin_count = 0;
	while (mach_absolute_time() < UINT64_MAX) {
		spin_count++;
	}
	return NULL;
}

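/* Spawn one spinner thread per core, all joining the given work interval's thread group */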
static void
start_threads(pthread_t *threads, struct thread_data *thread_datas, work_interval_t *handle, uint32_t work_interval_flags)
{
	int ret;
	for (unsigned int i = 0; i < num_cores; i++) {
		thread_datas[i].handle = handle;
		thread_datas[i].work_interval_flags = work_interval_flags;
		ret = pthread_create(&threads[i], NULL, spin_thread_fn, &thread_datas[i]);
		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_create");
	}
}

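/*
 * Sum the user-mode CPU time, in microseconds, consumed so far by the
 * given spinner threads (via THREAD_BASIC_INFO).
 */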
static uint64_t
snapshot_user_time_usec(pthread_t *threads)
{
	kern_return_t kr;
	uint64_t cumulative_user_time_usec = 0;
	mach_msg_type_number_t count = THREAD_BASIC_INFO_COUNT;
	for (unsigned int i = 0; i < num_cores; i++) {
		mach_port_t thread_port = pthread_mach_thread_np(threads[i]);
		thread_basic_info_data_t info;
		kr = thread_info(thread_port, THREAD_BASIC_INFO, (thread_info_t)&info, &count);
		T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "thread_info");
		uint64_t thread_usr_usec = (uint64_t) (info.user_time.seconds) * USEC_PER_SEC + (uint64_t) info.user_time.microseconds;
		cumulative_user_time_usec += thread_usr_usec;
	}
	return cumulative_user_time_usec;
}

T_DECL(thread_group_fairness,
    "Ensure that thread groups tagged as higher priority do not starve out "
    "thread groups tagged as lower priority when both behave as CPU spinners",
    XNU_T_META_REQUIRES_DEVELOPMENT_KERNEL,     /* needed to set workload config */
    T_META_ASROOT(YES))
{
	T_SETUPBEGIN;

	wait_for_quiescence_default(argc, argv);
	environment_init();

	/*
	 * Create two work intervals with corresponding thread groups that would
	 * be associated with differing priorities.
	 */
	work_interval_t lower_pri_handle, higher_pri_handle;
	uint32_t lower_pri_flags = make_work_interval(&lower_pri_handle, WORK_INTERVAL_TYPE_DEFAULT | WORK_INTERVAL_FLAG_HAS_WORKLOAD_ID);
	uint32_t higher_pri_flags = make_work_interval(&higher_pri_handle, WORK_INTERVAL_TYPE_DEFAULT);

	/* Start threads to join the lower priority thread group */
	pthread_t lower_threads[num_cores];
	struct thread_data lower_thread_datas[num_cores];
	start_threads(lower_threads, lower_thread_datas, &lower_pri_handle, lower_pri_flags);

	/* Start threads to join the higher priority thread group */
	pthread_t higher_threads[num_cores];
	struct thread_data higher_thread_datas[num_cores];
	start_threads(higher_threads, higher_thread_datas, &higher_pri_handle, higher_pri_flags);

	T_SETUPEND;

	/* Snapshot thread runtimes */
	uint64_t start_lower_priority_runtime_usec = snapshot_user_time_usec(lower_threads);
	uint64_t start_higher_priority_runtime_usec = snapshot_user_time_usec(higher_threads);

	/* Allow thread groups time to compete */
	sleep(3);

	/*
	 * Snapshot runtimes again and compare the usage ratio between the lower and
	 * higher priority thread groups, to determine whether the lower priority group
	 * has been starved
	 */
	uint64_t finish_lower_priority_runtime_usec = snapshot_user_time_usec(lower_threads);
	uint64_t finish_higher_priority_runtime_usec = snapshot_user_time_usec(higher_threads);

	uint64_t lower_priority_runtime = finish_lower_priority_runtime_usec - start_lower_priority_runtime_usec;
	uint64_t higher_priority_runtime = finish_higher_priority_runtime_usec - start_higher_priority_runtime_usec;

	T_QUIET; T_ASSERT_GT(lower_priority_runtime, 10000LL, "lower priority thread group got at least 10ms of CPU time");
	T_QUIET; T_ASSERT_GT(higher_priority_runtime, 10000LL, "higher priority thread group got at least 10ms of CPU time");

	/* Record the observed runtime ratio */
	char pdj_path[MAX_PDJ_PATH_LEN];
	pdwriter_t writer = pdwriter_open_tmp("xnu", "scheduler.thread_group_fairness", 0, 0, pdj_path, MAX_PDJ_PATH_LEN);
	T_QUIET; T_WITH_ERRNO; T_ASSERT_NOTNULL(writer, "pdwriter_open_tmp");

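	/*
	 * Report the smaller group's share of the combined runtime: 0.5 means the
	 * two thread groups shared the CPUs evenly, while values near 0 mean one
	 * group was starved. For example, 1.4s vs 1.6s of runtime yields
	 * 1.4 / 3.0 ~= 0.47.
	 */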
	double runtime_ratio_value;
	double total_runtime = (double)(lower_priority_runtime + higher_priority_runtime);
	if (lower_priority_runtime <= higher_priority_runtime) {
		runtime_ratio_value = (double)(lower_priority_runtime) / total_runtime;
	} else {
		runtime_ratio_value = (double)(higher_priority_runtime) / total_runtime;
	}
	T_LOG("Observed timeshare ratio: %f", runtime_ratio_value);

	pdwriter_new_value(writer, "Thread Group Runtime Ratio", PDUNIT_CUSTOM(runtime_ratio), runtime_ratio_value);
	pdwriter_record_larger_better(writer);
	pdwriter_close(writer);
	/* Ensure that the perfdata file can be copied by BATS */
	T_QUIET; T_ASSERT_POSIX_ZERO(chmod(pdj_path, 0644), "chmod");

	T_END;
}

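/*
 * Query the development-only kern.clutch_bucket_group_cpu_stats sysctl for
 * the given scheduling bucket. Two counters come back; only the first one
 * (taken here to be the bucket's accumulated CPU time) is used.
 */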
static uint64_t
get_thread_group_cpu_time(int sched_bucket)
{
	int ret;
	uint64_t cpu_stats[2];
	size_t cpu_stats_len = sizeof(uint64_t) * 2;
	ret = sysctlbyname("kern.clutch_bucket_group_cpu_stats", cpu_stats, &cpu_stats_len,
	    &sched_bucket, sizeof(sched_bucket));
	if (ret != 0 && errno == ENOTSUP) {
		T_LOG("Test only supported on Clutch/Edge scheduler (current policy is \"%s\") "
		    "platforms on development/debug build variants", platform_sched_policy());
		T_SKIP("kern.clutch_bucket_group_cpu_stats development-only sysctl not present");
	}
	T_QUIET; T_WITH_ERRNO; T_ASSERT_POSIX_SUCCESS(ret, "kern.clutch_bucket_group_cpu_stats");
	return cpu_stats[0];
}

static volatile uint64_t mach_deadline = 0;
static const int seconds = 2;
static _Atomic volatile uint64_t count = 0;
static const int iters_per_lock_hold = 100000;
static const int low_qos = QOS_CLASS_USER_INITIATED;
static const int low_sched_bucket = 2; // TH_BUCKET_SHARE_IN
static const int high_qos = QOS_CLASS_USER_INTERACTIVE;
static const int high_sched_bucket = 1; // TH_BUCKET_SHARE_FG
static _Atomic volatile bool recorder_picked = false;

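/*
 * Booster loop: raise QoS to USER_INTERACTIVE, burn CPU on a "work item",
 * then drop to USER_INITIATED before parking in usleep(). The first thread
 * to flip recorder_picked also snapshots the per-bucket thread group CPU
 * time around the loop and verifies that the work was attributed to the
 * high-QoS bucket rather than the low one.
 */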
static void *
boost_while_working(void *arg)
{
	int ret;
	work_interval_t wi = (work_interval_t)arg;
	ret = work_interval_join(wi);
	T_QUIET; T_ASSERT_POSIX_ZERO(ret, "work_interval_join");

	bool is_recorder = os_atomic_cmpxchg(&recorder_picked, false, true, relaxed);
	uint64_t cpu_time_begin_low = 0;
	uint64_t cpu_time_begin_high = 0;
	if (is_recorder) {
		cpu_time_begin_low = get_thread_group_cpu_time(low_sched_bucket);
		cpu_time_begin_high = get_thread_group_cpu_time(high_sched_bucket);
	}

	while (mach_absolute_time() < mach_deadline) {
		/* Assume high priority */
		ret = pthread_set_qos_class_self_np(high_qos, 0);
		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_set_qos_class_self_np UI");
		T_QUIET; T_ASSERT_EQ(qos_class_self(), high_qos, "qos_class_self");
		/* Complete a "work item" */
		for (volatile int i = 0; i < iters_per_lock_hold; i++) {
			os_atomic_inc(&count, relaxed);
		}
		/* Drop priority down before parking to sleep */
		ret = pthread_set_qos_class_self_np(low_qos, 0);
		T_QUIET; T_ASSERT_POSIX_ZERO(ret, "pthread_set_qos_class_self_np IN");
		T_QUIET; T_ASSERT_EQ(qos_class_self(), low_qos, "qos_class_self");
		usleep(2 * 1000); // 2ms
	}

	if (is_recorder) {
		uint64_t cpu_time_end_low = get_thread_group_cpu_time(low_sched_bucket);
		uint64_t cpu_time_end_high = get_thread_group_cpu_time(high_sched_bucket);

		T_QUIET; T_ASSERT_GE(cpu_time_end_high, cpu_time_begin_high,
		    "non-monotonic thread group CPU time");
		uint64_t high_cpu_time = cpu_time_end_high - cpu_time_begin_high;
		T_QUIET; T_ASSERT_GE(cpu_time_end_low, cpu_time_begin_low,
		    "non-monotonic thread group CPU time");
		uint64_t low_cpu_time = cpu_time_end_low - cpu_time_begin_low;

		T_QUIET; T_ASSERT_GT(high_cpu_time + low_cpu_time, 0ULL,
		    "CPU not attributed to either expected bucket");
		T_LOG("High ticks: %llu, Low ticks: %llu, High-to-low ratio: %.3f",
		    high_cpu_time, low_cpu_time, high_cpu_time * 1.0 / (high_cpu_time + low_cpu_time));
		T_EXPECT_GE(high_cpu_time, low_cpu_time, "More work accounted to the high QoS");
		T_EXPECT_LE(low_cpu_time * 1.0, high_cpu_time * 0.2,
		    "Vast majority of work accounted to the high QoS");
	}
	return NULL;
}

/*
 * Note, preemption due to non-test threads poses a special problem for
 * this test because time the test threads spend preempted at their low
 * QoS, in between processing work items, translates to "blocked" time
 * for the thread group at its high QoS. This leads to CPU usage aging
 * out more quickly for the high QoS, causing the test to fail.
 *
 * Additionally, the test must be run like an application in the QoS
 * engine, without a QoS ceiling which would prevent the test threads
 * from performing adequately high QoS boosts. For example:
 * sudo taskpolicy -a ./thread_group_fairness -n interactivity_cpu_accounting
 */
T_DECL(interactivity_cpu_accounting,
    "Ensure that CPU runtime tracked for calculating interactivity score "
    "gets attributed to the right QoS that performed the work, even if we "
    "switch QoS while on-core (rdar://125045167)",
    T_META_ENABLED(TARGET_CPU_ARM64 && !TARGET_OS_BRIDGE),
#if TARGET_OS_WATCH
    T_META_MAYFAIL("Watches too noisy with high priority spinners (rdar://150323037)"),
#elif TARGET_OS_TV
    T_META_MAYFAIL("TVs too noisy with high priority audio (rdar://149974201)"),
#endif
    T_META_ASROOT(YES))
{
	/* Skips the test if needed sysctl isn't present */
	get_thread_group_cpu_time(0);

	/* Ensure we don't have a QoS ceiling that would prevent high enough boosts */
	struct task_policy_state policy_state;
	mach_msg_type_number_t count = TASK_POLICY_STATE_COUNT;
	boolean_t get_default = FALSE;
	kern_return_t kr = task_policy_get(mach_task_self(), TASK_POLICY_STATE,
	    (task_policy_t)&policy_state, &count, &get_default);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "task_policy_get(self, TASK_POLICY_STATE)");
	int requested_app_type = (policy_state.requested & POLICY_REQ_APPTYPE_MASK) >> POLICY_REQ_APPTYPE_SHIFT;
	T_QUIET; T_ASSERT_EQ(requested_app_type, TASK_APPTYPE_APP_DEFAULT,
	    "Test needs to be run like an application for QoS boosting above pri 37 to succeed");

	wait_for_quiescence(argc, argv, 0.9, 10);

	trace_handle_t trace = begin_collect_trace(argc, argv, T_NAME);
	T_SETUPEND;

	if (platform_is_amp()) {
		/*
		 * Isolate-out the effects of cluster recommendation, since that
		 * causes threads to be preempted sometimes for rebalancing purposes.
		 */
		char *clpcctrl_args[] = {"-C", "p", NULL};
		execute_clpcctrl(clpcctrl_args, false);
	}

	mach_deadline = mach_absolute_time() + nanos_to_abs(seconds * NSEC_PER_SEC);

	/*
	 * Create threads in their own TG that will run work at "boosted"
	 * priority and after a work item is complete, lower their
	 * priority back down to a low QoS before "parking" via usleep().
	 *
	 * We expect that the interactivity score for the high QoS for this
	 * TG will be the one to lower, rather than the low QoS which the
	 * threads are switching down to before context-switching off-core.
	 */
	int num_boosters = MIN(4, dt_ncpu());
	work_interval_t wi_handle;
	make_work_interval(&wi_handle, WORK_INTERVAL_TYPE_DEFAULT);
	pthread_t threads[num_boosters];
	for (int i = 0; i < num_boosters; i++) {
		create_thread(&threads[i], NULL, boost_while_working, wi_handle);
	}

	/*
	 * Wait for test deadline to pass, to avoid priority boosting
	 * with pthread_join(), which would affect the results.
	 */
	uint64_t curr_time = mach_absolute_time();
	if (curr_time < mach_deadline) {
		usleep(abs_to_nanos(mach_deadline - curr_time) / NSEC_PER_USEC);
	}
	for (int i = 0; i < num_boosters; i++) {
		pthread_join(threads[i], NULL);
	}

	if (platform_is_amp()) {
		/* Reenable all cores to speed up trace post-processing */
		char *recommend_all_cores_args[] = {"-C", "all", NULL};
		execute_clpcctrl(recommend_all_cores_args, false);
	}
	end_collect_trace(trace);
}