/*
 * Copyright (c) 2019 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/**
 * On devices that support it, this test ensures that a mach exception is
 * generated when a matrix-math instruction is executed before the matrix
 * unit has been started, and that the matrix register file is correctly
 * preserved or zeroed on context switch.
 */

/*
 * IMPLEMENTATION NOTE:
 *
 * This test code goes to some unusual lengths to avoid calling out to libc or
 * libdarwintest while the CPU is in streaming SVE mode (i.e., between
 * ops->start() and ops->stop()). Both of these libraries are built with SIMD
 * instructions that will cause the test executable to crash while in streaming
 * SVE mode.
 *
 * Ordinarily this is the wrong way to solve this problem. Functions that use
 * streaming SVE mode should have annotations telling the compiler so, and the
 * compiler will automatically generate appropriate interworking code. However
 * this interworking code will stash SME state to memory and temporarily exit
 * streaming SVE mode. We're specifically testing how xnu manages live SME
 * register state, so we can't let the compiler stash and disable this state
 * behind our backs.
 */
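
/*
 * For illustration only (not compiled): a sketch of the annotation-based
 * approach described above, assuming an SME-aware toolchain. __arm_streaming
 * is the ACLE keyword; exact spellings may vary by compiler version.
 *
 *	// Calls to this function get compiler-generated interworking code,
 *	// which may spill SME state to memory and toggle streaming mode;
 *	// exactly the lazy save/restore behavior this test must avoid.
 *	void matrix_kernel(void) __arm_streaming;
 */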

#ifdef __arm64__
#include <mach/error.h>
#endif /* __arm64__ */

#include <darwintest.h>
#include <pthread.h>
#include <stdlib.h>
#include <mach/mach.h>
#include <mach/thread_act.h>
#include <mach/thread_status.h>
#include <mach/exception.h>
#include <machine/cpu_capabilities.h>
#include <sys/types.h>
#include <sys/sysctl.h>

#include "arm_matrix.h"
#include "exc_helpers.h"
#include "test_utils.h"

T_GLOBAL_META(
	T_META_NAMESPACE("xnu.arm"),
	T_META_RADAR_COMPONENT_NAME("xnu"),
	T_META_RADAR_COMPONENT_VERSION("arm"),
	T_META_OWNER("ghackmann"),
	T_META_RUN_CONCURRENTLY(true)
	);

#ifdef __arm64__

#ifndef EXC_ARM_SME_DISALLOWED
#define EXC_ARM_SME_DISALLOWED 2
#endif

/* Whether we caught the EXC_BAD_INSTRUCTION mach exception or not. */
static volatile bool mach_exc_caught = false;

static size_t
bad_instruction_exception_handler(
	__unused mach_port_t task,
	__unused mach_port_t thread,
	exception_type_t type,
	mach_exception_data_t codes,
	__unused uint64_t exception_pc)
{
	T_QUIET; T_ASSERT_EQ(type, EXC_BAD_INSTRUCTION, "Caught an EXC_BAD_INSTRUCTION exception");
	T_QUIET; T_ASSERT_EQ(codes[0], (uint64_t)EXC_ARM_SME_DISALLOWED, "The subcode is EXC_ARM_SME_DISALLOWED");

	mach_exc_caught = true;
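	/* Have the faulting thread skip the 4-byte faulting instruction on resume. */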
	return 4;
}
#endif


#ifdef __arm64__
static void
test_matrix_not_started(const struct arm_matrix_operations *ops)
{
	if (!ops->is_available()) {
		T_SKIP("Running on non-%s target, skipping...", ops->name);
	}

	mach_port_t exc_port = create_exception_port(EXC_MASK_BAD_INSTRUCTION);

	size_t size = ops->data_size();
	uint8_t *d = ops->alloc_data();
	bzero(d, size);

	ops->start();
	ops->load_one_vector(d);
	ops->stop();
	T_PASS("%s instruction after start instruction should not cause an exception", ops->name);

	mach_exc_caught = false;
	run_exception_handler(exc_port, bad_instruction_exception_handler);
	ops->load_one_vector(d);
	T_EXPECT_TRUE(mach_exc_caught, "%s instruction before start instruction should cause an exception", ops->name);

	free(d);
}
#endif


T_DECL(sme_not_started,
    "Test that SME instructions before smstart generate mach exceptions.", T_META_TAG_VM_NOT_ELIGIBLE)
{
#ifndef __arm64__
	T_SKIP("Running on non-arm64 target, skipping...");
#else
	test_matrix_not_started(&sme_operations);
#endif
}

#ifdef __arm64__
struct test_thread;
typedef bool (*thread_fn_t)(struct test_thread const *thread);

struct test_thread {
	pthread_t thread;
	pthread_t companion_thread;
	thread_fn_t thread_fn;
	uint32_t cpuid;
	uint32_t thread_id;
	const struct arm_matrix_operations *ops;
};

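/* Counts threads (including the main thread) that have not yet reached the start barrier. */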
static uint32_t barrier;
static pthread_cond_t barrier_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t barrier_lock = PTHREAD_MUTEX_INITIALIZER;

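/* Counts test threads that have not yet finished running their thread_fn. */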
static uint32_t end_barrier;
static pthread_cond_t end_barrier_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t end_barrier_lock = PTHREAD_MUTEX_INITIALIZER;

static void
test_thread_barrier(void)
{
	/* Wait for all threads to reach this barrier */
	pthread_mutex_lock(&barrier_lock);
	barrier--;
	if (barrier) {
		while (barrier) {
			pthread_cond_wait(&barrier_cond, &barrier_lock);
		}
	} else {
		pthread_cond_broadcast(&barrier_cond);
	}
	pthread_mutex_unlock(&barrier_lock);
}

static void
test_thread_notify_exited(void)
{
	pthread_mutex_lock(&end_barrier_lock);
	if (0 == --end_barrier) {
		pthread_cond_signal(&end_barrier_cond);
	}
	pthread_mutex_unlock(&end_barrier_lock);
}

static void
wait_for_test_threads(void)
{
	pthread_mutex_lock(&end_barrier_lock);
	while (end_barrier) {
		pthread_cond_wait(&end_barrier_cond, &end_barrier_lock);
	}
	pthread_mutex_unlock(&end_barrier_lock);
}

static uint32_t
ncpus(void)
{
	uint32_t ncpu;
	size_t ncpu_size = sizeof(ncpu);
	int err = sysctlbyname("hw.ncpu", &ncpu, &ncpu_size, NULL, 0);
	T_QUIET; T_ASSERT_POSIX_ZERO(err, "Retrieved CPU count");

	return ncpu;
}

static int
thread_bind_cpu_unchecked(uint32_t cpuid)
{
	/*
	 * libc's sysctl() implementation calls strlen(name), which is
	 * SIMD-accelerated. Avoid this by directly invoking the libsyscall
	 * wrapper with namelen computed at compile time.
	 */
#define THREAD_BIND_CPU "kern.sched_thread_bind_cpu"
	extern int __sysctlbyname(const char *name, size_t namelen, void *oldp, size_t *oldlenp, void *newp, size_t newlen);
	const char *name = THREAD_BIND_CPU;
	size_t namelen = sizeof(THREAD_BIND_CPU) - 1;
	return __sysctlbyname(name, namelen, NULL, 0, &cpuid, sizeof(cpuid));
}

static void
thread_bind_cpu(uint32_t cpuid)
{
	int err = thread_bind_cpu_unchecked(cpuid);
	T_QUIET; T_ASSERT_POSIX_ZERO(err, "Bound thread to CPU %u", cpuid);
}

static void *
test_thread_shim(void *arg)
{
	struct test_thread const *thread = arg;

	thread_bind_cpu(thread->cpuid);
	bool const ret = thread->thread_fn(thread);
	test_thread_notify_exited();
	return (void *)(uintptr_t)ret;
}

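/*
 * Spawn two test threads per CPU, pair each thread with the other thread
 * bound to the same CPU, run thread_fn on every thread, and report a
 * per-thread pass/fail result.
 */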
static void
test_on_each_cpu(thread_fn_t thread_fn, const struct arm_matrix_operations *ops, const char *desc)
{
	uint32_t ncpu = ncpus();
	uint32_t nthreads = ncpu * 2;
	barrier = 1 /* This thread */ + nthreads;
	end_barrier = nthreads;
	struct test_thread *threads = calloc(nthreads, sizeof(threads[0]));

	for (uint32_t i = 0; i < nthreads; i++) {
		threads[i].thread_fn = thread_fn;
		threads[i].cpuid = i % ncpu;
		threads[i].thread_id = i;
		threads[i].ops = ops;

		int const err = pthread_create(&threads[i].thread, NULL, test_thread_shim, &threads[i]);
		T_QUIET; T_ASSERT_EQ(err, 0, "%s: created thread #%u", desc, i);

		// This thread becomes the companion of the other thread pinned to the same CPU.
		threads[(ncpu + i) % nthreads].companion_thread = threads[i].thread;
	}

	// Wait for all companion_threads to be set.
	test_thread_barrier();

	// Like pthread_join()ing all threads, but without the priority-boosting shenanigans.
	wait_for_test_threads();

	for (uint32_t i = 0; i < nthreads; i++) {
		void *thread_ret_ptr;
		int err = pthread_join(threads[i].thread, &thread_ret_ptr);
		T_QUIET; T_ASSERT_EQ(err, 0, "%s: joined thread #%u", desc, i);

		bool thread_ret = (uintptr_t)thread_ret_ptr;
		if (thread_ret) {
			T_PASS("%s: thread #%u passed", desc, i);
		} else {
			T_FAIL("%s: thread #%u failed", desc, i);
		}
	}

	free(threads);
}
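
/*
 * Keep matrix state live across thread_switch() and verify that it is
 * preserved bit-for-bit. Returns true if every iteration round-trips intact.
 */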
static bool
active_context_switch_thread(struct test_thread const *thread)
{
	const struct arm_matrix_operations *ops = thread->ops;
	const uint32_t thread_id = thread->thread_id;
	size_t size = ops->data_size();
	uint8_t *d1 = ops->alloc_data();
	memset(d1, (char)thread_id, size);

	uint8_t *d2 = ops->alloc_data();

	test_thread_barrier();

	// companion_thread will be valid only after the barrier.
	thread_t const companion_thread = pthread_mach_thread_np(thread->companion_thread);
	T_QUIET; T_ASSERT_NE(companion_thread, THREAD_NULL, "pthread_mach_thread_np");

	bool ok = true;
	for (unsigned int i = 0; i < 100000 && ok; i++) {
		ops->start();
		ops->load_data(d1);

		/*
		 * Rescheduling with the matrix registers active must preserve
		 * state, even after a context switch.
		 */
		thread_switch(companion_thread, SWITCH_OPTION_NONE, 0);

		ops->store_data(d2);
		ops->stop();

		if (memcmp(d1, d2, size)) {
			ok = false;
		}
	}

	free(d2);
	free(d1);
	return ok;
}
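
/*
 * Exit streaming mode before thread_switch(), then verify that each byte of
 * state is afterwards either preserved or zeroed, never another thread's
 * pattern. Returns true if no foreign bytes were observed.
 */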
static bool
inactive_context_switch_thread(struct test_thread const *thread)
{
	const struct arm_matrix_operations *ops = thread->ops;
	const uint32_t thread_id = thread->thread_id;
	size_t size = ops->data_size();
	uint8_t *d1 = ops->alloc_data();
	memset(d1, (char)thread_id, size);

	uint8_t *d2 = ops->alloc_data();

	test_thread_barrier();

	// companion_thread will be valid only after the barrier.
	thread_t const companion_thread = pthread_mach_thread_np(thread->companion_thread);
	T_QUIET; T_ASSERT_NE(companion_thread, THREAD_NULL, "pthread_mach_thread_np");

	bool ok = true;
	for (unsigned int i = 0; i < 100000 && ok; i++) {
		ops->start();
		ops->load_data(d1);
		ops->stop();

		/*
		 * Rescheduling with the matrix registers inactive may preserve
		 * state or may zero it out.
		 */
		thread_switch(companion_thread, SWITCH_OPTION_NONE, 0);

		ops->start();
		ops->store_data(d2);
		ops->stop();

		for (size_t j = 0; j < size; j++) {
			if (d1[j] != d2[j] && d2[j] != 0) {
				ok = false;
			}
		}
	}

	free(d2);
	free(d1);
	return ok;
}
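
/*
 * With matrix state live, bind this thread to each CPU in turn and verify
 * that the state follows the thread across migrations.
 */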
static void
test_thread_migration(const struct arm_matrix_operations *ops)
{
	size_t size = ops->data_size();
	uint8_t *d = ops->alloc_data();
	arc4random_buf(d, size);

	uint32_t ncpu = ncpus();
	uint8_t *cpu_d[ncpu];
	for (uint32_t cpuid = 0; cpuid < ncpu; cpuid++) {
		cpu_d[cpuid] = ops->alloc_data();
		memset(cpu_d[cpuid], 0, size);
	}

	ops->start();
	ops->load_data(d);
	for (uint32_t cpuid = 0; cpuid < ncpu; cpuid++) {
		int err = thread_bind_cpu_unchecked(cpuid);
		if (err) {
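			/* Leave streaming mode before calling into libdarwintest. */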
			ops->stop();
			T_ASSERT_POSIX_ZERO(err, "Bound thread to CPU %u", cpuid);
		}
		ops->store_data(cpu_d[cpuid]);
	}
	ops->stop();

	for (uint32_t cpuid = 0; cpuid < ncpu; cpuid++) {
		int cmp = memcmp(d, cpu_d[cpuid], size);
		T_EXPECT_EQ(cmp, 0, "Matrix state migrated to CPU %u", cpuid);
		free(cpu_d[cpuid]);
	}
	free(d);
}
#endif


T_DECL(sme_context_switch,
    "Test that SME contexts are migrated during context switch and do not leak between process contexts.",
    T_META_BOOTARGS_SET("enable_skstb=1"),
    T_META_REQUIRES_SYSCTL_EQ("hw.optional.arm.FEAT_SME2", 1),
    XNU_T_META_SOC_SPECIFIC, T_META_TAG_VM_NOT_ELIGIBLE)
{
#ifndef __arm64__
	T_SKIP("Running on non-arm64 target, skipping...");
#else
	if (!sme_operations.is_available()) {
		T_SKIP("Running on non-SME target, skipping...");
	}

	test_thread_migration(&sme_operations);
	test_on_each_cpu(active_context_switch_thread, &sme_operations, "SME context migrates when active");
	test_on_each_cpu(inactive_context_switch_thread, &sme_operations, "SME context does not leak across processes");
#endif
}


#if __arm64__
/*
 * Sequence of events in thread_{get,set}_state test:
 *
 * 1. Parent creates child thread.
 * 2. Child thread signals parent thread to proceed.
 * 3. Parent populates child's matrix state registers via thread_set_state(),
 *    and signals child thread to proceed.
 * 4. Child arbitrarily updates each byte in its local matrix register state
 *    by adding 1, and signals parent thread to proceed.
 * 5. Parent reads back the child's updated matrix state with
 *    thread_get_state(), and confirms that every byte has been modified as
 *    expected.
 */
static enum thread_state_test_state {
	INIT,
	CHILD_READY,
	PARENT_POPULATED_MATRIX_STATE,
	CHILD_UPDATED_MATRIX_STATE,
	DONE
} thread_state_test_state;

static pthread_cond_t thread_state_test_cond = PTHREAD_COND_INITIALIZER;
static pthread_mutex_t thread_state_test_lock = PTHREAD_MUTEX_INITIALIZER;

static void
wait_for_thread_state_test_state(enum thread_state_test_state state)
{
	pthread_mutex_lock(&thread_state_test_lock);
	while (thread_state_test_state != state) {
		pthread_cond_wait(&thread_state_test_cond, &thread_state_test_lock);
	}
	pthread_mutex_unlock(&thread_state_test_lock);
}

static void
thread_set_state_test_state(enum thread_state_test_state state)
{
	pthread_mutex_lock(&thread_state_test_lock);
	thread_state_test_state = state;
	pthread_cond_broadcast(&thread_state_test_cond);
	pthread_mutex_unlock(&thread_state_test_lock);
}

static void *
test_matrix_thread_state_child(void *arg)
{
	const struct arm_matrix_operations *ops = arg;

	size_t size = ops->data_size();
	uint8_t *d = ops->alloc_data();

	thread_set_state_test_state(CHILD_READY);
	wait_for_thread_state_test_state(PARENT_POPULATED_MATRIX_STATE);
	ops->store_data(d);
	for (size_t i = 0; i < size; i++) {
		d[i]++;
	}
	ops->load_data(d);
	thread_set_state_test_state(CHILD_UPDATED_MATRIX_STATE);

	wait_for_thread_state_test_state(DONE);
	ops->stop();
	free(d);
	return NULL;
}

static void
test_matrix_thread_state(const struct arm_matrix_operations *ops)
{
	if (!ops->is_available()) {
		T_SKIP("Running on non-%s target, skipping...", ops->name);
	}

	size_t size = ops->data_size();
	uint8_t *d = ops->alloc_data();
	arc4random_buf(d, size);

	thread_state_test_state = INIT;

	pthread_t thread;
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wincompatible-pointer-types-discards-qualifiers"
	void *arg = ops;
#pragma clang diagnostic pop
	int err = pthread_create(&thread, NULL, test_matrix_thread_state_child, arg);
	T_QUIET; T_ASSERT_EQ(err, 0, "pthread_create()");

	mach_port_t mach_thread = pthread_mach_thread_np(thread);
	T_QUIET; T_ASSERT_NE(mach_thread, MACH_PORT_NULL, "pthread_mach_thread_np()");

	wait_for_thread_state_test_state(CHILD_READY);
	kern_return_t kr = ops->thread_set_state(mach_thread, d);
	T_QUIET; T_ASSERT_EQ(kr, KERN_SUCCESS, "%s thread_set_state()", ops->name);
	thread_set_state_test_state(PARENT_POPULATED_MATRIX_STATE);

	wait_for_thread_state_test_state(CHILD_UPDATED_MATRIX_STATE);
	uint8_t *thread_d = ops->alloc_data();
	kr = ops->thread_get_state(mach_thread, thread_d);
	T_QUIET; T_ASSERT_EQ(kr, KERN_SUCCESS, "%s thread_get_state()", ops->name);
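	/* Apply the same +1 the child applied, giving the expected register contents. */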
	for (size_t i = 0; i < size; i++) {
		d[i]++;
	}
	T_EXPECT_EQ(memcmp(d, thread_d, size), 0, "thread_get_state() read expected %s data from child thread", ops->name);

	thread_set_state_test_state(DONE);
	free(thread_d);
	free(d);
	pthread_join(thread, NULL);
}

#endif

#ifdef __arm64__

T_DECL(sme_thread_state,
    "Test thread_{get,set}_state with SME thread state.",
    XNU_T_META_SOC_SPECIFIC)
{
	test_matrix_thread_state(&sme_operations);
}

T_DECL(sme_exception_ports,
    "Test that thread_set_exception_ports rejects SME thread-state flavors.",
    XNU_T_META_SOC_SPECIFIC)
{
	mach_port_t exc_port;
	mach_port_t task = mach_task_self();
	mach_port_t thread = mach_thread_self();

	kern_return_t kr = mach_port_allocate(task, MACH_PORT_RIGHT_RECEIVE, &exc_port);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Allocated mach exception port");
	kr = mach_port_insert_right(task, exc_port, exc_port, MACH_MSG_TYPE_MAKE_SEND);
	T_QUIET; T_ASSERT_MACH_SUCCESS(kr, "Inserted a SEND right into the exception port");

	kr = thread_set_exception_ports(thread, EXC_MASK_ALL, exc_port, EXCEPTION_STATE, ARM_THREAD_STATE64);
	T_EXPECT_MACH_SUCCESS(kr, "thread_set_exception_ports accepts flavor %u", (unsigned int)ARM_THREAD_STATE64);

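	/* Assumes the SME flavors are numbered contiguously from ARM_SME_STATE through ARM_SME2_STATE. */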
	for (thread_state_flavor_t flavor = ARM_SME_STATE; flavor <= ARM_SME2_STATE; flavor++) {
		kr = thread_set_exception_ports(thread, EXC_MASK_ALL, exc_port, EXCEPTION_STATE, flavor);
		T_EXPECT_MACH_ERROR(kr, KERN_INVALID_ARGUMENT, "thread_set_exception_ports rejects flavor %u", (unsigned int)flavor);
	}
}

T_DECL(sme_max_svl_b_sysctl,
    "Test the hw.optional.arm.sme_max_svl_b sysctl",
    XNU_T_META_SOC_SPECIFIC)
{
	unsigned int max_svl_b;
	size_t max_svl_b_size = sizeof(max_svl_b);

	int err = sysctlbyname("hw.optional.arm.sme_max_svl_b", &max_svl_b, &max_svl_b_size, NULL, 0);
	T_QUIET; T_ASSERT_POSIX_SUCCESS(err, "sysctlbyname(hw.optional.arm.sme_max_svl_b)");
	if (sme_operations.is_available()) {
		/* Architecturally, SVL must be a power of two between 128 and 2048 bits. */
		const unsigned int ARCH_MIN_SVL_B = 128 / 8;
		const unsigned int ARCH_MAX_SVL_B = 2048 / 8;

		T_EXPECT_EQ(__builtin_popcount(max_svl_b), 1, "Maximum SVL_B is a power of 2");
		T_EXPECT_GE(max_svl_b, ARCH_MIN_SVL_B, "Maximum SVL_B >= architectural minimum");
		T_EXPECT_LE(max_svl_b, ARCH_MAX_SVL_B, "Maximum SVL_B <= architectural maximum");
	} else {
		T_EXPECT_EQ(max_svl_b, 0, "Maximum SVL_B is 0 when SME is unavailable");
	}
}
#endif /* __arm64__ */