1 /*
2 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28 /*
29 * @OSF_COPYRIGHT@
30 */
31 /*
32 * Mach Operating System
33 * Copyright (c) 1992-1990 Carnegie Mellon University
34 * All Rights Reserved.
35 *
36 * Permission to use, copy, modify and distribute this software and its
37 * documentation is hereby granted, provided that both the copyright
38 * notice and this permission notice appear in all copies of the
39 * software, derivative works or modified versions, and any portions
40 * thereof, and that both notices appear in supporting documentation.
41 *
42 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43 * CONDITION. CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45 *
46 * Carnegie Mellon requests users of this software to return to
47 *
 *  Software Distribution Coordinator  or  Software.Distribution@CS.CMU.EDU
49 * School of Computer Science
50 * Carnegie Mellon University
51 * Pittsburgh PA 15213-3890
52 *
53 * any improvements or extensions that they make and grant Carnegie Mellon
54 * the rights to redistribute these changes.
55 */
56
57 #include <mach/exception_types.h>
58 #include <mach/i386/thread_status.h>
59 #include <mach/i386/fp_reg.h>
60
61 #include <kern/mach_param.h>
62 #include <kern/processor.h>
63 #include <kern/thread.h>
64 #include <kern/zalloc.h>
65 #include <kern/misc_protos.h>
66 #include <kern/spl.h>
67 #include <kern/assert.h>
68
69 #include <libkern/OSAtomic.h>
70
71 #include <architecture/i386/pio.h>
72 #include <i386/cpuid.h>
73 #include <i386/fpu.h>
74 #include <i386/proc_reg.h>
75 #include <i386/misc_protos.h>
76 #include <i386/thread.h>
77 #include <i386/trap.h>
78
/*
 * Extended-state selection. Both start out UNDEFINED and are assigned by
 * init_fpu(): fpu_capability is what this file will use at most, fpu_default
 * is what is programmed by default (they differ when AVX512 is available,
 * see the on-demand AVX512 comment below).
 */
xstate_t        fpu_capability = UNDEFINED;     /* extended state capability */
xstate_t        fpu_default = UNDEFINED;        /* default extended state */
81
/* True when addr is a multiple of size. The mask test only works for power-of-two sizes. */
#define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
/*
 * Assert (via assertf) that save-area component p is aligned to a bytes.
 * NOTE(review): the message text says "8-byte" regardless of the value of a;
 * the only use visible here passes sizeof(uint64_t).
 */
#define VERIFY_SAVEAREA_ALIGNED(p, a) \
	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
86
/* Forward declarations */

/* fpinit() is called by configure_mxcsr_capability_mask() and init_fpu(). */
extern void             fpinit(void);
extern void             fp_save(
	thread_t        thr_act);
extern void             fp_load(
	thread_t        thr_act);

/* Both are file-local; thread_xstate() yields the xstate used for a given thread. */
static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
static xstate_t thread_xstate(thread_t);
97
/*
 * Statically allocated register images, 64-byte aligned because they are
 * used as XSAVE/XRSTOR operands.
 *
 * initial_fp_state is filled in by configure_mxcsr_capability_mask() (called
 * from fpu_module_init()). The default_* images are captured there as well
 * and are reloaded by fpu_switch_context() after the outgoing thread's
 * registers have been saved.
 */
x86_ext_thread_state_t  initial_fp_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_avx512_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_avx_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_fx_state __attribute((aligned(64)));

/*
 * Global MXCSR capability bitmask. Set once by
 * configure_mxcsr_capability_mask(); fpu_set_fxstate() ANDs incoming MXCSR
 * values with it.
 */
static unsigned int mxcsr_capability_mask;
105
/*
 * Thin wrappers around individual x87 instructions.
 *
 * Calling conventions differ between the macros:
 *  - fnstcw(control) takes a POINTER to the 16-bit destination.
 *  - fldcw(control) takes the OBJECT itself (the macro applies '&').
 *  - fnsave(state) takes a POINTER (the macro dereferences it).
 *  - frstor(state) takes the OBJECT itself (used directly as the "m" operand).
 */
#define fninit() \
	__asm__ volatile("fninit")

#define fnstcw(control) \
	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))

#define fldcw(control) \
	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )

#define fnclex() \
	__asm__ volatile("fnclex")

/* NOTE(review): 'state' is not parenthesized before being dereferenced. */
#define fnsave(state)  \
	__asm__ volatile("fnsave %0" : "=m" (*state))

#define frstor(state) \
	__asm__ volatile("frstor %0" : : "m" (state))

/*
 * NOTE(review): unlike the other macros this expansion already ends in ';',
 * so "fwait();" expands to two statements. Left as-is because callers are
 * not visible here.
 */
#define fwait() \
	__asm__("fwait");
126
/* Issue FXRSTOR with *a as the memory source operand. */
static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ volatile (
                "fxrstor %0"
                : /* no outputs */
                : "m" (*a));
}
132
/* Issue FXSAVE with *a as the memory destination operand. */
static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ volatile (
                "fxsave %0"
                : "=m" (*a));
}
138
/* Issue FXRSTOR64 with *a as the memory source operand. */
static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ volatile (
                "fxrstor64 %0"
                : /* no outputs */
                : "m" (*a));
}
144
/* Issue FXSAVE64 with *a as the memory destination operand. */
static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ volatile (
                "fxsave64 %0"
                : "=m" (*a));
}
150
/* True for the three xstate values that select a real save-area format. */
#define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)

/*
 * Save-area zones, indexed by xstate. Entries stay NULL until
 * fpu_module_init() creates the zone for fpu_default and, when
 * fpu_capability is AVX512, the AVX512 zone.
 */
SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
	[FP]     = NULL,
	[AVX]    = NULL,
	[AVX512] = NULL
};
/* Size in bytes of the save-area structure for each xstate. */
static const uint32_t fp_state_size[] = {
	[FP]     = sizeof(struct x86_fx_thread_state),
	[AVX]    = sizeof(struct x86_avx_thread_state),
	[AVX512] = sizeof(struct x86_avx512_thread_state)
};

/* Printable name of each xstate, used in kprintf/DBG output. */
static const char *const xstate_name[] = {
	[UNDEFINED] = "UNDEFINED",
	[FP] = "FP",
	[AVX] = "AVX",
	[AVX512] = "AVX512"
};

/* Capability predicates derived from fpu_capability (valid once init_fpu() has run). */
#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
173 /*
174 * On-demand AVX512 support
175 * ------------------------
176 * On machines with AVX512 support, by default, threads are created with
177 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
178 * capabilities are advertised in the commpage and via sysctl. If a thread
179 * opts to use AVX512 instructions, the first will result in a #UD exception.
 * Faulting AVX512 instructions are recognizable by their unique prefix.
181 * This exception results in the thread being promoted to use an AVX512-sized
182 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
183 * instruction is re-driven and the thread can proceed to perform AVX512
184 * operations.
185 *
186 * In addition to AVX512 instructions causing promotion, the thread_set_state()
 * primitive with an AVX512 state flavor results in promotion.
188 *
189 * AVX512 promotion of the first thread in a task causes the default xstate
190 * of the task to be promoted so that any subsequently created or subsequently
191 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
192 * a promoted xstate.
193 *
194 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
195 * and a second pool of larger AVX512-sized (2688 byte) areas.
196 *
197 * Note the initial state value is an AVX512 object but that the AVX initial
198 * value is a subset of it.
199 */
/*
 * Set from 0 to 1 with OSCompareAndSwap() in init_fpu() so that
 * cpuid_set_info() is re-run only once after CR4.OSXSAVE has been enabled.
 */
static uint32_t cpuid_reevaluated = 0;

/* Forward declarations of the layout-aware save/restore helpers. */
static void fpu_store_registers(void *, boolean_t);
static void fpu_load_registers(void *);

/*
 * Feature bitmask for each xstate. Used both as the XSAVE/XRSTOR
 * requested-feature mask and as the value written to XCR0 via xsetbv().
 */
static const uint32_t xstate_xmask[] = {
	[FP] =          FP_XMASK,
	[AVX] =         AVX_XMASK,
	[AVX512] =      AVX512_XMASK
};
210
/*
 * Issue XSAVE to *a. rfbm is passed in EAX as the low half of the
 * requested-feature bitmap; EDX (the high half) is always 0.
 */
static inline void
xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ volatile (
                "xsave %0"
                : "=m" (*a)
                : "a" (rfbm), "d" (0));
}
216
/*
 * Issue XSAVE64 to *a. rfbm is passed in EAX as the low half of the
 * requested-feature bitmap; EDX (the high half) is always 0.
 */
static inline void
xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ volatile (
                "xsave64 %0"
                : "=m" (*a)
                : "a" (rfbm), "d" (0));
}
222
/*
 * Issue XRSTOR from *a. rfbm is passed in EAX as the low half of the
 * requested-feature bitmap; EDX (the high half) is always 0.
 */
static inline void
xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ volatile (
                "xrstor %0"
                : /* no outputs */
                : "m" (*a), "a" (rfbm), "d" (0));
}
228
/*
 * Issue XRSTOR64 from *a. rfbm is passed in EAX as the low half of the
 * requested-feature bitmap; EDX (the high half) is always 0.
 */
static inline void
xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ volatile (
                "xrstor64 %0"
                : /* no outputs */
                : "m" (*a), "a" (rfbm), "d" (0));
}
234
235 __unused static inline void
vzeroupper(void)236 vzeroupper(void)
237 {
238 __asm__ __volatile__ ("vzeroupper" ::);
239 }
240
/*
 * Forward declaration. fpu_set_fxstate() calls this when an AVX512 flavor is
 * set on a thread whose xstate is AVX; a FALSE return is treated as failure.
 */
static boolean_t fpu_thread_promote_avx512(thread_t);
242
243
/*
 * Compile-time assertion that no padding creeps into a run of same-typed
 * members of a structure.
 *
 *   t   structure type; must be a single identifier (it is token-pasted)
 *   m1  first member of the run
 *   m2  last member of the run
 *   n   number of members in the run (may be any integer constant expression)
 *   mt  type of each member
 *
 * Expands to an extern array declaration whose length is 1 when
 * offsetof(t, m2) - offsetof(t, m1) equals (n - 1) * sizeof(mt), and -1
 * otherwise, so that a layout mismatch fails to compile.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt)                 \
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2   \
	[(offsetof(t,m2) - offsetof(t,m1) == ((n) - 1)*sizeof(mt)) ? 1 : -1]
251
/*
 * Layout checks for the user-visible AVX/AVX512 thread-state structures.
 * fpu_set_fxstate() copies each of these register runs with a single
 * __nochk_bcopy() starting at the first member, which requires the members
 * to be contiguous.
 */
ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);

ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
264
265 #if defined(DEBUG_AVX512)
266
/* Debug build: DBG() forwards to kprintf() with a "DBG: " prefix. */
#define DBG(x...)       kprintf("DBG: " x)

/* Byte-array views of the register classes, used only for hex dumping. */
typedef struct { uint8_t byte[8]; }  opmask_t;  /* 8-byte opmask (k) register */
typedef struct { uint8_t byte[16]; } xmm_t;     /* 16-byte chunk */
typedef struct { uint8_t byte[32]; } ymm_t;     /* 32-byte chunk */
typedef struct { uint8_t byte[64]; } zmm_t;     /* 64-byte chunk */
273
274 static void
DBG_AVX512_STATE(struct x86_avx512_thread_state * sp)275 DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
276 {
277 int i, j;
278 xmm_t *xmm = (xmm_t *) &sp->fp.fx_XMM_reg;
279 xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
280 ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
281 zmm_t *zmm = (zmm_t *) &sp->x_Hi16_ZMM;
282 opmask_t *k = (opmask_t *) &sp->x_Opmask;
283
284 kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
285 kprintf("x_Opmask: %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
286 kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
287 kprintf("x_Hi16_ZMM: %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));
288
289 kprintf("XCR0: 0x%016llx\n", xgetbv(XCR0));
290 kprintf("XINUSE: 0x%016llx\n", xgetbv(1));
291
292 /* Print all ZMM registers */
293 for (i = 0; i < 16; i++) {
294 kprintf("zmm%d:\t0x", i);
295 for (j = 0; j < 16; j++) {
296 kprintf("%02x", xmm[i].byte[j]);
297 }
298 for (j = 0; j < 16; j++) {
299 kprintf("%02x", ymmh[i].byte[j]);
300 }
301 for (j = 0; j < 32; j++) {
302 kprintf("%02x", zmmh[i].byte[j]);
303 }
304 kprintf("\n");
305 }
306 for (i = 0; i < 16; i++) {
307 kprintf("zmm%d:\t0x", 16 + i);
308 for (j = 0; j < 64; j++) {
309 kprintf("%02x", zmm[i].byte[j]);
310 }
311 kprintf("\n");
312 }
313 for (i = 0; i < 8; i++) {
314 kprintf("k%d:\t0x", i);
315 for (j = 0; j < 8; j++) {
316 kprintf("%02x", k[i].byte[j]);
317 }
318 kprintf("\n");
319 }
320
321 kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
322 kprintf("xcomp_bv: 0x%016llx\n", sp->_xh.xcomp_bv);
323 }
324 #else
325 #define DBG(x...)
326 static void
DBG_AVX512_STATE(__unused struct x86_avx512_thread_state * sp)327 DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
328 {
329 return;
330 }
331 #endif /* DEBUG_AVX512 */
332
333 #if DEBUG
/*
 * Return the x87 status word (FNSTSW). The "=ma" constraint lets the
 * compiler pick either a memory destination or AX.
 */
static inline unsigned short
fnstsw(void)
{
	unsigned short sw;

	__asm__ volatile (
                "fnstsw %0"
                : "=ma" (sw));

	return sw;
}
341 #endif
342
/*
 * Configure the initial FPU state presented to new threads.
 * Determine the MXCSR capability mask, which allows us to mask off any
 * potentially unsafe "reserved" bits before restoring the FPU context.
 * *Not* per-cpu, assumes symmetry.
 *
 * fps must be 64-byte aligned. On return:
 *  - mxcsr_capability_mask holds the MXCSR_MASK value reported by the save
 *    instruction, or 0xffbf if the reported value was 0;
 *  - the default_* image(s) matching fpu_capability have been captured;
 *  - fps->fx.fp_valid and fps->fx.fp_save_layout hold poison values;
 *  - CR0.TS is set again.
 */

static void
configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
{
	/* XSAVE requires a 64 byte aligned store */
	assert(ALIGNED(fps, 64));
	/* Clear, to prepare for the diagnostic FXSAVE */
	bzero(fps, sizeof(*fps));

	/* Put the hardware in its initial state, then capture it into fps */
	fpinit();
	fpu_store_registers(fps, FALSE);

	mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;

	/* Set default mask value if necessary */
	if (mxcsr_capability_mask == 0) {
		mxcsr_capability_mask = 0xffbf;
	}

	/* Clear vector register store */
	bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
	bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
	if (fpu_ZMM_capable) {
		bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
		bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
		bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
	}

	/*
	 * Load the cleaned image back into the registers so that the
	 * default images captured below reflect it. fpu_load_registers()
	 * dispatches on fp_save_layout, so that field must be set first.
	 */
	fps->fx.fp_valid = TRUE;
	fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
	fpu_load_registers(fps);

	/* Capture the default image(s) that fpu_switch_context() reloads */
	if (fpu_ZMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
	}
	if (fpu_YMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
	} else {
		fxsave64((struct x86_fx_thread_state *)&default_fx_state);
	}

	/* Poison values to trap unsafe usage */
	fps->fx.fp_valid = 0xFFFFFFFF;
	fps->fx.fp_save_layout = FP_UNUSED;

	/* Re-enable FPU/SSE DNA exceptions */
	set_ts();
}
397
#if DEBUG || DEVELOPMENT
/*
 * Defaults to 1. fpu_module_init() clears it when
 * kern_feature_override(KF_DISABLE_FP_POPC_ON_PGFLT) is set, and the
 * "fpsimd_fault_popc" boot-arg can override either value.
 * NOTE(review): the consumer of this flag is outside this section of the file.
 */
int fpsimd_fault_popc = 1;
#endif
401
/*
 * Look for FPU and initialize it.
 * Called on each CPU.
 *
 * Steps, in order:
 *  1. Clear CR0.EM/CR0.TS and set CR0.NE, then FNINIT.
 *  2. Require FXSR (panic otherwise) and set CR4.OSFXS; set CR4.OSXMM when
 *     SSE is reported.
 *  3. Pick fpu_capability / fpu_default: FP by default, AVX or AVX512 when
 *     the CPUID XSAVE leaf reports the corresponding state components. With
 *     AVX512 the default stays AVX (on-demand promotion).
 *  4. Call fpinit(), record fpu_default in this CPU's cpu_xstate and set
 *     CR0.TS | CR0.MP.
 */
void
init_fpu(void)
{
#if     DEBUG
	unsigned short  status;
	unsigned short  control;
#endif
	/*
	 * Check for FPU by initializing it,
	 * then trying to read the correct bit patterns from
	 * the control and status registers.
	 */
	set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE);       /* allow use of FPU */
	fninit();
#if     DEBUG
	status = fnstsw();
	fnstcw(&control);

	assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
#endif
	/* Advertise SSE support */
	if (cpuid_features() & CPUID_FEATURE_FXSR) {
		set_cr4(get_cr4() | CR4_OSFXS);
		/* And allow SIMD exceptions if present */
		if (cpuid_features() & CPUID_FEATURE_SSE) {
			set_cr4(get_cr4() | CR4_OSXMM);
		}
	} else {
		panic("fpu is not FP_FXSR");
	}

	fpu_capability = fpu_default = FP;

	/*
	 * Function-scope static: the "avx512" boot-arg is parsed only on the
	 * master CPU, and the resulting value is what every CPU tests below.
	 */
	static boolean_t is_avx512_enabled = TRUE;
	if (cpu_number() == master_cpu) {
		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
			PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
			kprintf("AVX512 supported %s\n",
			    is_avx512_enabled ? "and enabled" : "but disabled");
		}
	}

	/* Configure the XSAVE context mechanism if the processor supports
	 * AVX/YMM registers
	 */
	if (cpuid_features() & CPUID_FEATURE_XSAVE) {
		cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
		if (is_avx512_enabled &&
		    (xs0p->extended_state[eax] & XFEM_ZMM_OPMASK) == XFEM_ZMM_OPMASK) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			assert(xs0p->extended_state[eax] & XFEM_YMM);
			fpu_capability = AVX512;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			/* Temporarily enable the full AVX512 mask for the size check below */
			xsetbv(0, AVX512_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
			/*
			 * AVX set until AVX512 is used.
			 * See comment above about on-demand AVX512 support.
			 */
			xsetbv(0, AVX_XMASK);
			fpu_default = AVX;
		} else if (xs0p->extended_state[eax] & XFEM_YMM) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			fpu_capability = AVX;
			fpu_default = AVX;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			xsetbv(0, AVX_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
		}
	}

	/* Report the selection once, from the master CPU */
	if (cpu_number() == master_cpu) {
		kprintf("fpu_state: %s, state_size: %d\n",
		    xstate_name[fpu_capability],
		    fp_state_size[fpu_capability]);
	}

	fpinit();
	current_cpu_datap()->cpu_xstate = fpu_default;

	/*
	 * Trap wait instructions.  Turn off FPU for now.
	 */
	set_cr0(get_cr0() | CR0_TS | CR0_MP);
}
503
504 /*
505 * Allocate and initialize FP state for specified xstate.
506 * Don't load state.
507 */
508 static void *
fp_state_alloc(xstate_t xs)509 fp_state_alloc(xstate_t xs)
510 {
511 assert(ifps_zone[xs] != NULL);
512 return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
513 }
514
515 static inline void
fp_state_free(void * ifps,xstate_t xs)516 fp_state_free(void *ifps, xstate_t xs)
517 {
518 assert(ifps_zone[xs] != NULL);
519 zfree(ifps_zone[xs], ifps);
520 }
521
/*
 * Calls set_ts() and nothing else. Elsewhere in this file set_ts() is
 * described as re-enabling FPU/SSE DNA exceptions and is used to disable
 * the FPU for the current thread.
 */
void
clear_fpu(void)
{
	set_ts();
}
527
528 static boolean_t
fpu_allzeroes(uint64_t * __attribute ((aligned (8)))ptr,uint32_t size)529 fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
530 {
531 VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
532 assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
533
534 for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
535 if (ptr[count] != 0) {
536 return FALSE;
537 }
538 }
539 return TRUE;
540 }
541
/*
 * Load the FPU/vector registers from save area fstate.
 *
 * The restore instruction is chosen from fstate's fp_save_layout field. For
 * the XSAVE layouts the mask is the one belonging to current_xstate().
 * fstate must be 64-byte aligned and interrupts must be disabled (both
 * asserted). Panics on an unknown layout.
 */
static void
fpu_load_registers(void *fstate)
{
	struct x86_fx_thread_state *ifps = fstate;
	fp_save_layout_t layout = ifps->fp_save_layout;

	/*
	 * After early boot, the layout's 32/64-bit flavor must match the
	 * current thread's address size.
	 */
	assert(startup_phase < STARTUP_SUB_EARLY_BOOT || \
	    (thread_is_64bit_addr(current_thread()) ?                        \
	    (layout == FXSAVE64 || layout == XSAVE64) :     \
	    (layout == FXSAVE32 || layout == XSAVE32)));
	assert(ALIGNED(ifps, 64));
	assert(ml_get_interrupts_enabled() == FALSE);

#if     DEBUG
	if (layout == XSAVE32 || layout == XSAVE64) {
		struct x86_avx_thread_state *iavx = fstate;
		unsigned i;
		/* Verify reserved bits in the XSAVE header*/
		if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
			panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
		}
		for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
			if (iavx->_xh.xhrsvd[i]) {
				panic("Reserved bit set");
			}
		}
	}
	/* A YMM-capable configuration must never present a plain FXSAVE layout */
	if (fpu_YMM_capable) {
		if (layout != XSAVE32 && layout != XSAVE64) {
			panic("Inappropriate layout: %u", layout);
		}
	}
#endif  /* DEBUG */

	switch (layout) {
	case FXSAVE64:
		fxrstor64(ifps);
		break;
	case FXSAVE32:
		fxrstor(ifps);
		break;
	case XSAVE64:
		xrstor64(ifps, xstate_xmask[current_xstate()]);
		break;
	case XSAVE32:
		xrstor(ifps, xstate_xmask[current_xstate()]);
		break;
	default:
		panic("fpu_load_registers() bad layout: %d", layout);
	}
}
593
594 static void
fpu_store_registers(void * fstate,boolean_t is64)595 fpu_store_registers(void *fstate, boolean_t is64)
596 {
597 struct x86_fx_thread_state *ifps = fstate;
598 assert(ALIGNED(ifps, 64));
599 xstate_t xs = current_xstate();
600 switch (xs) {
601 case FP:
602 if (is64) {
603 fxsave64(fstate);
604 ifps->fp_save_layout = FXSAVE64;
605 } else {
606 fxsave(fstate);
607 ifps->fp_save_layout = FXSAVE32;
608 }
609 break;
610 case AVX:
611 case AVX512:
612 if (is64) {
613 xsave64(ifps, xstate_xmask[xs]);
614 ifps->fp_save_layout = XSAVE64;
615 } else {
616 xsave(ifps, xstate_xmask[xs]);
617 ifps->fp_save_layout = XSAVE32;
618 }
619 break;
620 default:
621 panic("fpu_store_registers() bad xstate: %d", xs);
622 }
623 }
624
625 /*
626 * Initialize FP handling.
627 */
628
/*
 * One-time module setup: create the save-area zone(s), then determine the
 * MXCSR capability mask and the initial/default register images.
 *
 * Panics if fpu_default is not one of FP, AVX or AVX512, i.e. if init_fpu()
 * has not selected a valid state yet.
 */
void
fpu_module_init(void)
{
	if (!IS_VALID_XSTATE(fpu_default)) {
		panic("fpu_module_init: invalid extended state %u",
		    fpu_default);
	}

	/* To maintain the required alignment, disable
	 * zone debugging for this zone as that appends
	 * 16 bytes to each element.
	 */
	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
	    fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);

	/*
	 * If AVX512 is supported, create a separate savearea zone.
	 */
	if (fpu_capability == AVX512) {
		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
		    fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
	}

	/* Determine MXCSR reserved bits and configure initial FPU state*/
	configure_mxcsr_capability_mask(&initial_fp_state);

#if DEBUG || DEVELOPMENT
	/* A feature override turns the check off by default ... */
	if (kern_feature_override(KF_DISABLE_FP_POPC_ON_PGFLT)) {
		fpsimd_fault_popc = 0;
	}

	/* Allow the explicit boot-arg to override the validation disables */
	PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
#endif
}
664
/*
 * Context switch FPU state.
 *
 * If the old thread has a save area whose contents are not valid (meaning
 * its registers are live in the FPU), save them and reload the default
 * register image. The new thread's state is never loaded here; it is left
 * to fault in. Finally switch XCR0 to the new thread's xstate if it differs
 * from this CPU's, and set CR0.TS.
 *
 * old  outgoing thread; dereferenced unconditionally.
 * new  incoming thread, or NULL, in which case fpu_default is used.
 *
 * Must be called with interrupts disabled (asserted).
 */

void
fpu_switch_context(thread_t old, thread_t new)
{
	struct x86_fx_thread_state *ifps;
	cpu_data_t *cdp = current_cpu_datap();
	xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;

	assert(ml_get_interrupts_enabled() == FALSE);
	ifps = (old)->machine.ifps;
#if     DEBUG
	if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
		panic("ifps->fp_valid: %u", ifps->fp_valid);
	}
#endif
	/* fp_valid == FALSE: the save area is stale, the registers hold the state */
	if (ifps != 0 && (ifps->fp_valid == FALSE)) {
		/* Clear CR0.TS in preparation for the FP context save. In
		 * theory, this shouldn't be necessary since a live FPU should
		 * indicate that TS is clear. However, various routines
		 * (such as sendsig & sigreturn) manipulate TS directly.
		 */
		clear_ts();
		/* registers are in FPU - save to memory */
		boolean_t is64 = (thread_is_64bit_addr(old) &&
		    is_saved_state64(old->machine.iss));

		fpu_store_registers(ifps, is64);
		ifps->fp_valid = TRUE;

		/*
		 * Replace the outgoing thread's register contents with the
		 * default image matching this CPU's current xstate.
		 */
		if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
			xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
		} else if (fpu_YMM_capable) {
			xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
		} else {
			fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
		}
	}

	/* XCR0 must agree with the per-CPU record before it is (possibly) changed */
	assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
	if (new_xstate != (xstate_t) cdp->cpu_xstate) {
		DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
		    old, new, xstate_name[new_xstate]);
		xsetbv(0, xstate_xmask[new_xstate]);
		cdp->cpu_xstate = new_xstate;
	}
	/* Leave the FPU disabled so the new thread's first FP use traps */
	set_ts();
}
717
718
719 /*
720 * Free a FPU save area.
721 * Called only when thread terminating - no locking necessary.
722 */
723 void
fpu_free(thread_t thread,void * fps)724 fpu_free(thread_t thread, void *fps)
725 {
726 pcb_t pcb = THREAD_TO_PCB(thread);
727
728 fp_state_free(fps, pcb->xstate);
729 pcb->xstate = UNDEFINED;
730 }
731
/*
 * Set the floating-point state for a thread based on the FXSave formatted data.
 * This is basically the same as fpu_set_state except it uses the expanded data
 * structure.
 * If the thread is not the current thread, it is not running (held).  Locking
 * needed against concurrent fpu_set_state or fpu_get_state.
 *
 * While translating between XNU FP state structures and the CPU-native XSAVE area,
 * if we detect state components that are all zeroes, we clear the corresponding
 * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
 * be initialized to a "clean" state.  That's most important when clearing the YMM
 * bit, since an initialized "upper clean" state results in a massive performance
 * improvement due to elimination of false dependencies between the XMMs and the
 * upper bits of the YMMs.
 *
 * Parameters:
 *   thr_act  target thread; must not be THREAD_NULL (asserted).
 *   tstate   incoming state, interpreted according to f. NULL means "discard
 *            the thread's FP state": any existing save area is freed.
 *            The MXCSR field of a non-NULL tstate is masked IN PLACE.
 *   f        state flavor. The AVX and AVX512 flavors select which extra
 *            register sets are copied from tstate.
 *
 * Returns KERN_FAILURE when fpu_capability is UNDEFINED, when an AVX flavor
 * is requested but fpu_capability is below AVX, or when an AVX512 flavor is
 * requested on an AVX thread and promotion fails. Otherwise KERN_SUCCESS.
 */
kern_return_t
fpu_set_fxstate(
	thread_t        thr_act,
	thread_state_t  tstate,
	thread_flavor_t f)
{
	struct x86_fx_thread_state      *ifps;
	struct x86_fx_thread_state      *new_ifps;
	x86_float_state64_t             *state;
	pcb_t                           pcb;
	boolean_t                       old_valid, fresh_state = FALSE;
	xstate_t                        thr_xstate;

	if (fpu_capability == UNDEFINED) {
		return KERN_FAILURE;
	}

	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
	    fpu_capability < AVX) {
		return KERN_FAILURE;
	}

	assert(thr_act != THREAD_NULL);

	thr_xstate = thread_xstate(thr_act);

	/* Setting an AVX512 flavor on an AVX thread promotes the thread first */
	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
	    thr_xstate == AVX) {
		if (!fpu_thread_promote_avx512(thr_act)) {
			return KERN_FAILURE;
		} else {
			/* Reload thr_xstate after successful promotion */
			thr_xstate = thread_xstate(thr_act);
		}
	}

	state = (x86_float_state64_t *)tstate;

	pcb = THREAD_TO_PCB(thr_act);

	if (state == NULL) {
		/*
		 * new FPU state is 'invalid'.
		 * Deallocate the fp state if it exists.
		 */
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		pcb->ifps = 0;

		simple_unlock(&pcb->lock);

		/* Free outside the lock */
		if (ifps != 0) {
			fp_state_free(ifps, thr_xstate);
		}
	} else {
		/*
		 * Valid incoming state. Allocate the fp state if there is none.
		 */
		new_ifps = 0;
Retry:
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		if (ifps == 0) {
			if (new_ifps == 0) {
				/*
				 * Allocation uses Z_WAITOK, so drop the lock,
				 * allocate, and re-check from the top.
				 */
				simple_unlock(&pcb->lock);
				new_ifps = fp_state_alloc(thr_xstate);
				goto Retry;
			}
			/* Install the freshly allocated area; ownership moves to the pcb */
			ifps = new_ifps;
			new_ifps = 0;
			pcb->ifps = ifps;
			pcb->xstate = thr_xstate;
			fresh_state = TRUE;
		}

		/*
		 * now copy over the new data.
		 */

		old_valid = ifps->fp_valid;

#if     DEBUG || DEVELOPMENT
		/*
		 * An existing area with fp_valid == FALSE means the registers
		 * are live; that is only acceptable for the current thread.
		 */
		if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
			panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
		}
#endif
		/*
		 * Clear any reserved bits in the MXCSR to prevent a GPF
		 * when issuing an FXRSTOR.
		 * Note: this writes to the caller-supplied state.
		 */

		state->fpu_mxcsr &= mxcsr_capability_mask;

		/*
		 * Copy sizeof(struct x86_fx_thread_state) bytes over the start
		 * of the save area. This overwrites every member of that
		 * structure, including fp_valid and fp_save_layout, which are
		 * re-established below.
		 */
		__nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);

		switch (thr_xstate) {
		case UNDEFINED_FULL:
		case FP_FULL:
		case AVX_FULL:
		case AVX512_FULL:
			panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
			break;

		case UNDEFINED:
			panic("fpu_set_fxstate() UNDEFINED xstate");
			break;
		case FP:
			ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
			break;
		case AVX: {
			struct x86_avx_thread_state *iavx = (void *) ifps;
			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX_XMASK;
			iavx->_xh.xcomp_bv  = 0;

			/*
			 * See the block comment at the top of the function for a description of why we're clearing
			 * xstate_bv bits.
			 */
			if (f == x86_AVX_STATE32) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
			} else if (f == x86_AVX_STATE64) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
			} else {
				/* Non-AVX flavor: only the x87 and SSE components are marked present */
				iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
			}
			break;
		}
		case AVX512: {
			struct x86_avx512_thread_state *iavx = (void *) ifps;
			/* Typed views of the same incoming buffer */
			union {
				thread_state_t       ts;
				x86_avx512_state32_t *s32;
				x86_avx512_state64_t *s64;
			} xs = { .ts = tstate };

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX512_XMASK;
			iavx->_xh.xcomp_bv  = 0;

			/*
			 * See the block comment at the top of the function for a description of why we're clearing
			 * xstate_bv bits.
			 *
			 * NOTE(review): there is no default case; for any other
			 * flavor xstate_bv is left at AVX512_XMASK.
			 */
			switch (f) {
			case x86_AVX512_STATE32:
				__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));

				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
				}

				/* Only the first 8 ZMM_Hi256 entries are supplied by the 32-bit flavor */
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
				}
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}

				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE32:
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				break;
			case x86_AVX512_STATE64:
				__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
				__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
				/*
				 * Note that it is valid to have XFEM_ZMM_OPMASK set but XFEM_YMM cleared.  In that case,
				 * the upper bits of the YMMs would be cleared and would result in a clean-upper
				 * state, allowing SSE instruction to avoid false dependencies.
				 */
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
				}

				/* Both ZMM components must be all-zero before either bit is cleared */
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
				    fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
				}

				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE64:
				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				break;
			}
			break;
		}
		}

		/* Put back the validity flag that the bulk copy above overwrote */
		ifps->fp_valid = old_valid;

		if (old_valid == FALSE) {
			/*
			 * The save area now holds the authoritative state.
			 * Mark it valid with interrupts off.
			 */
			boolean_t istate = ml_set_interrupts_enabled(FALSE);
			ifps->fp_valid = TRUE;
			/* If altering the current thread's state, disable FPU */
			if (thr_act == current_thread()) {
				set_ts();
			}

			ml_set_interrupts_enabled(istate);
		}

		simple_unlock(&pcb->lock);

		/*
		 * Still non-zero only if an area was allocated but another one
		 * was found installed on retry; release the unused allocation.
		 */
		if (new_ifps != 0) {
			fp_state_free(new_ifps, thr_xstate);
		}
	}
	return KERN_SUCCESS;
}
988
989 /*
990 * Get the floating-point state for a thread.
991 * If the thread is not the current thread, it is
992 * not running (held). Locking needed against
993 * concurrent fpu_set_state or fpu_get_state.
994 */
995 kern_return_t
fpu_get_fxstate(thread_t thr_act,thread_state_t tstate,thread_flavor_t f)996 fpu_get_fxstate(
997 thread_t thr_act,
998 thread_state_t tstate,
999 thread_flavor_t f)
1000 {
1001 struct x86_fx_thread_state *ifps;
1002 x86_float_state64_t *state;
1003 kern_return_t ret = KERN_FAILURE;
1004 pcb_t pcb;
1005 xstate_t thr_xstate = thread_xstate(thr_act);
1006
1007 if (fpu_capability == UNDEFINED) {
1008 return KERN_FAILURE;
1009 }
1010
1011 if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
1012 fpu_capability < AVX) {
1013 return KERN_FAILURE;
1014 }
1015
1016 if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
1017 thr_xstate != AVX512) {
1018 return KERN_FAILURE;
1019 }
1020
1021 state = (x86_float_state64_t *)tstate;
1022
1023 assert(thr_act != THREAD_NULL);
1024 pcb = THREAD_TO_PCB(thr_act);
1025
1026 simple_lock(&pcb->lock, LCK_GRP_NULL);
1027
1028 ifps = pcb->ifps;
1029 if (ifps == 0) {
1030 /*
1031 * No valid floating-point state.
1032 */
1033
1034 __nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
1035 fp_state_size[FP]);
1036
1037 simple_unlock(&pcb->lock);
1038
1039 return KERN_SUCCESS;
1040 }
1041 /*
1042 * Make sure we`ve got the latest fp state info
1043 * If the live fpu state belongs to our target
1044 */
1045 if (thr_act == current_thread()) {
1046 boolean_t intr;
1047
1048 intr = ml_set_interrupts_enabled(FALSE);
1049
1050 clear_ts();
1051 fp_save(thr_act);
1052 clear_fpu();
1053
1054 (void)ml_set_interrupts_enabled(intr);
1055 }
1056 if (ifps->fp_valid) {
1057 __nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
1058 switch (thr_xstate) {
1059 case UNDEFINED_FULL:
1060 case FP_FULL:
1061 case AVX_FULL:
1062 case AVX512_FULL:
1063 panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
1064 break;
1065
1066 case UNDEFINED:
1067 panic("fpu_get_fxstate() UNDEFINED xstate");
1068 break;
1069 case FP:
1070 break; /* already done */
1071 case AVX: {
1072 struct x86_avx_thread_state *iavx = (void *) ifps;
1073 x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
1074 if (f == x86_AVX_STATE32) {
1075 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1076 } else if (f == x86_AVX_STATE64) {
1077 __nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1078 }
1079 break;
1080 }
1081 case AVX512: {
1082 struct x86_avx512_thread_state *iavx = (void *) ifps;
1083 union {
1084 thread_state_t ts;
1085 x86_avx512_state32_t *s32;
1086 x86_avx512_state64_t *s64;
1087 } xs = { .ts = tstate };
1088 switch (f) {
1089 case x86_AVX512_STATE32:
1090 __nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1091 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
1092 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1093 DBG_AVX512_STATE(iavx);
1094 break;
1095 case x86_AVX_STATE32:
1096 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
1097 break;
1098 case x86_AVX512_STATE64:
1099 __nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
1100 __nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
1101 __nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
1102 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1103 DBG_AVX512_STATE(iavx);
1104 break;
1105 case x86_AVX_STATE64:
1106 __nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
1107 break;
1108 }
1109 break;
1110 }
1111 }
1112
1113 ret = KERN_SUCCESS;
1114 }
1115 simple_unlock(&pcb->lock);
1116
1117 return ret;
1118 }
1119
1120
1121
1122 /*
1123 * the child thread is 'stopped' with the thread
1124 * mutex held and is currently not known by anyone
1125 * so no way for fpu state to get manipulated by an
1126 * outside agency -> no need for pcb lock
1127 */
1128
1129 void
fpu_dup_fxstate(thread_t parent,thread_t child)1130 fpu_dup_fxstate(
1131 thread_t parent,
1132 thread_t child)
1133 {
1134 struct x86_fx_thread_state *new_ifps = NULL;
1135 boolean_t intr;
1136 pcb_t ppcb;
1137 xstate_t xstate = thread_xstate(parent);
1138
1139 ppcb = THREAD_TO_PCB(parent);
1140
1141 if (ppcb->ifps == NULL) {
1142 return;
1143 }
1144
1145 if (child->machine.ifps) {
1146 panic("fpu_dup_fxstate: child's ifps non-null");
1147 }
1148
1149 new_ifps = fp_state_alloc(xstate);
1150
1151 simple_lock(&ppcb->lock, LCK_GRP_NULL);
1152
1153 if (ppcb->ifps != NULL) {
1154 struct x86_fx_thread_state *ifps = ppcb->ifps;
1155 /*
1156 * Make sure we`ve got the latest fp state info
1157 */
1158 if (current_thread() == parent) {
1159 intr = ml_set_interrupts_enabled(FALSE);
1160 assert(current_thread() == parent);
1161 clear_ts();
1162 fp_save(parent);
1163 clear_fpu();
1164
1165 (void)ml_set_interrupts_enabled(intr);
1166 }
1167
1168 if (ifps->fp_valid) {
1169 child->machine.ifps = new_ifps;
1170 child->machine.xstate = xstate;
1171 __nochk_bcopy((char *)(ppcb->ifps),
1172 (char *)(child->machine.ifps),
1173 fp_state_size[xstate]);
1174
1175 /* Mark the new fp saved state as non-live. */
1176 /* Temporarily disabled: radar 4647827
1177 * new_ifps->fp_valid = TRUE;
1178 */
1179
1180 /*
1181 * Clear any reserved bits in the MXCSR to prevent a GPF
1182 * when issuing an FXRSTOR.
1183 */
1184 new_ifps->fx_MXCSR &= mxcsr_capability_mask;
1185 new_ifps = NULL;
1186 }
1187 }
1188 simple_unlock(&ppcb->lock);
1189
1190 if (new_ifps != NULL) {
1191 fp_state_free(new_ifps, xstate);
1192 }
1193 }
1194
1195 /*
1196 * Initialize FPU.
1197 * FNINIT programs the x87 control word to 0x37f, which matches
1198 * the desired default for macOS.
1199 */
1200
1201 void
fpinit(void)1202 fpinit(void)
1203 {
1204 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1205 clear_ts();
1206 fninit();
1207 #if DEBUG
1208 /* We skip this power-on-default verification sequence on
1209 * non-DEBUG, as dirtying the x87 control word may slow down
1210 * xsave/xrstor and affect energy use.
1211 */
1212 unsigned short control, control2;
1213 fnstcw(&control);
1214 control2 = control;
1215 control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
1216 control |= (FPC_PC_64 | /* Set precision */
1217 FPC_RC_RN | /* round-to-nearest */
1218 FPC_ZE | /* Suppress zero-divide */
1219 FPC_OE | /* and overflow */
1220 FPC_UE | /* underflow */
1221 FPC_IE | /* Allow NaNQs and +-INF */
1222 FPC_DE | /* Allow denorms as operands */
1223 FPC_PE); /* No trap for precision loss */
1224 assert(control == control2);
1225 fldcw(control);
1226 #endif
1227 /* Initialize SSE/SSE2 */
1228 __builtin_ia32_ldmxcsr(0x1f80);
1229 if (fpu_YMM_capable) {
1230 vzeroall();
1231 } else {
1232 xmmzeroall();
1233 }
1234 ml_set_interrupts_enabled(istate);
1235 }
1236
1237 /*
1238 * Coprocessor not present.
1239 */
1240
1241 uint64_t x86_isr_fp_simd_use;
1242
1243 void
fpnoextflt(void)1244 fpnoextflt(void)
1245 {
1246 boolean_t intr;
1247 thread_t thr_act;
1248 pcb_t pcb;
1249 struct x86_fx_thread_state *ifps = 0;
1250 xstate_t xstate = current_xstate();
1251
1252 thr_act = current_thread();
1253 pcb = THREAD_TO_PCB(thr_act);
1254
1255 if (pcb->ifps == 0 && !get_interrupt_level()) {
1256 ifps = fp_state_alloc(xstate);
1257 __nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
1258 fp_state_size[xstate]);
1259 if (!thread_is_64bit_addr(thr_act)) {
1260 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
1261 } else {
1262 ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
1263 }
1264 ifps->fp_valid = TRUE;
1265 }
1266 intr = ml_set_interrupts_enabled(FALSE);
1267
1268 clear_ts(); /* Enable FPU use */
1269
1270 if (__improbable(get_interrupt_level())) {
1271 /* Track number of #DNA traps at interrupt context,
1272 * which is likely suboptimal. Racy, but good enough.
1273 */
1274 x86_isr_fp_simd_use++;
1275 /*
1276 * Save current FP/SIMD context if valid
1277 * Initialize live FP/SIMD registers
1278 */
1279 if (pcb->ifps) {
1280 fp_save(thr_act);
1281 }
1282 fpinit();
1283 } else {
1284 if (pcb->ifps == 0) {
1285 pcb->ifps = ifps;
1286 pcb->xstate = xstate;
1287 ifps = 0;
1288 }
1289 /*
1290 * Load this thread`s state into coprocessor live context.
1291 */
1292 fp_load(thr_act);
1293 }
1294 (void)ml_set_interrupts_enabled(intr);
1295
1296 if (ifps) {
1297 fp_state_free(ifps, xstate);
1298 }
1299 }
1300
1301 /*
1302 * FPU overran end of segment.
1303 * Re-initialize FPU. Floating point state is not valid.
1304 */
1305
1306 void
fpextovrflt(void)1307 fpextovrflt(void)
1308 {
1309 thread_t thr_act = current_thread();
1310 pcb_t pcb;
1311 struct x86_fx_thread_state *ifps;
1312 boolean_t intr;
1313 xstate_t xstate = current_xstate();
1314
1315 intr = ml_set_interrupts_enabled(FALSE);
1316
1317 if (get_interrupt_level()) {
1318 panic("FPU segment overrun exception at interrupt context");
1319 }
1320 if (current_task() == kernel_task) {
1321 panic("FPU segment overrun exception in kernel thread context");
1322 }
1323
1324 /*
1325 * This is a non-recoverable error.
1326 * Invalidate the thread`s FPU state.
1327 */
1328 pcb = THREAD_TO_PCB(thr_act);
1329 simple_lock(&pcb->lock, LCK_GRP_NULL);
1330 ifps = pcb->ifps;
1331 pcb->ifps = 0;
1332 simple_unlock(&pcb->lock);
1333
1334 /*
1335 * Re-initialize the FPU.
1336 */
1337 clear_ts();
1338 fninit();
1339
1340 /*
1341 * And disable access.
1342 */
1343 clear_fpu();
1344
1345 (void)ml_set_interrupts_enabled(intr);
1346
1347 if (ifps) {
1348 fp_state_free(ifps, xstate);
1349 }
1350 }
1351
1352 /*
1353 * FPU error. Called by AST.
1354 */
1355
1356 void
fpexterrflt(void)1357 fpexterrflt(void)
1358 {
1359 thread_t thr_act = current_thread();
1360 boolean_t intr;
1361
1362 intr = ml_set_interrupts_enabled(FALSE);
1363
1364 if (get_interrupt_level()) {
1365 panic("FPU error exception at interrupt context");
1366 }
1367 if (current_task() == kernel_task) {
1368 panic("FPU error exception in kernel thread context");
1369 }
1370
1371 /*
1372 * Save the FPU state and turn off the FPU.
1373 */
1374 fp_save(thr_act);
1375 /* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1376 set_ts();
1377
1378 (void)ml_set_interrupts_enabled(intr);
1379 }
1380
1381 /*
1382 * Save FPU state.
1383 *
1384 * Locking not needed:
1385 * . if called from fpu_get_state, pcb already locked.
1386 * . if called from fpnoextflt or fp_intr, we are single-cpu
1387 * . otherwise, thread is running.
1388 * N.B.: Must be called with interrupts disabled
1389 */
1390
1391 void
fp_save(thread_t thr_act)1392 fp_save(
1393 thread_t thr_act)
1394 {
1395 pcb_t pcb = THREAD_TO_PCB(thr_act);
1396 struct x86_fx_thread_state *ifps = pcb->ifps;
1397
1398 assert(ifps != 0);
1399 if (ifps != 0 && !ifps->fp_valid) {
1400 assert((get_cr0() & CR0_TS) == 0);
1401 /* registers are in FPU */
1402 ifps->fp_valid = TRUE;
1403 fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1404 }
1405 }
1406
1407 /*
1408 * Restore FPU state from PCB.
1409 *
1410 * Locking not needed; always called on the current thread.
1411 */
1412
1413 void
fp_load(thread_t thr_act)1414 fp_load(
1415 thread_t thr_act)
1416 {
1417 pcb_t pcb = THREAD_TO_PCB(thr_act);
1418 struct x86_fx_thread_state *ifps = pcb->ifps;
1419
1420 assert(ifps);
1421 #if DEBUG
1422 if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1423 panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u",
1424 ifps->fp_valid, ifps->fp_save_layout);
1425 }
1426 #endif
1427
1428 if (ifps->fp_valid == FALSE) {
1429 fpinit();
1430 } else {
1431 fpu_load_registers(ifps);
1432 }
1433 ifps->fp_valid = FALSE; /* in FPU */
1434 }
1435
1436 /*
1437 * SSE arithmetic exception handling code.
1438 * Basically the same as the x87 exception handler with a different subtype
1439 */
1440
1441 void
fpSSEexterrflt(void)1442 fpSSEexterrflt(void)
1443 {
1444 thread_t thr_act = current_thread();
1445 boolean_t intr;
1446
1447 intr = ml_set_interrupts_enabled(FALSE);
1448
1449 if (get_interrupt_level()) {
1450 panic("SSE exception at interrupt context");
1451 }
1452 if (current_task() == kernel_task) {
1453 panic("SSE exception in kernel thread context");
1454 }
1455
1456 /*
1457 * Save the FPU state and turn off the FPU.
1458 */
1459 fp_save(thr_act);
1460 /* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1461 set_ts();
1462
1463 (void)ml_set_interrupts_enabled(intr);
1464 }
1465
1466
1467 /*
1468 * If a thread is using an AVX-sized savearea:
1469 * - allocate a new AVX512-sized area,
1470 * - copy the 256-bit state into the 512-bit area,
1471 * - deallocate the smaller area
1472 * ASSUMES: thread is the current thread.
1473 */
1474 static void
fpu_savearea_promote_avx512(thread_t thread)1475 fpu_savearea_promote_avx512(thread_t thread)
1476 {
1477 struct x86_avx_thread_state *ifps = NULL;
1478 struct x86_avx512_thread_state *ifps512 = NULL;
1479 pcb_t pcb = THREAD_TO_PCB(thread);
1480 boolean_t do_avx512_alloc = FALSE;
1481 boolean_t intr;
1482
1483 assert(thread == current_thread());
1484
1485 DBG("fpu_savearea_promote_avx512(%p)\n", thread);
1486
1487 simple_lock(&pcb->lock, LCK_GRP_NULL);
1488
1489 ifps = pcb->ifps;
1490 if (ifps == NULL) {
1491 pcb->xstate = AVX512;
1492 simple_unlock(&pcb->lock);
1493 /*
1494 * Now that the PCB xstate has been promoted, set XCR0 so
1495 * that we don't re-trip #UD on the next AVX-512 instruction.
1496 *
1497 * Since this branch is taken when the first FP instruction
1498 * attempted by this thread is an AVX-512 instruction, we
1499 * call fpnoextflt() to allocate an appropriately-sized
1500 * AVX-512 save-area, thereby avoiding the overhead of another
1501 * fault that would be triggered immediately on return.
1502 */
1503 intr = ml_set_interrupts_enabled(FALSE);
1504 xsetbv(0, AVX512_XMASK);
1505 current_cpu_datap()->cpu_xstate = AVX512;
1506 (void)ml_set_interrupts_enabled(intr);
1507
1508 fpnoextflt();
1509 return;
1510 }
1511
1512 if (pcb->xstate != AVX512) {
1513 do_avx512_alloc = TRUE;
1514 }
1515
1516 simple_unlock(&pcb->lock);
1517
1518 if (do_avx512_alloc == TRUE) {
1519 ifps512 = fp_state_alloc(AVX512);
1520 }
1521
1522 simple_lock(&pcb->lock, LCK_GRP_NULL);
1523
1524 intr = ml_set_interrupts_enabled(FALSE);
1525
1526 clear_ts();
1527 fp_save(thread);
1528 clear_fpu();
1529
1530 xsetbv(0, AVX512_XMASK);
1531 current_cpu_datap()->cpu_xstate = AVX512;
1532 (void)ml_set_interrupts_enabled(intr);
1533
1534 assert(ifps->fp.fp_valid);
1535
1536 /* Allocate an AVX512 savearea and copy AVX state into it */
1537 if (pcb->xstate != AVX512) {
1538 __nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
1539 pcb->ifps = ifps512;
1540 pcb->xstate = AVX512;
1541 ifps512 = NULL;
1542 } else {
1543 ifps = NULL;
1544 }
1545 /* The PCB lock is redundant in some scenarios given the higher level
1546 * thread mutex, but its pre-emption disablement is relied upon here
1547 */
1548 simple_unlock(&pcb->lock);
1549
1550 if (ifps) {
1551 fp_state_free(ifps, AVX);
1552 }
1553 if (ifps512) {
1554 fp_state_free(ifps, AVX512);
1555 }
1556 }
1557
1558 /*
1559 * Upgrade the calling thread to AVX512.
1560 */
1561 boolean_t
fpu_thread_promote_avx512(thread_t thread)1562 fpu_thread_promote_avx512(thread_t thread)
1563 {
1564 task_t task = current_task();
1565
1566 if (thread != current_thread()) {
1567 return FALSE;
1568 }
1569 if (!ml_fpu_avx512_enabled()) {
1570 return FALSE;
1571 }
1572
1573 fpu_savearea_promote_avx512(thread);
1574
1575 /* Racy but the task's xstate is only a hint */
1576 task->xstate = AVX512;
1577
1578 return TRUE;
1579 }
1580
1581
1582 /*
1583 * Called from user_trap() when an invalid opcode fault is taken.
1584 * If the user is attempting an AVX512 instruction on a machine
1585 * that supports this, we switch the calling thread to use
1586 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1587 * return to user_trap() with a 0 return value.
1588 * Otherwise, simply return a nonzero value.
1589 */
1590
1591 #define MAX_X86_INSN_LENGTH (15)
1592 int
fpUDflt(user_addr_t rip)1593 fpUDflt(user_addr_t rip)
1594 {
1595 uint8_t instruction_prefix;
1596 boolean_t is_AVX512_instruction = FALSE;
1597 user_addr_t original_rip = rip;
1598
1599 /*
1600 * If this thread's xstate is already AVX512, then this #UD is
1601 * a true #UD.
1602 */
1603 if (thread_xstate(current_thread()) == AVX512) {
1604 return 1;
1605 }
1606
1607 do {
1608 /* TODO: as an optimisation, copy up to the lesser of the
1609 * next page boundary or maximal prefix length in one pass
1610 * rather than issue multiple copyins
1611 */
1612 if (copyin(rip, (char *) &instruction_prefix, 1)) {
1613 return 1;
1614 }
1615 DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1616 rip, instruction_prefix);
1617 /* TODO: determine more specifically which prefixes
1618 * are sane possibilities for AVX512 insns
1619 */
1620 switch (instruction_prefix) {
1621 case 0x2E: /* CS segment override */
1622 case 0x36: /* SS segment override */
1623 case 0x3E: /* DS segment override */
1624 case 0x26: /* ES segment override */
1625 case 0x64: /* FS segment override */
1626 case 0x65: /* GS segment override */
1627 case 0x66: /* Operand-size override */
1628 case 0x67: /* address-size override */
1629 /* Skip optional prefixes */
1630 rip++;
1631 if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
1632 return 1;
1633 }
1634 break;
1635 case 0x62: /* EVEX */
1636 case 0xC5: /* VEX 2-byte */
1637 case 0xC4: /* VEX 3-byte */
1638 is_AVX512_instruction = TRUE;
1639 break;
1640 default:
1641 return 1;
1642 }
1643 } while (!is_AVX512_instruction);
1644
1645 /* Here if we detect attempted execution of an AVX512 instruction */
1646
1647 /*
1648 * Fail if this machine doesn't support AVX512
1649 */
1650 if (fpu_capability != AVX512) {
1651 return 1;
1652 }
1653
1654 assert(xgetbv(XCR0) == AVX_XMASK);
1655
1656 DBG("fpUDflt() switching xstate to AVX512\n");
1657 (void) fpu_thread_promote_avx512(current_thread());
1658
1659 return 0;
1660 }
1661
1662 void
fp_setvalid(boolean_t value)1663 fp_setvalid(boolean_t value)
1664 {
1665 thread_t thr_act = current_thread();
1666 struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
1667
1668 if (ifps) {
1669 ifps->fp_valid = value;
1670
1671 if (value == TRUE) {
1672 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1673 clear_fpu();
1674 ml_set_interrupts_enabled(istate);
1675 }
1676 }
1677 }
1678
1679 boolean_t
ml_fpu_avx_enabled(void)1680 ml_fpu_avx_enabled(void)
1681 {
1682 return fpu_capability >= AVX;
1683 }
1684
1685 boolean_t
ml_fpu_avx512_enabled(void)1686 ml_fpu_avx512_enabled(void)
1687 {
1688 return fpu_capability == AVX512;
1689 }
1690
1691 static xstate_t
thread_xstate(thread_t thread)1692 thread_xstate(thread_t thread)
1693 {
1694 xstate_t xs = THREAD_TO_PCB(thread)->xstate;
1695 if (xs != UNDEFINED) {
1696 return xs;
1697 } else if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
1698 return fpu_default;
1699 } else {
1700 return get_threadtask(thread)->xstate;
1701 }
1702 }
1703
1704 xstate_t
current_xstate(void)1705 current_xstate(void)
1706 {
1707 return thread_xstate(current_thread());
1708 }
1709
1710 /*
1711 * Called when exec'ing between bitnesses.
1712 * If valid FPU state exists, adjust the layout.
1713 */
1714 void
fpu_switch_addrmode(thread_t thread,boolean_t is_64bit)1715 fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1716 {
1717 struct x86_fx_thread_state *ifps = thread->machine.ifps;
1718 mp_disable_preemption();
1719
1720 if (ifps && ifps->fp_valid) {
1721 if (thread_xstate(thread) == FP) {
1722 ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1723 } else {
1724 ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1725 }
1726 }
1727 mp_enable_preemption();
1728 }
1729
1730 #if DEBUG || DEVELOPMENT
/*
 * Return the population count (number of set bits) of the `sz` bytes
 * starting at address `ins`.
 *
 * The buffer is consumed in 16-byte strides (two 64-bit loads), then
 * 4-byte words, then single bytes.  A non-positive `sz` yields 0.
 * The wide loads go through casted pointers, so callers are expected
 * to pass a suitably aligned address.
 */
static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;

	while (sz >= 16) {
		uint32_t rv1, rv2;
		uint64_t *ins64 = (uint64_t *) ins;
		uint64_t *ins642 = (uint64_t *) (ins + 8);
		rv1 = __builtin_popcountll(*ins64);
		rv2 = __builtin_popcountll(*ins642);
		rv += rv1 + rv2;
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		uint32_t *ins32 = (uint32_t *) ins;
		rv += __builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		/*
		 * Read the byte as unsigned: a plain (signed) char with the
		 * high bit set would sign-extend when promoted and add 24
		 * spurious bits to the count.
		 */
		const uint8_t *ins8 = (const uint8_t *) ins;
		rv += __builtin_popcount(*ins8);
		sz--;
		ins++;
	}
	return rv;
}
1763
1764 bool
thread_fpsimd_hash_enabled(void)1765 thread_fpsimd_hash_enabled(void)
1766 {
1767 return fpsimd_fault_popc ? true : false;
1768 }
1769
1770 uint32_t __attribute__((noinline))
thread_fpsimd_hash(thread_t ft)1771 thread_fpsimd_hash(thread_t ft)
1772 {
1773 uint32_t prv = 0;
1774 boolean_t istate = ml_set_interrupts_enabled(FALSE);
1775 struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;
1776
1777 if (pifps) {
1778 if (pifps->fp_valid) {
1779 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1780 sizeof(pifps->fx_XMM_reg));
1781 } else {
1782 uintptr_t cr0 = get_cr0();
1783 /*
1784 * The unusual case where the fp save area is not valid, yet TS is set,
1785 * is used to perform a lazy-init of FP state, so for this specific case,
1786 * assume that the popcount of the FP regs is 0.
1787 */
1788 if (!(cr0 & CR0_TS)) {
1789 fp_save(ft);
1790 prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
1791 sizeof(pifps->fx_XMM_reg));
1792 pifps->fp_valid = FALSE;
1793 }
1794 }
1795 }
1796 ml_set_interrupts_enabled(istate);
1797 return prv;
1798 }
1799 #endif /* DEBUG || DEVELOPMENT */
1800