/* xnu-8020.101.4/osfmk/i386/fpu.c (revision e7776783b89a353188416a9a346c6cdb4928faad) */
/*
 * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
 * @OSF_COPYRIGHT@
 */
/*
 * Mach Operating System
 * Copyright (c) 1992-1990 Carnegie Mellon University
 * All Rights Reserved.
 *
 * Permission to use, copy, modify and distribute this software and its
 * documentation is hereby granted, provided that both the copyright
 * notice and this permission notice appear in all copies of the
 * software, derivative works or modified versions, and any portions
 * thereof, and that both notices appear in supporting documentation.
 *
 * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
 * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
 * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
 *
 * Carnegie Mellon requests users of this software to return to
 *
 *  Software Distribution Coordinator  or  [email protected]
 *  School of Computer Science
 *  Carnegie Mellon University
 *  Pittsburgh PA 15213-3890
 *
 * any improvements or extensions that they make and grant Carnegie Mellon
 * the rights to redistribute these changes.
 */

#include <mach/exception_types.h>
#include <mach/i386/thread_status.h>
#include <mach/i386/fp_reg.h>

#include <kern/mach_param.h>
#include <kern/processor.h>
#include <kern/thread.h>
#include <kern/zalloc.h>
#include <kern/misc_protos.h>
#include <kern/spl.h>
#include <kern/assert.h>

#include <libkern/OSAtomic.h>

#include <architecture/i386/pio.h>
#include <i386/cpuid.h>
#include <i386/fpu.h>
#include <i386/proc_reg.h>
#include <i386/misc_protos.h>
#include <i386/thread.h>
#include <i386/trap.h>

xstate_t        fpu_capability = UNDEFINED;     /* extended state capability */
xstate_t        fpu_default = UNDEFINED;        /* default extended state */

#define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
#define VERIFY_SAVEAREA_ALIGNED(p, a) \
	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))

/* Forward */

extern void             fpinit(void);
extern void             fp_save(
	thread_t        thr_act);
extern void             fp_load(
	thread_t        thr_act);

static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
static xstate_t thread_xstate(thread_t);

x86_ext_thread_state_t  initial_fp_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_avx512_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_avx_state __attribute((aligned(64)));
x86_ext_thread_state_t  default_fx_state __attribute((aligned(64)));

/* Global MXCSR capability bitmask */
static unsigned int mxcsr_capability_mask;

#define fninit() \
	__asm__ volatile("fninit")

#define fnstcw(control) \
	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))

#define fldcw(control) \
	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )

#define fnclex() \
	__asm__ volatile("fnclex")

#define fnsave(state)  \
	__asm__ volatile("fnsave %0" : "=m" (*state))

#define frstor(state) \
	__asm__ volatile("frstor %0" : : "m" (state))

#define fwait() \
	__asm__("fwait");

static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor %0" ::  "m" (*a));
}

static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave %0" : "=m" (*a));
}

static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor64 %0" ::  "m" (*a));
}

static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
}
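
/*
 * The REX.W forms (fxsave64/fxrstor64) use the 64-bit layout of the FXSAVE
 * area, recording full 64-bit FPU instruction/operand pointers; the legacy
 * forms record 32-bit pointers plus segment selectors. Because the two memory
 * layouts differ, the layout in use is tracked per savearea in fp_save_layout
 * (FXSAVE32 vs. FXSAVE64).
 */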

#define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)

SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
	[FP]     = NULL,
	[AVX]    = NULL,
	[AVX512] = NULL
};
static const uint32_t fp_state_size[] = {
	[FP]     = sizeof(struct x86_fx_thread_state),
	[AVX]    = sizeof(struct x86_avx_thread_state),
	[AVX512] = sizeof(struct x86_avx512_thread_state)
};

static const char *const xstate_name[] = {
	[UNDEFINED] = "UNDEFINED",
	[FP] = "FP",
	[AVX] = "AVX",
	[AVX512] = "AVX512"
};

#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
/*
 * On-demand AVX512 support
 * ------------------------
 * On machines with AVX512 support, by default, threads are created with
 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
 * capabilities are advertised in the commpage and via sysctl. If a thread
 * opts to use AVX512 instructions, the first will result in a #UD exception.
 * Faulting AVX512 instructions are recognizable by their unique prefix.
 * This exception results in the thread being promoted to use an AVX512-sized
 * savearea and in the AVX512 feature bits being set in its XCR0. The faulting
 * instruction is re-driven and the thread can proceed to perform AVX512
 * operations.
 *
 * In addition to AVX512 instructions causing promotion, the thread_set_state()
 * primitive with an AVX512 state flavor results in promotion.
 *
 * AVX512 promotion of the first thread in a task causes the default xstate
 * of the task to be promoted so that any subsequently created or subsequently
 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
 * a promoted xstate.
 *
 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
 * and a second pool of larger AVX512-sized (2688 byte) areas.
 *
 * Note the initial state value is an AVX512 object but that the AVX initial
 * value is a subset of it.
 */
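/*
 * The promotion path, in the order the routines below participate, is
 * roughly: a #UD trap lands in fpUDflt(), which decodes the faulting
 * instruction's prefix; on a plausible AVX512 encoding it calls
 * fpu_thread_promote_avx512(), which in turn calls
 * fpu_savearea_promote_avx512() to swap in the larger savearea and widen
 * XCR0 before the instruction is re-driven.
 */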
static uint32_t cpuid_reevaluated = 0;

static void fpu_store_registers(void *, boolean_t);
static void fpu_load_registers(void *);

static const uint32_t xstate_xmask[] = {
	[FP] =          FP_XMASK,
	[AVX] =         AVX_XMASK,
	[AVX512] =      AVX512_XMASK
};

static inline void
xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
}

static inline void
xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
}

static inline void
xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor %0" ::  "m" (*a), "a"(rfbm), "d"(0));
}

static inline void
xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0));
}
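
/*
 * XSAVE/XRSTOR take the requested-feature bitmap (RFBM) in edx:eax; every
 * feature this file manages lives in the low 32 bits, so edx is pinned to 0
 * and only the low word of the mask is passed in.
 */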

__unused static inline void
vzeroupper(void)
{
	__asm__ __volatile__ ("vzeroupper" ::);
}

static boolean_t fpu_thread_promote_avx512(thread_t);   /* Forward */


/*
 * Make compile-time asserts that no padding creeps into the structures
 * whose register arrays we bcopy to/from wholesale below.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt)                 \
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2   \
	[(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]
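
/*
 * The macro declares an extern char array whose size collapses to -1 (a
 * compile error) unless members m1..m2 are exactly n contiguous elements
 * of type mt, i.e. the span is densely packed with no padding.
 */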

ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);

ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);

#if defined(DEBUG_AVX512)

#define DBG(x...)       kprintf("DBG: " x)

typedef struct { uint8_t byte[8]; }  opmask_t;
typedef struct { uint8_t byte[16]; } xmm_t;
typedef struct { uint8_t byte[32]; } ymm_t;
typedef struct { uint8_t byte[64]; } zmm_t;

static void
DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
{
	int     i, j;
	xmm_t *xmm  = (xmm_t *) &sp->fp.fx_XMM_reg;
	xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
	ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
	zmm_t *zmm  = (zmm_t *) &sp->x_Hi16_ZMM;
	opmask_t *k = (opmask_t *) &sp->x_Opmask;

	kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
	kprintf("x_Opmask:    %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
	kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
	kprintf("x_Hi16_ZMM:  %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));

	kprintf("XCR0:   0x%016llx\n", xgetbv(XCR0));
	kprintf("XINUSE: 0x%016llx\n", xgetbv(1));

	/* Print all ZMM registers */
	for (i = 0; i < 16; i++) {
		kprintf("zmm%d:\t0x", i);
		for (j = 0; j < 16; j++) {
			kprintf("%02x", xmm[i].byte[j]);
		}
		for (j = 0; j < 16; j++) {
			kprintf("%02x", ymmh[i].byte[j]);
		}
		for (j = 0; j < 32; j++) {
			kprintf("%02x", zmmh[i].byte[j]);
		}
		kprintf("\n");
	}
	for (i = 0; i < 16; i++) {
		kprintf("zmm%d:\t0x", 16 + i);
		for (j = 0; j < 64; j++) {
			kprintf("%02x", zmm[i].byte[j]);
		}
		kprintf("\n");
	}
	for (i = 0; i < 8; i++) {
		kprintf("k%d:\t0x", i);
		for (j = 0; j < 8; j++) {
			kprintf("%02x", k[i].byte[j]);
		}
		kprintf("\n");
	}

	kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
	kprintf("xcomp_bv:  0x%016llx\n", sp->_xh.xcomp_bv);
}
#else
#define DBG(x...)
static void
DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
{
	return;
}
#endif /* DEBUG_AVX512 */

#if     DEBUG
static inline unsigned short
fnstsw(void)
{
	unsigned short status;
	__asm__ volatile ("fnstsw %0" : "=ma" (status));
	return status;
}
#endif

/*
 * Configure the initial FPU state presented to new threads.
 * Determine the MXCSR capability mask, which allows us to mask off any
 * potentially unsafe "reserved" bits before restoring the FPU context.
 * *Not* per-cpu, assumes symmetry.
 */

static void
configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
{
	/* XSAVE requires a 64 byte aligned store */
	assert(ALIGNED(fps, 64));
	/* Clear, to prepare for the diagnostic FXSAVE */
	bzero(fps, sizeof(*fps));

	fpinit();
	fpu_store_registers(fps, FALSE);

	mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;

	/* If the CPU reports a zero MXCSR_MASK, fall back to the
	 * architecturally defined default mask (0xFFBF) */
	if (mxcsr_capability_mask == 0) {
		mxcsr_capability_mask = 0xffbf;
	}

	/* Clear vector register store */
	bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
	bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
	if (fpu_ZMM_capable) {
		bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
		bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
		bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
	}

	fps->fx.fp_valid = TRUE;
	fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
	fpu_load_registers(fps);

	if (fpu_ZMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
	}
	if (fpu_YMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
	} else {
		fxsave64((struct x86_fx_thread_state *)&default_fx_state);
	}

	/* Poison values to trap unsafe usage */
	fps->fx.fp_valid = 0xFFFFFFFF;
	fps->fx.fp_save_layout = FP_UNUSED;

	/* Re-enable FPU/SSE DNA exceptions */
	set_ts();
}

int fpsimd_fault_popc = 0;
/*
 * Look for FPU and initialize it.
 * Called on each CPU.
 */
void
init_fpu(void)
{
#if     DEBUG
	unsigned short  status;
	unsigned short  control;
#endif
	/*
	 * Check for FPU by initializing it,
	 * then trying to read the correct bit patterns from
	 * the control and status registers.
	 */
	set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE);       /* allow use of FPU */
	fninit();
#if     DEBUG
	status = fnstsw();
	fnstcw(&control);

	assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
#endif
	/* Advertise SSE support */
	if (cpuid_features() & CPUID_FEATURE_FXSR) {
		set_cr4(get_cr4() | CR4_OSFXS);
		/* And allow SIMD exceptions if present */
		if (cpuid_features() & CPUID_FEATURE_SSE) {
			set_cr4(get_cr4() | CR4_OSXMM);
		}
	} else {
		panic("fpu is not FP_FXSR");
	}

	fpu_capability = fpu_default = FP;

	PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));

	static boolean_t is_avx512_enabled = TRUE;
	if (cpu_number() == master_cpu) {
		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
			PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
			kprintf("AVX512 supported %s\n",
			    is_avx512_enabled ? "and enabled" : "but disabled");
		}
	}

	/* Configure the XSAVE context mechanism if the processor supports
	 * AVX/YMM registers
	 */
	if (cpuid_features() & CPUID_FEATURE_XSAVE) {
		cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
		if (is_avx512_enabled &&
		    (xs0p->extended_state[eax] & XFEM_ZMM_OPMASK) == XFEM_ZMM_OPMASK) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			assert(xs0p->extended_state[eax] & XFEM_YMM);
			fpu_capability = AVX512;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
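			/*
			 * CR4.OSXSAVE must be set before XSETBV is executed;
			 * xsetbv with OSXSAVE clear raises #UD.
			 */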
			xsetbv(0, AVX512_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
			/*
			 * AVX set until AVX512 is used.
			 * See comment above about on-demand AVX512 support.
			 */
			xsetbv(0, AVX_XMASK);
			fpu_default = AVX;
		} else if (xs0p->extended_state[eax] & XFEM_YMM) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			fpu_capability = AVX;
			fpu_default = AVX;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			xsetbv(0, AVX_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
		}
	}

	if (cpu_number() == master_cpu) {
		kprintf("fpu_state: %s, state_size: %d\n",
		    xstate_name[fpu_capability],
		    fp_state_size[fpu_capability]);
	}

	fpinit();
	current_cpu_datap()->cpu_xstate = fpu_default;

	/*
	 * Trap wait instructions.  Turn off FPU for now.
	 */
	set_cr0(get_cr0() | CR0_TS | CR0_MP);
}

/*
 * Allocate and initialize FP state for specified xstate.
 * Don't load state.
 */
static void *
fp_state_alloc(xstate_t xs)
{
	assert(ifps_zone[xs] != NULL);
	return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
}

static inline void
fp_state_free(void *ifps, xstate_t xs)
{
	assert(ifps_zone[xs] != NULL);
	zfree(ifps_zone[xs], ifps);
}

void
clear_fpu(void)
{
	set_ts();
}

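/*
 * Returns TRUE if the given 8-byte-aligned savearea component is entirely
 * zero. Used when translating user-supplied state into the XSAVE area so
 * that an all-zero component's xstate_bv bit can be cleared, letting XRSTOR
 * put that component into its "init" configuration (see fpu_set_fxstate()).
 */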
static boolean_t
fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
{
	VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
	assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");

	for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
		if (ptr[count] != 0) {
			return FALSE;
		}
	}
	return TRUE;
}

static void
fpu_load_registers(void *fstate)
{
	struct x86_fx_thread_state *ifps = fstate;
	fp_save_layout_t layout = ifps->fp_save_layout;

	assert(startup_phase < STARTUP_SUB_EARLY_BOOT || \
	    (thread_is_64bit_addr(current_thread()) ?                        \
	    (layout == FXSAVE64 || layout == XSAVE64) :     \
	    (layout == FXSAVE32 || layout == XSAVE32)));
	assert(ALIGNED(ifps, 64));
	assert(ml_get_interrupts_enabled() == FALSE);

#if     DEBUG
	if (layout == XSAVE32 || layout == XSAVE64) {
		struct x86_avx_thread_state *iavx = fstate;
		unsigned i;
		/* Verify reserved bits in the XSAVE header*/
		if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
			panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
		}
		for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
			if (iavx->_xh.xhrsvd[i]) {
				panic("Reserved bit set");
			}
		}
	}
	if (fpu_YMM_capable) {
		if (layout != XSAVE32 && layout != XSAVE64) {
			panic("Inappropriate layout: %u", layout);
		}
	}
#endif  /* DEBUG */

	switch (layout) {
	case FXSAVE64:
		fxrstor64(ifps);
		break;
	case FXSAVE32:
		fxrstor(ifps);
		break;
	case XSAVE64:
		xrstor64(ifps, xstate_xmask[current_xstate()]);
		break;
	case XSAVE32:
		xrstor(ifps, xstate_xmask[current_xstate()]);
		break;
	default:
		panic("fpu_load_registers() bad layout: %d", layout);
	}
}

static void
fpu_store_registers(void *fstate, boolean_t is64)
{
	struct x86_fx_thread_state *ifps = fstate;
	assert(ALIGNED(ifps, 64));
	xstate_t xs = current_xstate();
	switch (xs) {
	case FP:
		if (is64) {
			fxsave64(fstate);
			ifps->fp_save_layout = FXSAVE64;
		} else {
			fxsave(fstate);
			ifps->fp_save_layout = FXSAVE32;
		}
		break;
	case AVX:
	case AVX512:
		if (is64) {
			xsave64(ifps, xstate_xmask[xs]);
			ifps->fp_save_layout = XSAVE64;
		} else {
			xsave(ifps, xstate_xmask[xs]);
			ifps->fp_save_layout = XSAVE32;
		}
		break;
	default:
		panic("fpu_store_registers() bad xstate: %d", xs);
	}
}

/*
 * Initialize FP handling.
 */

void
fpu_module_init(void)
{
	if (!IS_VALID_XSTATE(fpu_default)) {
		panic("fpu_module_init: invalid extended state %u",
		    fpu_default);
	}

	/* To maintain the required alignment, disable
	 * zone debugging for this zone as that appends
	 * 16 bytes to each element.
	 */
	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
	    fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);

	/*
	 * If AVX512 is supported, create a separate savearea zone.
	 */
	if (fpu_capability == AVX512) {
		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
		    fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
	}

	/* Determine MXCSR reserved bits and configure initial FPU state*/
	configure_mxcsr_capability_mask(&initial_fp_state);
}

/*
 * Context switch fpu state.
 * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
 * Switch to the new task's xstate.
 */

void
fpu_switch_context(thread_t old, thread_t new)
{
	struct x86_fx_thread_state      *ifps;
	cpu_data_t *cdp = current_cpu_datap();
	xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;

	assert(ml_get_interrupts_enabled() == FALSE);
	ifps = (old)->machine.ifps;
#if     DEBUG
	if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
		panic("ifps->fp_valid: %u", ifps->fp_valid);
	}
#endif
	if (ifps != 0 && (ifps->fp_valid == FALSE)) {
		/* Clear CR0.TS in preparation for the FP context save. In
		 * theory, this shouldn't be necessary since a live FPU should
		 * indicate that TS is clear. However, various routines
		 * (such as sendsig & sigreturn) manipulate TS directly.
		 */
		clear_ts();
		/* registers are in FPU - save to memory */
		boolean_t is64 = (thread_is_64bit_addr(old) &&
		    is_saved_state64(old->machine.iss));

		fpu_store_registers(ifps, is64);
		ifps->fp_valid = TRUE;

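		/*
		 * With the outgoing thread's context captured, reload the
		 * live registers with the default state so stale values are
		 * not left behind for other contexts to observe.
		 */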
		if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
			xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
		} else if (fpu_YMM_capable) {
			xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
		} else {
			fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
		}
	}

	assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
	if (new_xstate != (xstate_t) cdp->cpu_xstate) {
		DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
		    old, new, xstate_name[new_xstate]);
		xsetbv(0, xstate_xmask[new_xstate]);
		cdp->cpu_xstate = new_xstate;
	}
	set_ts();
}


/*
 * Free a FPU save area.
 * Called only when thread terminating - no locking necessary.
 */
void
fpu_free(thread_t thread, void *fps)
{
	pcb_t   pcb = THREAD_TO_PCB(thread);

	fp_state_free(fps, pcb->xstate);
	pcb->xstate = UNDEFINED;
}

/*
 * Set the floating-point state for a thread based on the FXSave formatted data.
 * This is basically the same as fpu_set_state except it uses the expanded data
 * structure.
 * If the thread is not the current thread, it is not running (held).  Locking
 * needed against concurrent fpu_set_state or fpu_get_state.
 *
 * While translating between XNU FP state structures and the CPU-native XSAVE area,
 * if we detect state components that are all zeroes, we clear the corresponding
 * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
 * be initialized to a "clean" state.  That's most important when clearing the YMM
 * bit, since an initialized "upper clean" state results in a massive performance
 * improvement due to elimination of false dependencies between the XMMs and the
 * upper bits of the YMMs.
 */
kern_return_t
fpu_set_fxstate(
	thread_t        thr_act,
	thread_state_t  tstate,
	thread_flavor_t f)
{
	struct x86_fx_thread_state      *ifps;
	struct x86_fx_thread_state      *new_ifps;
	x86_float_state64_t             *state;
	pcb_t                           pcb;
	boolean_t                       old_valid, fresh_state = FALSE;
	xstate_t                        thr_xstate;

	if (fpu_capability == UNDEFINED) {
		return KERN_FAILURE;
	}

	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
	    fpu_capability < AVX) {
		return KERN_FAILURE;
	}

	assert(thr_act != THREAD_NULL);

	thr_xstate = thread_xstate(thr_act);

	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
	    thr_xstate == AVX) {
		if (!fpu_thread_promote_avx512(thr_act)) {
			return KERN_FAILURE;
		} else {
			/* Reload thr_xstate after successful promotion */
			thr_xstate = thread_xstate(thr_act);
		}
	}

	state = (x86_float_state64_t *)tstate;

	pcb = THREAD_TO_PCB(thr_act);

	if (state == NULL) {
		/*
		 * new FPU state is 'invalid'.
		 * Deallocate the fp state if it exists.
		 */
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		pcb->ifps = 0;

		simple_unlock(&pcb->lock);

		if (ifps != 0) {
			fp_state_free(ifps, thr_xstate);
		}
	} else {
		/*
		 * Valid incoming state. Allocate the fp state if there is none.
		 */
		new_ifps = 0;
Retry:
		simple_lock(&pcb->lock, LCK_GRP_NULL);

		ifps = pcb->ifps;
		if (ifps == 0) {
			if (new_ifps == 0) {
				simple_unlock(&pcb->lock);
				new_ifps = fp_state_alloc(thr_xstate);
				goto Retry;
			}
			ifps = new_ifps;
			new_ifps = 0;
			pcb->ifps = ifps;
			pcb->xstate = thr_xstate;
			fresh_state = TRUE;
		}

		/*
		 * now copy over the new data.
		 */

		old_valid = ifps->fp_valid;

#if     DEBUG || DEVELOPMENT
		if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
			panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
		}
#endif
		/*
		 * Clear any reserved bits in the MXCSR to prevent a GPF
		 * when issuing an FXRSTOR.
		 */

		state->fpu_mxcsr &= mxcsr_capability_mask;

		__nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);

		switch (thr_xstate) {
		case UNDEFINED_FULL:
		case FP_FULL:
		case AVX_FULL:
		case AVX512_FULL:
			panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
			break;

		case UNDEFINED:
			panic("fpu_set_fxstate() UNDEFINED xstate");
			break;
		case FP:
			ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
			break;
		case AVX: {
			struct x86_avx_thread_state *iavx = (void *) ifps;
			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX_XMASK;
			iavx->_xh.xcomp_bv  = 0;

			/*
			 * See the block comment at the top of the function for a description of why we're clearing
			 * xstate_bv bits.
			 */
			if (f == x86_AVX_STATE32) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
			} else if (f == x86_AVX_STATE64) {
				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
			} else {
				iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
			}
			break;
		}
		case AVX512: {
			struct x86_avx512_thread_state *iavx = (void *) ifps;
			union {
				thread_state_t       ts;
				x86_avx512_state32_t *s32;
				x86_avx512_state64_t *s64;
			} xs = { .ts = tstate };

			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;

			/* Sanitize XSAVE header */
			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
			iavx->_xh.xstate_bv = AVX512_XMASK;
			iavx->_xh.xcomp_bv  = 0;

			/*
			 * See the block comment at the top of the function for a description of why we're clearing
			 * xstate_bv bits.
			 */
			switch (f) {
			case x86_AVX512_STATE32:
				__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));

				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
				}

				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
				}
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}

				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE32:
				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				break;
			case x86_AVX512_STATE64:
				__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
				__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
				/*
				 * Note that it is valid to have XFEM_ZMM_OPMASK set but XFEM_YMM cleared.  In that case,
				 * the upper bits of the YMMs would be cleared and would result in a clean-upper
				 * state, allowing SSE instructions to avoid false dependencies.
				 */
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
				}

				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
				    fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
				}

				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE64:
				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
					iavx->_xh.xstate_bv &= ~XFEM_YMM;
				}
				break;
			}
			break;
		}
		}

		ifps->fp_valid = old_valid;

		if (old_valid == FALSE) {
			boolean_t istate = ml_set_interrupts_enabled(FALSE);
			ifps->fp_valid = TRUE;
			/* If altering the current thread's state, disable FPU */
			if (thr_act == current_thread()) {
				set_ts();
			}

			ml_set_interrupts_enabled(istate);
		}

		simple_unlock(&pcb->lock);

		if (new_ifps != 0) {
			fp_state_free(new_ifps, thr_xstate);
		}
	}
	return KERN_SUCCESS;
}

/*
 * Get the floating-point state for a thread.
 * If the thread is not the current thread, it is
 * not running (held).  Locking needed against
 * concurrent fpu_set_state or fpu_get_state.
 */
kern_return_t
fpu_get_fxstate(
	thread_t        thr_act,
	thread_state_t  tstate,
	thread_flavor_t f)
{
	struct x86_fx_thread_state      *ifps;
	x86_float_state64_t             *state;
	kern_return_t                   ret = KERN_FAILURE;
	pcb_t                           pcb;
	xstate_t                        thr_xstate = thread_xstate(thr_act);

	if (fpu_capability == UNDEFINED) {
		return KERN_FAILURE;
	}

	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
	    fpu_capability < AVX) {
		return KERN_FAILURE;
	}

	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
	    thr_xstate != AVX512) {
		return KERN_FAILURE;
	}

	state = (x86_float_state64_t *)tstate;

	assert(thr_act != THREAD_NULL);
	pcb = THREAD_TO_PCB(thr_act);

	simple_lock(&pcb->lock, LCK_GRP_NULL);

	ifps = pcb->ifps;
	if (ifps == 0) {
		/*
		 * No valid floating-point state.
		 */

		__nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
		    fp_state_size[FP]);

		simple_unlock(&pcb->lock);

		return KERN_SUCCESS;
	}
	/*
	 * Make sure we`ve got the latest fp state info
	 * If the live fpu state belongs to our target
	 */
	if (thr_act == current_thread()) {
		boolean_t       intr;

		intr = ml_set_interrupts_enabled(FALSE);

		clear_ts();
		fp_save(thr_act);
		clear_fpu();

		(void)ml_set_interrupts_enabled(intr);
	}
	if (ifps->fp_valid) {
		__nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
		switch (thr_xstate) {
		case UNDEFINED_FULL:
		case FP_FULL:
		case AVX_FULL:
		case AVX512_FULL:
			panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
			break;

		case UNDEFINED:
			panic("fpu_get_fxstate() UNDEFINED xstate");
			break;
		case FP:
			break;                  /* already done */
		case AVX: {
			struct x86_avx_thread_state *iavx = (void *) ifps;
			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
			if (f == x86_AVX_STATE32) {
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
			} else if (f == x86_AVX_STATE64) {
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
			}
			break;
		}
		case AVX512: {
			struct x86_avx512_thread_state *iavx = (void *) ifps;
			union {
				thread_state_t       ts;
				x86_avx512_state32_t *s32;
				x86_avx512_state64_t *s64;
			} xs = { .ts = tstate };
			switch (f) {
			case x86_AVX512_STATE32:
				__nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE32:
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
				break;
			case x86_AVX512_STATE64:
				__nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
				__nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE64:
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
				break;
			}
			break;
		}
		}

		ret = KERN_SUCCESS;
	}
	simple_unlock(&pcb->lock);

	return ret;
}



/*
 * the child thread is 'stopped' with the thread
 * mutex held and is currently not known by anyone
 * so no way for fpu state to get manipulated by an
 * outside agency -> no need for pcb lock
 */

void
fpu_dup_fxstate(
	thread_t        parent,
	thread_t        child)
{
	struct x86_fx_thread_state *new_ifps = NULL;
	boolean_t       intr;
	pcb_t           ppcb;
	xstate_t        xstate = thread_xstate(parent);

	ppcb = THREAD_TO_PCB(parent);

	if (ppcb->ifps == NULL) {
		return;
	}

	if (child->machine.ifps) {
		panic("fpu_dup_fxstate: child's ifps non-null");
	}

	new_ifps = fp_state_alloc(xstate);

	simple_lock(&ppcb->lock, LCK_GRP_NULL);

	if (ppcb->ifps != NULL) {
		struct x86_fx_thread_state *ifps = ppcb->ifps;
		/*
		 * Make sure we`ve got the latest fp state info
		 */
		if (current_thread() == parent) {
			intr = ml_set_interrupts_enabled(FALSE);
			assert(current_thread() == parent);
			clear_ts();
			fp_save(parent);
			clear_fpu();

			(void)ml_set_interrupts_enabled(intr);
		}

		if (ifps->fp_valid) {
			child->machine.ifps = new_ifps;
			child->machine.xstate = xstate;
			__nochk_bcopy((char *)(ppcb->ifps),
			    (char *)(child->machine.ifps),
			    fp_state_size[xstate]);

			/* Mark the new fp saved state as non-live. */
			/* Temporarily disabled: radar 4647827
			 * new_ifps->fp_valid = TRUE;
			 */

			/*
			 * Clear any reserved bits in the MXCSR to prevent a GPF
			 * when issuing an FXRSTOR.
			 */
			new_ifps->fx_MXCSR &= mxcsr_capability_mask;
			new_ifps = NULL;
		}
	}
	simple_unlock(&ppcb->lock);

	if (new_ifps != NULL) {
		fp_state_free(new_ifps, xstate);
	}
}

/*
 * Initialize FPU.
 * FNINIT programs the x87 control word to 0x37f, which matches
 * the desired default for macOS.
 */

void
fpinit(void)
{
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	clear_ts();
	fninit();
#if DEBUG
	/* We skip this power-on-default verification sequence on
	 * non-DEBUG, as dirtying the x87 control word may slow down
	 * xsave/xrstor and affect energy use.
	 */
	unsigned short  control, control2;
	fnstcw(&control);
	control2 = control;
	control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
	control |= (FPC_PC_64 |         /* Set precision */
	    FPC_RC_RN |                 /* round-to-nearest */
	    FPC_ZE |                    /* Suppress zero-divide */
	    FPC_OE |                    /*  and overflow */
	    FPC_UE |                    /*  underflow */
	    FPC_IE |                    /* Allow NaNQs and +-INF */
	    FPC_DE |                    /* Allow denorms as operands  */
	    FPC_PE);                    /* No trap for precision loss */
	assert(control == control2);
	fldcw(control);
#endif
	/* Initialize SSE/SSE2: restore MXCSR to its power-on default of 0x1F80
	 * (all exceptions masked, round-to-nearest) */
	__builtin_ia32_ldmxcsr(0x1f80);
	if (fpu_YMM_capable) {
		vzeroall();
	} else {
		xmmzeroall();
	}
	ml_set_interrupts_enabled(istate);
}

/*
 * Coprocessor not available (#NM) fault handler: reached on the first
 * FP/SIMD use after CR0.TS has been set.
 */

uint64_t x86_isr_fp_simd_use;

void
fpnoextflt(void)
{
	boolean_t       intr;
	thread_t        thr_act;
	pcb_t           pcb;
	struct x86_fx_thread_state *ifps = 0;
	xstate_t        xstate = current_xstate();

	thr_act = current_thread();
	pcb = THREAD_TO_PCB(thr_act);

	if (pcb->ifps == 0 && !get_interrupt_level()) {
		ifps = fp_state_alloc(xstate);
		__nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
		    fp_state_size[xstate]);
		if (!thread_is_64bit_addr(thr_act)) {
			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
		} else {
			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
		}
		ifps->fp_valid = TRUE;
	}
	intr = ml_set_interrupts_enabled(FALSE);

	clear_ts();                     /*  Enable FPU use */

	if (__improbable(get_interrupt_level())) {
		/* Track number of #DNA traps at interrupt context,
		 * which is likely suboptimal. Racy, but good enough.
		 */
		x86_isr_fp_simd_use++;
		/*
		 * Save current FP/SIMD context if valid
		 * Initialize live FP/SIMD registers
		 */
		if (pcb->ifps) {
			fp_save(thr_act);
		}
		fpinit();
	} else {
		if (pcb->ifps == 0) {
			pcb->ifps = ifps;
			pcb->xstate = xstate;
			ifps = 0;
		}
		/*
		 * Load this thread`s state into coprocessor live context.
		 */
		fp_load(thr_act);
	}
	(void)ml_set_interrupts_enabled(intr);

	if (ifps) {
		fp_state_free(ifps, xstate);
	}
}

/*
 * FPU overran end of segment.
 * Re-initialize FPU.  Floating point state is not valid.
 */

void
fpextovrflt(void)
{
	thread_t        thr_act = current_thread();
	pcb_t           pcb;
	struct x86_fx_thread_state *ifps;
	boolean_t       intr;
	xstate_t        xstate = current_xstate();

	intr = ml_set_interrupts_enabled(FALSE);

	if (get_interrupt_level()) {
		panic("FPU segment overrun exception at interrupt context");
	}
	if (current_task() == kernel_task) {
		panic("FPU segment overrun exception in kernel thread context");
	}

	/*
	 * This is a non-recoverable error.
	 * Invalidate the thread`s FPU state.
	 */
	pcb = THREAD_TO_PCB(thr_act);
	simple_lock(&pcb->lock, LCK_GRP_NULL);
	ifps = pcb->ifps;
	pcb->ifps = 0;
	simple_unlock(&pcb->lock);

	/*
	 * Re-initialize the FPU.
	 */
	clear_ts();
	fninit();

	/*
	 * And disable access.
	 */
	clear_fpu();

	(void)ml_set_interrupts_enabled(intr);

	if (ifps) {
		fp_state_free(ifps, xstate);
	}
}

/*
 * FPU error. Called by AST.
 */

void
fpexterrflt(void)
{
	thread_t        thr_act = current_thread();
	boolean_t       intr;

	intr = ml_set_interrupts_enabled(FALSE);

	if (get_interrupt_level()) {
		panic("FPU error exception at interrupt context");
	}
	if (current_task() == kernel_task) {
		panic("FPU error exception in kernel thread context");
	}

	/*
	 * Save the FPU state and turn off the FPU.
	 */
	fp_save(thr_act);
	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
	set_ts();

	(void)ml_set_interrupts_enabled(intr);
}

/*
 * Save FPU state.
 *
 * Locking not needed:
 * .	if called from fpu_get_state, pcb already locked.
 * .	if called from fpnoextflt or fp_intr, we are single-cpu
 * .	otherwise, thread is running.
 * N.B.: Must be called with interrupts disabled
 */

void
fp_save(
	thread_t        thr_act)
{
	pcb_t pcb = THREAD_TO_PCB(thr_act);
	struct x86_fx_thread_state *ifps = pcb->ifps;

	assert(ifps != 0);
	if (ifps != 0 && !ifps->fp_valid) {
		assert((get_cr0() & CR0_TS) == 0);
		/* registers are in FPU */
		ifps->fp_valid = TRUE;
		fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
	}
}

/*
 * Restore FPU state from PCB.
 *
 * Locking not needed; always called on the current thread.
 */

void
fp_load(
	thread_t        thr_act)
{
	pcb_t pcb = THREAD_TO_PCB(thr_act);
	struct x86_fx_thread_state *ifps = pcb->ifps;

	assert(ifps);
#if     DEBUG
	if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
		panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u",
		    ifps->fp_valid, ifps->fp_save_layout);
	}
#endif

	if (ifps->fp_valid == FALSE) {
		fpinit();
	} else {
		fpu_load_registers(ifps);
	}
	ifps->fp_valid = FALSE;         /* in FPU */
}

/*
 * SSE arithmetic exception handling code.
 * Basically the same as the x87 exception handler with a different subtype
 */

void
fpSSEexterrflt(void)
{
	thread_t        thr_act = current_thread();
	boolean_t       intr;

	intr = ml_set_interrupts_enabled(FALSE);

	if (get_interrupt_level()) {
		panic("SSE exception at interrupt context");
	}
	if (current_task() == kernel_task) {
		panic("SSE exception in kernel thread context");
	}

	/*
	 * Save the FPU state and turn off the FPU.
	 */
	fp_save(thr_act);
	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
	set_ts();

	(void)ml_set_interrupts_enabled(intr);
}


/*
 * If a thread is using an AVX-sized savearea:
 * - allocate a new AVX512-sized area,
 * - copy the 256-bit state into the 512-bit area,
 * - deallocate the smaller area
 * ASSUMES: thread is the current thread.
 */
static void
fpu_savearea_promote_avx512(thread_t thread)
{
	struct x86_avx_thread_state     *ifps = NULL;
	struct x86_avx512_thread_state  *ifps512 = NULL;
	pcb_t                           pcb = THREAD_TO_PCB(thread);
	boolean_t                       do_avx512_alloc = FALSE;
	boolean_t                       intr;

	assert(thread == current_thread());

	DBG("fpu_savearea_promote_avx512(%p)\n", thread);

	simple_lock(&pcb->lock, LCK_GRP_NULL);

	ifps = pcb->ifps;
	if (ifps == NULL) {
		pcb->xstate = AVX512;
		simple_unlock(&pcb->lock);
		/*
		 * Now that the PCB xstate has been promoted, set XCR0 so
		 * that we don't re-trip #UD on the next AVX-512 instruction.
		 *
		 * Since this branch is taken when the first FP instruction
		 * attempted by this thread is an AVX-512 instruction, we
		 * call fpnoextflt() to allocate an appropriately-sized
		 * AVX-512 save-area, thereby avoiding the overhead of another
		 * fault that would be triggered immediately on return.
		 */
		intr = ml_set_interrupts_enabled(FALSE);
		xsetbv(0, AVX512_XMASK);
		current_cpu_datap()->cpu_xstate = AVX512;
		(void)ml_set_interrupts_enabled(intr);

		fpnoextflt();
		return;
	}

	if (pcb->xstate != AVX512) {
		do_avx512_alloc = TRUE;
	}

	simple_unlock(&pcb->lock);

	if (do_avx512_alloc == TRUE) {
		ifps512 = fp_state_alloc(AVX512);
	}

	simple_lock(&pcb->lock, LCK_GRP_NULL);

	intr = ml_set_interrupts_enabled(FALSE);

	clear_ts();
	fp_save(thread);
	clear_fpu();

	xsetbv(0, AVX512_XMASK);
	current_cpu_datap()->cpu_xstate = AVX512;
	(void)ml_set_interrupts_enabled(intr);

	assert(ifps->fp.fp_valid);

	/* Allocate an AVX512 savearea and copy AVX state into it */
	if (pcb->xstate != AVX512) {
		__nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
		pcb->ifps = ifps512;
		pcb->xstate = AVX512;
		ifps512 = NULL;
	} else {
		ifps = NULL;
	}
	/* The PCB lock is redundant in some scenarios given the higher level
	 * thread mutex, but its pre-emption disablement is relied upon here
	 */
	simple_unlock(&pcb->lock);

	if (ifps) {
		fp_state_free(ifps, AVX);
	}
	if (ifps512) {
		fp_state_free(ifps512, AVX512);
	}
}

/*
 * Upgrade the calling thread to AVX512.
 */
boolean_t
fpu_thread_promote_avx512(thread_t thread)
{
	task_t          task = current_task();

	if (thread != current_thread()) {
		return FALSE;
	}
	if (!ml_fpu_avx512_enabled()) {
		return FALSE;
	}

	fpu_savearea_promote_avx512(thread);

	/* Racy but the task's xstate is only a hint */
	task->xstate = AVX512;

	return TRUE;
}


/*
 * Called from user_trap() when an invalid opcode fault is taken.
 * If the user is attempting an AVX512 instruction on a machine
 * that supports this, we switch the calling thread to use
 * a larger savearea, set its XCR0 bit mask to enable AVX512 and
 * return to user_trap() with a 0 return value.
 * Otherwise, simply return a nonzero value.
 */

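/* x86 instructions, including all prefixes, are architecturally capped at 15 bytes. */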
#define MAX_X86_INSN_LENGTH (15)
int
fpUDflt(user_addr_t rip)
{
	uint8_t         instruction_prefix;
	boolean_t       is_AVX512_instruction = FALSE;
	user_addr_t     original_rip = rip;

	/*
	 * If this thread's xstate is already AVX512, then this #UD is
	 * a true #UD.
	 */
	if (thread_xstate(current_thread()) == AVX512) {
		return 1;
	}

	do {
		/* TODO: as an optimisation, copy up to the lesser of the
		 * next page boundary or maximal prefix length in one pass
		 * rather than issue multiple copyins
		 */
		if (copyin(rip, (char *) &instruction_prefix, 1)) {
			return 1;
		}
		DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
		    rip, instruction_prefix);
		/* TODO: determine more specifically which prefixes
		 * are sane possibilities for AVX512 insns
		 */
		switch (instruction_prefix) {
		case 0x2E:      /* CS segment override */
		case 0x36:      /* SS segment override */
		case 0x3E:      /* DS segment override */
		case 0x26:      /* ES segment override */
		case 0x64:      /* FS segment override */
		case 0x65:      /* GS segment override */
		case 0x66:      /* Operand-size override */
		case 0x67:      /* address-size override */
			/* Skip optional prefixes */
			rip++;
			if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
				return 1;
			}
			break;
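		/*
		 * The decode below is deliberately permissive: 0xC4/0xC5 are
		 * shared with VEX-encoded AVX, so a faulting VEX instruction
		 * also triggers promotion. That is harmless; if the re-driven
		 * instruction still #UDs, the thread's xstate is now AVX512
		 * and the check at the top delivers it as a genuine fault.
		 */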
		case 0x62:      /* EVEX */
		case 0xC5:      /* VEX 2-byte */
		case 0xC4:      /* VEX 3-byte */
			is_AVX512_instruction = TRUE;
			break;
		default:
			return 1;
		}
	} while (!is_AVX512_instruction);

	/* Here if we detect attempted execution of an AVX512 instruction */

	/*
	 * Fail if this machine doesn't support AVX512
	 */
	if (fpu_capability != AVX512) {
		return 1;
	}

	assert(xgetbv(XCR0) == AVX_XMASK);

	DBG("fpUDflt() switching xstate to AVX512\n");
	(void) fpu_thread_promote_avx512(current_thread());

	return 0;
}

void
fp_setvalid(boolean_t value)
{
	thread_t        thr_act = current_thread();
	struct x86_fx_thread_state *ifps = thr_act->machine.ifps;

	if (ifps) {
		ifps->fp_valid = value;

		if (value == TRUE) {
			boolean_t istate = ml_set_interrupts_enabled(FALSE);
			clear_fpu();
			ml_set_interrupts_enabled(istate);
		}
	}
}

boolean_t
ml_fpu_avx_enabled(void)
{
	return fpu_capability >= AVX;
}

boolean_t
ml_fpu_avx512_enabled(void)
{
	return fpu_capability == AVX512;
}

static xstate_t
thread_xstate(thread_t thread)
{
	xstate_t xs = THREAD_TO_PCB(thread)->xstate;
	if (xs != UNDEFINED) {
		return xs;
	} else if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
		return fpu_default;
	} else {
		return get_threadtask(thread)->xstate;
	}
}

xstate_t
current_xstate(void)
{
	return thread_xstate(current_thread());
}

/*
 * Called when exec'ing between bitnesses.
 * If valid FPU state exists, adjust the layout.
 */
void
fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
{
	struct x86_fx_thread_state *ifps = thread->machine.ifps;
	mp_disable_preemption();

	if (ifps && ifps->fp_valid) {
		if (thread_xstate(thread) == FP) {
			ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
		} else {
			ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
		}
	}
	mp_enable_preemption();
}

static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;


	while (sz >= 16) {
		uint32_t rv1, rv2;
		uint64_t *ins64 = (uint64_t *) ins;
		uint64_t *ins642 = (uint64_t *) (ins + 8);
		rv1 = __builtin_popcountll(*ins64);
		rv2 = __builtin_popcountll(*ins642);
		rv += rv1 + rv2;
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		uint32_t *ins32 = (uint32_t *) ins;
		rv += __builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		char *ins8 = (char *)ins;
		rv += __builtin_popcount(*ins8);
		sz--;
		ins++;
	}
	return rv;
}

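/*
 * Returns a cheap popcount-based fingerprint of the thread's XMM register
 * file. It is gated by the fpsimd_fault_popc boot-arg and appears intended
 * as a debugging aid for spotting unexpected changes to FP/SIMD state
 * across faults.
 */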
uint32_t
thread_fpsimd_hash(thread_t ft)
{
	if (fpsimd_fault_popc == 0) {
		return 0;
	}

	uint32_t prv = 0;
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;

	if (pifps) {
		if (pifps->fp_valid) {
			prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
			    sizeof(pifps->fx_XMM_reg));
		} else {
			uintptr_t cr0 = get_cr0();
			clear_ts();
			fp_save(ft);
			prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
			    sizeof(pifps->fx_XMM_reg));
			pifps->fp_valid = FALSE;
			if (cr0 & CR0_TS) {
				set_cr0(cr0);
			}
		}
	}
	ml_set_interrupts_enabled(istate);
	return prv;
}