xref: /xnu-8792.41.9/osfmk/i386/fpu.c (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1 /*
2  * Copyright (c) 2000-2020 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 /*
29  * @OSF_COPYRIGHT@
30  */
31 /*
32  * Mach Operating System
33  * Copyright (c) 1992-1990 Carnegie Mellon University
34  * All Rights Reserved.
35  *
36  * Permission to use, copy, modify and distribute this software and its
37  * documentation is hereby granted, provided that both the copyright
38  * notice and this permission notice appear in all copies of the
39  * software, derivative works or modified versions, and any portions
40  * thereof, and that both notices appear in supporting documentation.
41  *
42  * CARNEGIE MELLON ALLOWS FREE USE OF THIS SOFTWARE IN ITS "AS IS"
43  * CONDITION.  CARNEGIE MELLON DISCLAIMS ANY LIABILITY OF ANY KIND FOR
44  * ANY DAMAGES WHATSOEVER RESULTING FROM THE USE OF THIS SOFTWARE.
45  *
46  * Carnegie Mellon requests users of this software to return to
47  *
48  *  Software Distribution Coordinator  or  [email protected]
49  *  School of Computer Science
50  *  Carnegie Mellon University
51  *  Pittsburgh PA 15213-3890
52  *
53  * any improvements or extensions that they make and grant Carnegie Mellon
54  * the rights to redistribute these changes.
55  */
56 
57 #include <mach/exception_types.h>
58 #include <mach/i386/thread_status.h>
59 #include <mach/i386/fp_reg.h>
60 
61 #include <kern/mach_param.h>
62 #include <kern/processor.h>
63 #include <kern/thread.h>
64 #include <kern/zalloc.h>
65 #include <kern/misc_protos.h>
66 #include <kern/spl.h>
67 #include <kern/assert.h>
68 
69 #include <libkern/OSAtomic.h>
70 
71 #include <architecture/i386/pio.h>
72 #include <i386/cpuid.h>
73 #include <i386/fpu.h>
74 #include <i386/proc_reg.h>
75 #include <i386/misc_protos.h>
76 #include <i386/thread.h>
77 #include <i386/trap.h>
78 
79 xstate_t        fpu_capability = UNDEFINED;     /* extended state capability */
80 xstate_t        fpu_default = UNDEFINED;        /* default extended state */
81 
82 #define ALIGNED(addr, size)      (((uintptr_t)(addr)&((size)-1))==0)
83 #define VERIFY_SAVEAREA_ALIGNED(p, a) \
84 	assertf(!(((uintptr_t)(p)) & ((a) - 1)), \
85 	    "FP save area component @ 0x%lx not 8-byte aligned", ((uintptr_t)(p)))
86 
87 /* Forward */
88 
89 extern void             fpinit(void);
90 extern void             fp_save(
91 	thread_t        thr_act);
92 extern void             fp_load(
93 	thread_t        thr_act);
94 
95 static void configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps);
96 static xstate_t thread_xstate(thread_t);
97 
98 x86_ext_thread_state_t  initial_fp_state __attribute((aligned(64)));
99 x86_ext_thread_state_t  default_avx512_state __attribute((aligned(64)));
100 x86_ext_thread_state_t  default_avx_state __attribute((aligned(64)));
101 x86_ext_thread_state_t  default_fx_state __attribute((aligned(64)));
102 
103 /* Global MXCSR capability bitmask */
104 static unsigned int mxcsr_capability_mask;
105 
106 #define fninit() \
107 	__asm__ volatile("fninit")
108 
109 #define fnstcw(control) \
110 	__asm__("fnstcw %0" : "=m" (*(unsigned short *)(control)))
111 
112 #define fldcw(control) \
113 	__asm__ volatile("fldcw %0" : : "m" (*(unsigned short *) &(control)) )
114 
115 #define fnclex() \
116 	__asm__ volatile("fnclex")
117 
118 #define fnsave(state)  \
119 	__asm__ volatile("fnsave %0" : "=m" (*state))
120 
121 #define frstor(state) \
122 	__asm__ volatile("frstor %0" : : "m" (state))
123 
124 #define fwait() \
125 	__asm__("fwait");
126 
/* Restore x87/MMX/SSE state from a legacy-layout (32-bit) FXSAVE area. */
static inline void
fxrstor(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor %0" ::  "m" (*a));
}
132 
/* Save x87/MMX/SSE state into a legacy-layout (32-bit) FXSAVE area. */
static inline void
fxsave(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave %0" : "=m" (*a));
}
138 
/* Restore x87/MMX/SSE state using the 64-bit (FIP/FDP) FXRSTOR form. */
static inline void
fxrstor64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxrstor64 %0" ::  "m" (*a));
}
144 
/* Save x87/MMX/SSE state using the 64-bit (FIP/FDP) FXSAVE form. */
static inline void
fxsave64(struct x86_fx_thread_state *a)
{
	__asm__ __volatile__ ("fxsave64 %0" : "=m" (*a));
}
150 
#define IS_VALID_XSTATE(x)      ((x) == FP || (x) == AVX || (x) == AVX512)

/* Savearea zones, one per supported xstate; populated in fpu_module_init() */
SECURITY_READ_ONLY_LATE(zone_t) ifps_zone[] = {
	[FP]     = NULL,
	[AVX]    = NULL,
	[AVX512] = NULL
};
/* Kernel savearea size, in bytes, for each xstate */
static const uint32_t fp_state_size[] = {
	[FP]     = sizeof(struct x86_fx_thread_state),
	[AVX]    = sizeof(struct x86_avx_thread_state),
	[AVX512] = sizeof(struct x86_avx512_thread_state)
};

/* Printable names, indexed by xstate_t, for boot-time kprintf reporting */
static const char *const xstate_name[] = {
	[UNDEFINED] = "UNDEFINED",
	[FP] = "FP",
	[AVX] = "AVX",
	[AVX512] = "AVX512"
};

#define fpu_ZMM_capable (fpu_capability == AVX512)
#define fpu_YMM_capable (fpu_capability == AVX || fpu_capability == AVX512)
/*
 * On-demand AVX512 support
 * ------------------------
 * On machines with AVX512 support, by default, threads are created with
 * AVX512 masked off in XCR0 and an AVX-sized savearea is used. However, AVX512
 * capabilities are advertised in the commpage and via sysctl. If a thread
 * opts to use AVX512 instructions, the first will result in a #UD exception.
 * Faulting AVX512 instructions are recognizable by their unique prefix.
 * This exception results in the thread being promoted to use an AVX512-sized
 * savearea and for the AVX512 bit masks being set in its XCR0. The faulting
 * instruction is re-driven and the thread can proceed to perform AVX512
 * operations.
 *
 * In addition to AVX512 instructions causing promotion, the thread_set_state()
 * primitive with an AVX512 state flavor results in promotion.
 *
 * AVX512 promotion of the first thread in a task causes the default xstate
 * of the task to be promoted so that any subsequently created or subsequently
 * DNA-faulted thread will have AVX512 xstate and it will not need to fault-in
 * a promoted xstate.
 *
 * Two savearea zones are used: the default pool of AVX-sized (832 byte) areas
 * and a second pool of larger AVX512-sized (2688 byte) areas.
 *
 * Note the initial state value is an AVX512 object but that the AVX initial
 * value is a subset of it.
 */
/* Ensures cpuid_set_info() is re-run exactly once after OSXSAVE is enabled */
static uint32_t cpuid_reevaluated = 0;

static void fpu_store_registers(void *, boolean_t);
static void fpu_load_registers(void *);

/* XCR0 / RFBM feature bitmask corresponding to each xstate */
static const uint32_t xstate_xmask[] = {
	[FP] =          FP_XMASK,
	[AVX] =         AVX_XMASK,
	[AVX512] =      AVX512_XMASK
};
210 
/*
 * XSAVE with requested-feature bitmap (RFBM) in edx:eax; only the low
 * 32 bits are used here, edx is explicitly zeroed.
 */
static inline void
xsave(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave %0" :"=m" (*a) : "a"(rfbm), "d"(0));
}
216 
/* 64-bit form of xsave() above; RFBM in edx:eax with edx zeroed. */
static inline void
xsave64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xsave64 %0" :"=m" (*a) : "a"(rfbm), "d"(0));
}
222 
/* XRSTOR the components selected by rfbm (low 32 bits; edx zeroed). */
static inline void
xrstor(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor %0" ::  "m" (*a), "a"(rfbm), "d"(0));
}
228 
/* 64-bit form of xrstor() above; RFBM in edx:eax with edx zeroed. */
static inline void
xrstor64(struct x86_fx_thread_state *a, uint32_t rfbm)
{
	__asm__ __volatile__ ("xrstor64 %0" ::  "m" (*a), "a"(rfbm), "d"(0));
}
234 
/* Zero the upper (non-XMM) bits of the YMM registers. Currently unused. */
__unused static inline void
vzeroupper(void)
{
	__asm__ __volatile__ ("vzeroupper" ::);
}
240 
static boolean_t fpu_thread_promote_avx512(thread_t);   /* Forward */


/*
 * The state-copy code below bcopy()s runs of consecutive register fields.
 * Furthermore, make compile-time asserts that no padding creeps into structures
 * for which we're doing this: the array subscript is -1 (an error) when the
 * distance between members m1 and m2 is not exactly (n - 1) elements of mt.
 */
#define ASSERT_PACKED(t, m1, m2, n, mt)                 \
extern char assert_packed_ ## t ## _ ## m1 ## _ ## m2   \
	[(offsetof(t,m2) - offsetof(t,m1) == (n - 1)*sizeof(mt)) ? 1 : -1]

ASSERT_PACKED(x86_avx_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);

ASSERT_PACKED(x86_avx512_state32_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_ymmh0, fpu_ymmh7, 8, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state32_t, fpu_zmmh0, fpu_zmmh7, 8, _STRUCT_YMM_REG);

ASSERT_PACKED(x86_avx512_state64_t, fpu_k0, fpu_k7, 8, _STRUCT_OPMASK_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_ymmh0, fpu_ymmh15, 16, _STRUCT_XMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmmh0, fpu_zmmh15, 16, _STRUCT_YMM_REG);
ASSERT_PACKED(x86_avx512_state64_t, fpu_zmm16, fpu_zmm31, 16, _STRUCT_ZMM_REG);
264 
#if defined(DEBUG_AVX512)

#define DBG(x...)       kprintf("DBG: " x)

/* Raw byte views of each AVX512 register component, for hex dumping */
typedef struct { uint8_t byte[8]; }  opmask_t;
typedef struct { uint8_t byte[16]; } xmm_t;
typedef struct { uint8_t byte[32]; } ymm_t;
typedef struct { uint8_t byte[64]; } zmm_t;
273 
/*
 * Dump an AVX512 savearea to the kernel log: structure offsets, XCR0/XINUSE,
 * all 32 ZMM registers (zmm0-15 assembled from their xmm, ymmh and zmmh
 * components; zmm16-31 from the Hi16_ZMM area), the 8 opmask registers,
 * and the XSAVE header bitvectors.
 */
static void
DBG_AVX512_STATE(struct x86_avx512_thread_state *sp)
{
	int     i, j;
	xmm_t *xmm  = (xmm_t *) &sp->fp.fx_XMM_reg;
	xmm_t *ymmh = (xmm_t *) &sp->x_YMM_Hi128;
	ymm_t *zmmh = (ymm_t *) &sp->x_ZMM_Hi256;
	zmm_t *zmm  = (zmm_t *) &sp->x_Hi16_ZMM;
	opmask_t *k = (opmask_t *) &sp->x_Opmask;

	kprintf("x_YMM_Hi128: %lu\n", offsetof(struct x86_avx512_thread_state, x_YMM_Hi128));
	kprintf("x_Opmask:    %lu\n", offsetof(struct x86_avx512_thread_state, x_Opmask));
	kprintf("x_ZMM_Hi256: %lu\n", offsetof(struct x86_avx512_thread_state, x_ZMM_Hi256));
	kprintf("x_Hi16_ZMM:  %lu\n", offsetof(struct x86_avx512_thread_state, x_Hi16_ZMM));

	kprintf("XCR0:   0x%016llx\n", xgetbv(XCR0));
	kprintf("XINUSE: 0x%016llx\n", xgetbv(1));

	/* Print all ZMM registers */
	for (i = 0; i < 16; i++) {
		kprintf("zmm%d:\t0x", i);
		/* low 128 bits (XMM), then bits 128-255 (YMM high), then 256-511 */
		for (j = 0; j < 16; j++) {
			kprintf("%02x", xmm[i].byte[j]);
		}
		for (j = 0; j < 16; j++) {
			kprintf("%02x", ymmh[i].byte[j]);
		}
		for (j = 0; j < 32; j++) {
			kprintf("%02x", zmmh[i].byte[j]);
		}
		kprintf("\n");
	}
	/* zmm16-31 are stored whole in the Hi16_ZMM component */
	for (i = 0; i < 16; i++) {
		kprintf("zmm%d:\t0x", 16 + i);
		for (j = 0; j < 64; j++) {
			kprintf("%02x", zmm[i].byte[j]);
		}
		kprintf("\n");
	}
	for (i = 0; i < 8; i++) {
		kprintf("k%d:\t0x", i);
		for (j = 0; j < 8; j++) {
			kprintf("%02x", k[i].byte[j]);
		}
		kprintf("\n");
	}

	kprintf("xstate_bv: 0x%016llx\n", sp->_xh.xstate_bv);
	kprintf("xcomp_bv:  0x%016llx\n", sp->_xh.xcomp_bv);
}
324 #else
325 #define DBG(x...)
326 static void
DBG_AVX512_STATE(__unused struct x86_avx512_thread_state * sp)327 DBG_AVX512_STATE(__unused struct x86_avx512_thread_state *sp)
328 {
329 	return;
330 }
331 #endif /* DEBUG_AVX512 */
332 
333 #if     DEBUG
/* Read the x87 FPU status word (DEBUG-only sanity checking in init_fpu). */
static inline unsigned short
fnstsw(void)
{
	unsigned short status;
	__asm__ volatile ("fnstsw %0" : "=ma" (status));
	return status;
}
341 #endif
342 
343 /*
344  * Configure the initial FPU state presented to new threads.
345  * Determine the MXCSR capability mask, which allows us to mask off any
346  * potentially unsafe "reserved" bits before restoring the FPU context.
347  * *Not* per-cpu, assumes symmetry.
348  */
349 
static void
configure_mxcsr_capability_mask(x86_ext_thread_state_t *fps)
{
	/* XSAVE requires a 64 byte aligned store */
	assert(ALIGNED(fps, 64));
	/* Clear, to prepare for the diagnostic FXSAVE */
	bzero(fps, sizeof(*fps));

	/* Reset the live FPU, then snapshot it to read MXCSR_MASK */
	fpinit();
	fpu_store_registers(fps, FALSE);

	mxcsr_capability_mask = fps->fx.fx_MXCSR_MASK;

	/* Set default mask value if necessary */
	if (mxcsr_capability_mask == 0) {
		mxcsr_capability_mask = 0xffbf;
	}

	/* Clear vector register store */
	bzero(&fps->fx.fx_XMM_reg[0][0], sizeof(fps->fx.fx_XMM_reg));
	bzero(fps->avx.x_YMM_Hi128, sizeof(fps->avx.x_YMM_Hi128));
	if (fpu_ZMM_capable) {
		bzero(fps->avx512.x_ZMM_Hi256, sizeof(fps->avx512.x_ZMM_Hi256));
		bzero(fps->avx512.x_Hi16_ZMM, sizeof(fps->avx512.x_Hi16_ZMM));
		bzero(fps->avx512.x_Opmask, sizeof(fps->avx512.x_Opmask));
	}

	/* Load the sanitized image back into the registers ... */
	fps->fx.fp_valid = TRUE;
	fps->fx.fp_save_layout = fpu_YMM_capable ? XSAVE32: FXSAVE32;
	fpu_load_registers(fps);

	/* ... and capture it into the per-capability default state areas,
	 * used to reset registers at context switch (fpu_switch_context). */
	if (fpu_ZMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
	}
	if (fpu_YMM_capable) {
		xsave64((struct x86_fx_thread_state *)&default_avx_state, xstate_xmask[AVX]);
	} else {
		fxsave64((struct x86_fx_thread_state *)&default_fx_state);
	}

	/* Poison values to trap unsafe usage */
	fps->fx.fp_valid = 0xFFFFFFFF;
	fps->fx.fp_save_layout = FP_UNUSED;

	/* Re-enable FPU/SSE DNA exceptions */
	set_ts();
}
397 
#if DEBUG || DEVELOPMENT
/* FP/SIMD fault validation control; may be cleared or overridden by a
 * feature override / boot-arg in fpu_module_init(). */
int fpsimd_fault_popc = 1;
#endif
401 
402 /*
403  * Look for FPU and initialize it.
404  * Called on each CPU.
405  */
void
init_fpu(void)
{
#if     DEBUG
	unsigned short  status;
	unsigned short  control;
#endif
	/*
	 * Check for FPU by initializing it,
	 * then trying to read the correct bit patterns from
	 * the control and status registers.
	 */
	set_cr0((get_cr0() & ~(CR0_EM | CR0_TS)) | CR0_NE);       /* allow use of FPU */
	fninit();
#if     DEBUG
	status = fnstsw();
	fnstcw(&control);

	assert(((status & 0xff) == 0) && ((control & 0x103f) == 0x3f));
#endif
	/* Advertise SSE support */
	if (cpuid_features() & CPUID_FEATURE_FXSR) {
		set_cr4(get_cr4() | CR4_OSFXS);
		/* And allow SIMD exceptions if present */
		if (cpuid_features() & CPUID_FEATURE_SSE) {
			set_cr4(get_cr4() | CR4_OSXMM);
		}
	} else {
		panic("fpu is not FP_FXSR");
	}

	/* Baseline; upgraded below if XSAVE/AVX/AVX512 are present */
	fpu_capability = fpu_default = FP;

	/* static: parsed once, on the master CPU; later CPUs (this routine
	 * runs on each CPU) observe the same setting. */
	static boolean_t is_avx512_enabled = TRUE;
	if (cpu_number() == master_cpu) {
		if (cpuid_leaf7_features() & CPUID_LEAF7_FEATURE_AVX512F) {
			PE_parse_boot_argn("avx512", &is_avx512_enabled, sizeof(boolean_t));
			kprintf("AVX512 supported %s\n",
			    is_avx512_enabled ? "and enabled" : "but disabled");
		}
	}

	/* Configure the XSAVE context mechanism if the processor supports
	 * AVX/YMM registers
	 */
	if (cpuid_features() & CPUID_FEATURE_XSAVE) {
		cpuid_xsave_leaf_t *xs0p = &cpuid_info()->cpuid_xsave_leaf[0];
		if (is_avx512_enabled &&
		    (xs0p->extended_state[eax] & XFEM_ZMM_OPMASK) == XFEM_ZMM_OPMASK) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			assert(xs0p->extended_state[eax] & XFEM_YMM);
			fpu_capability = AVX512;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			/* Temporarily enable the full AVX512 mask so the
			 * container-size check below reflects all features */
			xsetbv(0, AVX512_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX512]);
			/*
			 * AVX set until AVX512 is used.
			 * See comment above about on-demand AVX512 support.
			 */
			xsetbv(0, AVX_XMASK);
			fpu_default = AVX;
		} else if (xs0p->extended_state[eax] & XFEM_YMM) {
			assert(xs0p->extended_state[eax] & XFEM_SSE);
			fpu_capability = AVX;
			fpu_default = AVX;
			/* XSAVE container size for all features */
			set_cr4(get_cr4() | CR4_OSXSAVE);
			xsetbv(0, AVX_XMASK);
			/* Re-evaluate CPUID, once, to reflect OSXSAVE */
			if (OSCompareAndSwap(0, 1, &cpuid_reevaluated)) {
				cpuid_set_info();
			}
			/* Verify that now selected state can be accommodated */
			assert(xs0p->extended_state[ebx] == fp_state_size[AVX]);
		}
	}

	if (cpu_number() == master_cpu) {
		kprintf("fpu_state: %s, state_size: %d\n",
		    xstate_name[fpu_capability],
		    fp_state_size[fpu_capability]);
	}

	fpinit();
	current_cpu_datap()->cpu_xstate = fpu_default;

	/*
	 * Trap wait instructions.  Turn off FPU for now.
	 */
	set_cr0(get_cr0() | CR0_TS | CR0_MP);
}
503 
504 /*
505  * Allocate and initialize FP state for specified xstate.
506  * Don't load state.
507  */
508 static void *
fp_state_alloc(xstate_t xs)509 fp_state_alloc(xstate_t xs)
510 {
511 	assert(ifps_zone[xs] != NULL);
512 	return zalloc_flags(ifps_zone[xs], Z_WAITOK | Z_ZERO);
513 }
514 
515 static inline void
fp_state_free(void * ifps,xstate_t xs)516 fp_state_free(void *ifps, xstate_t xs)
517 {
518 	assert(ifps_zone[xs] != NULL);
519 	zfree(ifps_zone[xs], ifps);
520 }
521 
/*
 * Re-arm the FP/SSE device-not-available trap by setting CR0.TS,
 * so the next FP/SIMD use on this CPU faults in fresh state.
 */
void
clear_fpu(void)
{
	set_ts();
}
527 
528 static boolean_t
fpu_allzeroes(uint64_t * __attribute ((aligned (8)))ptr,uint32_t size)529 fpu_allzeroes(uint64_t * __attribute((aligned(8)))ptr, uint32_t size)
530 {
531 	VERIFY_SAVEAREA_ALIGNED(ptr, sizeof(uint64_t));
532 	assertf((size & (sizeof(uint64_t) - 1)) == 0, "FP save area component not a multiple of 8 bytes");
533 
534 	for (uint32_t count = 0; count < (size / sizeof(uint64_t)); count++) {
535 		if (ptr[count] != 0) {
536 			return FALSE;
537 		}
538 	}
539 	return TRUE;
540 }
541 
/*
 * Load the live FPU/SIMD registers from the given savearea, dispatching on
 * its recorded fp_save_layout (FXRSTOR vs XRSTOR, 32- vs 64-bit forms).
 * Must be called with interrupts disabled and a 64-byte-aligned savearea.
 */
static void
fpu_load_registers(void *fstate)
{
	struct x86_fx_thread_state *ifps = fstate;
	fp_save_layout_t layout = ifps->fp_save_layout;

	/* Outside early boot, the layout's bitness must match the thread's */
	assert(startup_phase < STARTUP_SUB_EARLY_BOOT || \
	    (thread_is_64bit_addr(current_thread()) ?                        \
	    (layout == FXSAVE64 || layout == XSAVE64) :     \
	    (layout == FXSAVE32 || layout == XSAVE32)));
	assert(ALIGNED(ifps, 64));
	assert(ml_get_interrupts_enabled() == FALSE);

#if     DEBUG
	if (layout == XSAVE32 || layout == XSAVE64) {
		struct x86_avx_thread_state *iavx = fstate;
		unsigned i;
		/* Verify reserved bits in the XSAVE header*/
		if (iavx->_xh.xstate_bv & ~xstate_xmask[current_xstate()]) {
			panic("iavx->_xh.xstate_bv: 0x%llx", iavx->_xh.xstate_bv);
		}
		for (i = 0; i < sizeof(iavx->_xh.xhrsvd); i++) {
			if (iavx->_xh.xhrsvd[i]) {
				panic("Reserved bit set");
			}
		}
	}
	if (fpu_YMM_capable) {
		if (layout != XSAVE32 && layout != XSAVE64) {
			panic("Inappropriate layout: %u", layout);
		}
	}
#endif  /* DEBUG */

	switch (layout) {
	case FXSAVE64:
		fxrstor64(ifps);
		break;
	case FXSAVE32:
		fxrstor(ifps);
		break;
	case XSAVE64:
		xrstor64(ifps, xstate_xmask[current_xstate()]);
		break;
	case XSAVE32:
		xrstor(ifps, xstate_xmask[current_xstate()]);
		break;
	default:
		panic("fpu_load_registers() bad layout: %d", layout);
	}
}
593 
/*
 * Save the live FPU/SIMD registers into the given 64-byte-aligned savearea,
 * choosing FXSAVE vs XSAVE by the CPU's current xstate and the 32- vs 64-bit
 * form by 'is64'; records the layout used in ifps->fp_save_layout so
 * fpu_load_registers() can restore with the matching instruction.
 */
static void
fpu_store_registers(void *fstate, boolean_t is64)
{
	struct x86_fx_thread_state *ifps = fstate;
	assert(ALIGNED(ifps, 64));
	xstate_t xs = current_xstate();
	switch (xs) {
	case FP:
		if (is64) {
			fxsave64(fstate);
			ifps->fp_save_layout = FXSAVE64;
		} else {
			fxsave(fstate);
			ifps->fp_save_layout = FXSAVE32;
		}
		break;
	case AVX:
	case AVX512:
		if (is64) {
			xsave64(ifps, xstate_xmask[xs]);
			ifps->fp_save_layout = XSAVE64;
		} else {
			xsave(ifps, xstate_xmask[xs]);
			ifps->fp_save_layout = XSAVE32;
		}
		break;
	default:
		panic("fpu_store_registers() bad xstate: %d", xs);
	}
}
624 
625 /*
626  * Initialize FP handling.
627  */
628 
/*
 * One-time FP module setup: create the savearea zone(s) for the detected
 * capability, derive the MXCSR capability mask, and build the initial and
 * per-capability default register states. Requires init_fpu() to have run
 * (fpu_default / fpu_capability set).
 */
void
fpu_module_init(void)
{
	if (!IS_VALID_XSTATE(fpu_default)) {
		panic("fpu_module_init: invalid extended state %u",
		    fpu_default);
	}

	/* To maintain the required alignment, disable
	 * zone debugging for this zone as that appends
	 * 16 bytes to each element.
	 */
	ifps_zone[fpu_default] = zone_create("x86 fpsave state",
	    fp_state_size[fpu_default], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);

	/*
	 * If AVX512 is supported, create a separate savearea zone.
	 */
	if (fpu_capability == AVX512) {
		ifps_zone[AVX512] = zone_create("x86 avx512 save state",
		    fp_state_size[AVX512], ZC_ALIGNMENT_REQUIRED | ZC_ZFREE_CLEARMEM);
	}

	/* Determine MXCSR reserved bits and configure initial FPU state*/
	configure_mxcsr_capability_mask(&initial_fp_state);

#if DEBUG || DEVELOPMENT
	if (kern_feature_override(KF_DISABLE_FP_POPC_ON_PGFLT)) {
		fpsimd_fault_popc = 0;
	}

	/* Allow the explicit boot-arg to override the validation disables */
	PE_parse_boot_argn("fpsimd_fault_popc", &fpsimd_fault_popc, sizeof(fpsimd_fault_popc));
#endif
}
664 
665 /*
666  * Context switch fpu state.
667  * Always save old thread`s FPU context but don't load new .. allow that to fault-in.
668  * Switch to the new task's xstate.
669  */
670 
void
fpu_switch_context(thread_t old, thread_t new)
{
	struct x86_fx_thread_state      *ifps;
	cpu_data_t *cdp = current_cpu_datap();
	/* new may be NULL (e.g. no successor context) - fall back to default */
	xstate_t new_xstate = new ? thread_xstate(new) : fpu_default;

	assert(ml_get_interrupts_enabled() == FALSE);
	ifps = (old)->machine.ifps;
#if     DEBUG
	if (ifps && ((ifps->fp_valid != FALSE) && (ifps->fp_valid != TRUE))) {
		panic("ifps->fp_valid: %u", ifps->fp_valid);
	}
#endif
	/* fp_valid == FALSE means old's state is live in the registers */
	if (ifps != 0 && (ifps->fp_valid == FALSE)) {
		/* Clear CR0.TS in preparation for the FP context save. In
		 * theory, this shouldn't be necessary since a live FPU should
		 * indicate that TS is clear. However, various routines
		 * (such as sendsig & sigreturn) manipulate TS directly.
		 */
		clear_ts();
		/* registers are in FPU - save to memory */
		boolean_t is64 = (thread_is_64bit_addr(old) &&
		    is_saved_state64(old->machine.iss));

		fpu_store_registers(ifps, is64);
		ifps->fp_valid = TRUE;

		/* Reload the registers with the default state captured in
		 * configure_mxcsr_capability_mask(), sized for this CPU's
		 * current xstate. */
		if (fpu_ZMM_capable && (cdp->cpu_xstate == AVX512)) {
			xrstor64((struct x86_fx_thread_state *)&default_avx512_state, xstate_xmask[AVX512]);
		} else if (fpu_YMM_capable) {
			xrstor64((struct x86_fx_thread_state *) &default_avx_state, xstate_xmask[AVX]);
		} else {
			fxrstor64((struct x86_fx_thread_state *)&default_fx_state);
		}
	}

	assertf(fpu_YMM_capable ? (xgetbv(XCR0) == xstate_xmask[cdp->cpu_xstate]) : TRUE, "XCR0 mismatch: 0x%llx 0x%x 0x%x", xgetbv(XCR0), cdp->cpu_xstate, xstate_xmask[cdp->cpu_xstate]);
	/* Switch XCR0 only when the incoming thread needs a different xstate */
	if (new_xstate != (xstate_t) cdp->cpu_xstate) {
		DBG("fpu_switch_context(%p,%p) new xstate: %s\n",
		    old, new, xstate_name[new_xstate]);
		xsetbv(0, xstate_xmask[new_xstate]);
		cdp->cpu_xstate = new_xstate;
	}
	/* Don't load new state here; let it fault in on first use (DNA) */
	set_ts();
}
717 
718 
719 /*
720  * Free a FPU save area.
721  * Called only when thread terminating - no locking necessary.
722  */
723 void
fpu_free(thread_t thread,void * fps)724 fpu_free(thread_t thread, void *fps)
725 {
726 	pcb_t   pcb = THREAD_TO_PCB(thread);
727 
728 	fp_state_free(fps, pcb->xstate);
729 	pcb->xstate = UNDEFINED;
730 }
731 
732 /*
733  * Set the floating-point state for a thread based on the FXSave formatted data.
734  * This is basically the same as fpu_set_state except it uses the expanded data
735  * structure.
736  * If the thread is not the current thread, it is not running (held).  Locking
737  * needed against concurrent fpu_set_state or fpu_get_state.
738  *
739  * While translating between XNU FP state structures and the CPU-native XSAVE area,
740  * if we detect state components that are all zeroes, we clear the corresponding
741  * xstate_bv bit in the XSAVE area, because that allows the corresponding state to
742  * be initialized to a "clean" state.  That's most important when clearing the YMM
743  * bit, since an initialized "upper clean" state results in a massive performance
744  * improvement due to elimination of false dependencies between the XMMs and the
745  * upper bits of the YMMs.
746  */
747 kern_return_t
fpu_set_fxstate(thread_t thr_act,thread_state_t tstate,thread_flavor_t f)748 fpu_set_fxstate(
749 	thread_t        thr_act,
750 	thread_state_t  tstate,
751 	thread_flavor_t f)
752 {
753 	struct x86_fx_thread_state      *ifps;
754 	struct x86_fx_thread_state      *new_ifps;
755 	x86_float_state64_t             *state;
756 	pcb_t                           pcb;
757 	boolean_t                       old_valid, fresh_state = FALSE;
758 	xstate_t                        thr_xstate;
759 
760 	if (fpu_capability == UNDEFINED) {
761 		return KERN_FAILURE;
762 	}
763 
764 	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
765 	    fpu_capability < AVX) {
766 		return KERN_FAILURE;
767 	}
768 
769 	assert(thr_act != THREAD_NULL);
770 
771 	thr_xstate = thread_xstate(thr_act);
772 
773 	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
774 	    thr_xstate == AVX) {
775 		if (!fpu_thread_promote_avx512(thr_act)) {
776 			return KERN_FAILURE;
777 		} else {
778 			/* Reload thr_xstate after successful promotion */
779 			thr_xstate = thread_xstate(thr_act);
780 		}
781 	}
782 
783 	state = (x86_float_state64_t *)tstate;
784 
785 	pcb = THREAD_TO_PCB(thr_act);
786 
787 	if (state == NULL) {
788 		/*
789 		 * new FPU state is 'invalid'.
790 		 * Deallocate the fp state if it exists.
791 		 */
792 		simple_lock(&pcb->lock, LCK_GRP_NULL);
793 
794 		ifps = pcb->ifps;
795 		pcb->ifps = 0;
796 
797 		simple_unlock(&pcb->lock);
798 
799 		if (ifps != 0) {
800 			fp_state_free(ifps, thr_xstate);
801 		}
802 	} else {
803 		/*
804 		 * Valid incoming state. Allocate the fp state if there is none.
805 		 */
806 		new_ifps = 0;
807 Retry:
808 		simple_lock(&pcb->lock, LCK_GRP_NULL);
809 
810 		ifps = pcb->ifps;
811 		if (ifps == 0) {
812 			if (new_ifps == 0) {
813 				simple_unlock(&pcb->lock);
814 				new_ifps = fp_state_alloc(thr_xstate);
815 				goto Retry;
816 			}
817 			ifps = new_ifps;
818 			new_ifps = 0;
819 			pcb->ifps = ifps;
820 			pcb->xstate = thr_xstate;
821 			fresh_state = TRUE;
822 		}
823 
824 		/*
825 		 * now copy over the new data.
826 		 */
827 
828 		old_valid = ifps->fp_valid;
829 
830 #if     DEBUG || DEVELOPMENT
831 		if ((fresh_state == FALSE) && (old_valid == FALSE) && (thr_act != current_thread())) {
832 			panic("fpu_set_fxstate inconsistency, thread: %p not stopped", thr_act);
833 		}
834 #endif
835 		/*
836 		 * Clear any reserved bits in the MXCSR to prevent a GPF
837 		 * when issuing an FXRSTOR.
838 		 */
839 
840 		state->fpu_mxcsr &= mxcsr_capability_mask;
841 
842 		__nochk_bcopy((char *)&state->fpu_fcw, (char *)ifps, fp_state_size[FP]);
843 
844 		switch (thr_xstate) {
845 		case UNDEFINED_FULL:
846 		case FP_FULL:
847 		case AVX_FULL:
848 		case AVX512_FULL:
849 			panic("fpu_set_fxstate() INVALID xstate: 0x%x", thr_xstate);
850 			break;
851 
852 		case UNDEFINED:
853 			panic("fpu_set_fxstate() UNDEFINED xstate");
854 			break;
855 		case FP:
856 			ifps->fp_save_layout = thread_is_64bit_addr(thr_act) ? FXSAVE64 : FXSAVE32;
857 			break;
858 		case AVX: {
859 			struct x86_avx_thread_state *iavx = (void *) ifps;
860 			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
861 
862 			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
863 
864 			/* Sanitize XSAVE header */
865 			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
866 			iavx->_xh.xstate_bv = AVX_XMASK;
867 			iavx->_xh.xcomp_bv  = 0;
868 
869 			/*
870 			 * See the block comment at the top of the function for a description of why we're clearing
871 			 * xstate_bv bits.
872 			 */
873 			if (f == x86_AVX_STATE32) {
874 				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
875 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
876 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
877 				}
878 			} else if (f == x86_AVX_STATE64) {
879 				__nochk_bcopy(&xs->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
880 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
881 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
882 				}
883 			} else {
884 				iavx->_xh.xstate_bv = (XFEM_SSE | XFEM_X87);
885 			}
886 			break;
887 		}
888 		case AVX512: {
889 			struct x86_avx512_thread_state *iavx = (void *) ifps;
890 			union {
891 				thread_state_t       ts;
892 				x86_avx512_state32_t *s32;
893 				x86_avx512_state64_t *s64;
894 			} xs = { .ts = tstate };
895 
896 			iavx->fp.fp_save_layout = thread_is_64bit_addr(thr_act) ? XSAVE64 : XSAVE32;
897 
898 			/* Sanitize XSAVE header */
899 			bzero(&iavx->_xh.xhrsvd[0], sizeof(iavx->_xh.xhrsvd));
900 			iavx->_xh.xstate_bv = AVX512_XMASK;
901 			iavx->_xh.xcomp_bv  = 0;
902 
903 			/*
904 			 * See the block comment at the top of the function for a description of why we're clearing
905 			 * xstate_bv bits.
906 			 */
907 			switch (f) {
908 			case x86_AVX512_STATE32:
909 				__nochk_bcopy(&xs.s32->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
910 				__nochk_bcopy(&xs.s32->fpu_zmmh0, iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG));
911 
912 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
913 					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
914 				}
915 
916 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 8 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
917 					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
918 				}
919 				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
920 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
921 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
922 				}
923 
924 				DBG_AVX512_STATE(iavx);
925 				break;
926 			case x86_AVX_STATE32:
927 				__nochk_bcopy(&xs.s32->fpu_ymmh0, iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG));
928 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 8 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
929 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
930 				}
931 				break;
932 			case x86_AVX512_STATE64:
933 				__nochk_bcopy(&xs.s64->fpu_k0, iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG));
934 				__nochk_bcopy(&xs.s64->fpu_zmm16, iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG));
935 				__nochk_bcopy(&xs.s64->fpu_zmmh0, iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG));
936 				/*
937 				 * Note that it is valid to have XFEM_ZMM_OPMASK set but XFEM_YMM cleared.  In that case,
938 				 * the upper bits of the YMMs would be cleared and would result in a clean-upper
939 				 * state, allowing SSE instruction to avoid false dependencies.
940 				 */
941 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Opmask, 8 * sizeof(_STRUCT_OPMASK_REG)) == TRUE) {
942 					iavx->_xh.xstate_bv &= ~XFEM_OPMASK;
943 				}
944 
945 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_Hi16_ZMM, 16 * sizeof(_STRUCT_ZMM_REG)) == TRUE &&
946 				    fpu_allzeroes((uint64_t *)(void *)iavx->x_ZMM_Hi256, 16 * sizeof(_STRUCT_YMM_REG)) == TRUE) {
947 					iavx->_xh.xstate_bv &= ~(XFEM_ZMM_HI256 | XFEM_HI16_ZMM);
948 				}
949 
950 				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
951 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
952 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
953 				}
954 				DBG_AVX512_STATE(iavx);
955 				break;
956 			case x86_AVX_STATE64:
957 				__nochk_bcopy(&xs.s64->fpu_ymmh0, iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG));
958 				if (fpu_allzeroes((uint64_t *)(void *)iavx->x_YMM_Hi128, 16 * sizeof(_STRUCT_XMM_REG)) == TRUE) {
959 					iavx->_xh.xstate_bv &= ~XFEM_YMM;
960 				}
961 				break;
962 			}
963 			break;
964 		}
965 		}
966 
967 		ifps->fp_valid = old_valid;
968 
969 		if (old_valid == FALSE) {
970 			boolean_t istate = ml_set_interrupts_enabled(FALSE);
971 			ifps->fp_valid = TRUE;
972 			/* If altering the current thread's state, disable FPU */
973 			if (thr_act == current_thread()) {
974 				set_ts();
975 			}
976 
977 			ml_set_interrupts_enabled(istate);
978 		}
979 
980 		simple_unlock(&pcb->lock);
981 
982 		if (new_ifps != 0) {
983 			fp_state_free(new_ifps, thr_xstate);
984 		}
985 	}
986 	return KERN_SUCCESS;
987 }
988 
989 /*
990  * Get the floating-point state for a thread.
991  * If the thread is not the current thread, it is
992  * not running (held).  Locking needed against
993  * concurrent fpu_set_state or fpu_get_state.
994  */
kern_return_t
fpu_get_fxstate(
	thread_t        thr_act,
	thread_state_t  tstate,
	thread_flavor_t f)
{
	struct x86_fx_thread_state      *ifps;
	x86_float_state64_t             *state;
	kern_return_t                   ret = KERN_FAILURE;
	pcb_t                           pcb;
	xstate_t                        thr_xstate = thread_xstate(thr_act);

	if (fpu_capability == UNDEFINED) {
		return KERN_FAILURE;
	}

	/* AVX flavors require at least AVX-capable hardware. */
	if ((f == x86_AVX_STATE32 || f == x86_AVX_STATE64) &&
	    fpu_capability < AVX) {
		return KERN_FAILURE;
	}

	/* AVX512 flavors require the thread to already use an AVX512 savearea. */
	if ((f == x86_AVX512_STATE32 || f == x86_AVX512_STATE64) &&
	    thr_xstate != AVX512) {
		return KERN_FAILURE;
	}

	state = (x86_float_state64_t *)tstate;

	assert(thr_act != THREAD_NULL);
	pcb = THREAD_TO_PCB(thr_act);

	simple_lock(&pcb->lock, LCK_GRP_NULL);

	ifps = pcb->ifps;
	if (ifps == 0) {
		/*
		 * No valid floating-point state.
		 * Hand back the default initial state instead.
		 */

		__nochk_bcopy((char *)&initial_fp_state, (char *)&state->fpu_fcw,
		    fp_state_size[FP]);

		simple_unlock(&pcb->lock);

		return KERN_SUCCESS;
	}
	/*
	 * Make sure we`ve got the latest fp state info
	 * If the live fpu state belongs to our target
	 */
	if (thr_act == current_thread()) {
		boolean_t       intr;

		intr = ml_set_interrupts_enabled(FALSE);

		/* Flush live registers to the savearea, then disable FPU use. */
		clear_ts();
		fp_save(thr_act);
		clear_fpu();

		(void)ml_set_interrupts_enabled(intr);
	}
	if (ifps->fp_valid) {
		/* Copy the common legacy x87/SSE area first; extended state follows. */
		__nochk_bcopy((char *)ifps, (char *)&state->fpu_fcw, fp_state_size[FP]);
		switch (thr_xstate) {
		case UNDEFINED_FULL:
		case FP_FULL:
		case AVX_FULL:
		case AVX512_FULL:
			panic("fpu_get_fxstate() INVALID xstate: 0x%x", thr_xstate);
			break;

		case UNDEFINED:
			panic("fpu_get_fxstate() UNDEFINED xstate");
			break;
		case FP:
			break;                  /* already done */
		case AVX: {
			struct x86_avx_thread_state *iavx = (void *) ifps;
			x86_avx_state64_t *xs = (x86_avx_state64_t *) state;
			/* Append the upper YMM halves, sized for the requested flavor. */
			if (f == x86_AVX_STATE32) {
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
			} else if (f == x86_AVX_STATE64) {
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
			}
			break;
		}
		case AVX512: {
			struct x86_avx512_thread_state *iavx = (void *) ifps;
			union {
				thread_state_t       ts;
				x86_avx512_state32_t *s32;
				x86_avx512_state64_t *s64;
			} xs = { .ts = tstate };
			/* Copy only the register groups the requested flavor exposes. */
			switch (f) {
			case x86_AVX512_STATE32:
				__nochk_bcopy(iavx->x_Opmask, &xs.s32->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s32->fpu_zmmh0, 8 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE32:
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s32->fpu_ymmh0, 8 * sizeof(_STRUCT_XMM_REG));
				break;
			case x86_AVX512_STATE64:
				__nochk_bcopy(iavx->x_Opmask, &xs.s64->fpu_k0, 8 * sizeof(_STRUCT_OPMASK_REG));
				__nochk_bcopy(iavx->x_Hi16_ZMM, &xs.s64->fpu_zmm16, 16 * sizeof(_STRUCT_ZMM_REG));
				__nochk_bcopy(iavx->x_ZMM_Hi256, &xs.s64->fpu_zmmh0, 16 * sizeof(_STRUCT_YMM_REG));
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
				DBG_AVX512_STATE(iavx);
				break;
			case x86_AVX_STATE64:
				__nochk_bcopy(iavx->x_YMM_Hi128, &xs.s64->fpu_ymmh0, 16 * sizeof(_STRUCT_XMM_REG));
				break;
			}
			break;
		}
		}

		ret = KERN_SUCCESS;
	}
	simple_unlock(&pcb->lock);

	return ret;
}
1119 
1120 
1121 
1122 /*
1123  * the child thread is 'stopped' with the thread
1124  * mutex held and is currently not known by anyone
1125  * so no way for fpu state to get manipulated by an
1126  * outside agency -> no need for pcb lock
1127  */
1128 
void
fpu_dup_fxstate(
	thread_t        parent,
	thread_t        child)
{
	struct x86_fx_thread_state *new_ifps = NULL;
	boolean_t       intr;
	pcb_t           ppcb;
	xstate_t        xstate = thread_xstate(parent);

	ppcb = THREAD_TO_PCB(parent);

	/* Parent has no FP state: nothing to duplicate. */
	if (ppcb->ifps == NULL) {
		return;
	}

	if (child->machine.ifps) {
		panic("fpu_dup_fxstate: child's ifps non-null");
	}

	/* Allocate the child's savearea before taking the parent's PCB lock. */
	new_ifps = fp_state_alloc(xstate);

	simple_lock(&ppcb->lock, LCK_GRP_NULL);

	if (ppcb->ifps != NULL) {
		struct x86_fx_thread_state *ifps = ppcb->ifps;
		/*
		 * Make sure we`ve got the latest fp state info
		 */
		if (current_thread() == parent) {
			intr = ml_set_interrupts_enabled(FALSE);
			assert(current_thread() == parent);
			/* Flush live registers into the parent's savearea. */
			clear_ts();
			fp_save(parent);
			clear_fpu();

			(void)ml_set_interrupts_enabled(intr);
		}

		if (ifps->fp_valid) {
			/* Hand the new savearea to the child, then copy into it. */
			child->machine.ifps = new_ifps;
			child->machine.xstate = xstate;
			__nochk_bcopy((char *)(ppcb->ifps),
			    (char *)(child->machine.ifps),
			    fp_state_size[xstate]);

			/* Mark the new fp saved state as non-live. */
			/* Temporarily disabled: radar 4647827
			 * new_ifps->fp_valid = TRUE;
			 */

			/*
			 * Clear any reserved bits in the MXCSR to prevent a GPF
			 * when issuing an FXRSTOR.
			 */
			new_ifps->fx_MXCSR &= mxcsr_capability_mask;
			/* Ownership transferred to the child; don't free below. */
			new_ifps = NULL;
		}
	}
	simple_unlock(&ppcb->lock);

	/* Free the allocation if the parent's state turned out unusable. */
	if (new_ifps != NULL) {
		fp_state_free(new_ifps, xstate);
	}
}
1194 
1195 /*
1196  * Initialize FPU.
1197  * FNINIT programs the x87 control word to 0x37f, which matches
1198  * the desired default for macOS.
1199  */
1200 
void
fpinit(void)
{
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	clear_ts();             /* allow FPU instructions without trapping #NM */
	fninit();               /* reset x87 to its power-on defaults */
#if DEBUG
	/* We skip this power-on-default verification sequence on
	 * non-DEBUG, as dirtying the x87 control word may slow down
	 * xsave/xrstor and affect energy use.
	 */
	unsigned short  control, control2;
	fnstcw(&control);
	control2 = control;
	control &= ~(FPC_PC | FPC_RC); /* Clear precision & rounding control */
	control |= (FPC_PC_64 |         /* Set precision */
	    FPC_RC_RN |                 /* round-to-nearest */
	    FPC_ZE |                    /* Suppress zero-divide */
	    FPC_OE |                    /*  and overflow */
	    FPC_UE |                    /*  underflow */
	    FPC_IE |                    /* Allow NaNQs and +-INF */
	    FPC_DE |                    /* Allow denorms as operands  */
	    FPC_PE);                    /* No trap for precision loss */
	/* FNINIT is expected to have programmed exactly this word already. */
	assert(control == control2);
	fldcw(control);
#endif
	/* Initialize SSE/SSE2 */
	__builtin_ia32_ldmxcsr(0x1f80); /* MXCSR power-on default: all exceptions masked */
	if (fpu_YMM_capable) {
		vzeroall();
	} else {
		xmmzeroall();
	}
	ml_set_interrupts_enabled(istate);
}
1236 
1237 /*
1238  * Coprocessor not present.
1239  */
1240 
/* Count of #NM (device-not-available) traps taken at interrupt context. */
uint64_t x86_isr_fp_simd_use;

void
fpnoextflt(void)
{
	boolean_t       intr;
	thread_t        thr_act;
	pcb_t           pcb;
	struct x86_fx_thread_state *ifps = 0;
	xstate_t        xstate = current_xstate();

	thr_act = current_thread();
	pcb = THREAD_TO_PCB(thr_act);

	/*
	 * Pre-allocate and pre-initialize a savearea before disabling
	 * interrupts; only done when not at interrupt level, presumably
	 * because fp_state_alloc() is not interrupt-safe — TODO confirm.
	 */
	if (pcb->ifps == 0 && !get_interrupt_level()) {
		ifps = fp_state_alloc(xstate);
		__nochk_bcopy((char *)&initial_fp_state, (char *)ifps,
		    fp_state_size[xstate]);
		if (!thread_is_64bit_addr(thr_act)) {
			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE32 : FXSAVE32;
		} else {
			ifps->fp_save_layout = fpu_YMM_capable ? XSAVE64 : FXSAVE64;
		}
		ifps->fp_valid = TRUE;
	}
	intr = ml_set_interrupts_enabled(FALSE);

	clear_ts();                     /*  Enable FPU use */

	if (__improbable(get_interrupt_level())) {
		/* Track number of #DNA traps at interrupt context,
		 * which is likely suboptimal. Racy, but good enough.
		 */
		x86_isr_fp_simd_use++;
		/*
		 * Save current FP/SIMD context if valid
		 * Initialize live FP/SIMD registers
		 */
		if (pcb->ifps) {
			fp_save(thr_act);
		}
		fpinit();
	} else {
		if (pcb->ifps == 0) {
			/* Hand the pre-allocated savearea to the thread. */
			pcb->ifps = ifps;
			pcb->xstate = xstate;
			ifps = 0;
		}
		/*
		 * Load this thread`s state into coprocessor live context.
		 */
		fp_load(thr_act);
	}
	(void)ml_set_interrupts_enabled(intr);

	/* Free the pre-allocated savearea if it was not consumed above. */
	if (ifps) {
		fp_state_free(ifps, xstate);
	}
}
1300 
1301 /*
1302  * FPU overran end of segment.
1303  * Re-initialize FPU.  Floating point state is not valid.
1304  */
1305 
void
fpextovrflt(void)
{
	thread_t        thr_act = current_thread();
	pcb_t           pcb;
	struct x86_fx_thread_state *ifps;
	boolean_t       intr;
	xstate_t        xstate = current_xstate();

	intr = ml_set_interrupts_enabled(FALSE);

	/* This fault is only survivable for a user thread. */
	if (get_interrupt_level()) {
		panic("FPU segment overrun exception at interrupt context");
	}
	if (current_task() == kernel_task) {
		panic("FPU segment overrun exception in kernel thread context");
	}

	/*
	 * This is a non-recoverable error.
	 * Invalidate the thread`s FPU state.
	 */
	pcb = THREAD_TO_PCB(thr_act);
	simple_lock(&pcb->lock, LCK_GRP_NULL);
	ifps = pcb->ifps;
	pcb->ifps = 0;
	simple_unlock(&pcb->lock);

	/*
	 * Re-initialize the FPU.
	 */
	clear_ts();
	fninit();

	/*
	 * And disable access.
	 */
	clear_fpu();

	(void)ml_set_interrupts_enabled(intr);

	/* Free the detached savearea outside the PCB lock. */
	if (ifps) {
		fp_state_free(ifps, xstate);
	}
}
1351 
1352 /*
1353  * FPU error. Called by AST.
1354  */
1355 
1356 void
fpexterrflt(void)1357 fpexterrflt(void)
1358 {
1359 	thread_t        thr_act = current_thread();
1360 	boolean_t       intr;
1361 
1362 	intr = ml_set_interrupts_enabled(FALSE);
1363 
1364 	if (get_interrupt_level()) {
1365 		panic("FPU error exception at interrupt context");
1366 	}
1367 	if (current_task() == kernel_task) {
1368 		panic("FPU error exception in kernel thread context");
1369 	}
1370 
1371 	/*
1372 	 * Save the FPU state and turn off the FPU.
1373 	 */
1374 	fp_save(thr_act);
1375 	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1376 	set_ts();
1377 
1378 	(void)ml_set_interrupts_enabled(intr);
1379 }
1380 
1381 /*
1382  * Save FPU state.
1383  *
1384  * Locking not needed:
1385  * .	if called from fpu_get_state, pcb already locked.
1386  * .	if called from fpnoextflt or fp_intr, we are single-cpu
1387  * .	otherwise, thread is running.
1388  * N.B.: Must be called with interrupts disabled
1389  */
1390 
1391 void
fp_save(thread_t thr_act)1392 fp_save(
1393 	thread_t        thr_act)
1394 {
1395 	pcb_t pcb = THREAD_TO_PCB(thr_act);
1396 	struct x86_fx_thread_state *ifps = pcb->ifps;
1397 
1398 	assert(ifps != 0);
1399 	if (ifps != 0 && !ifps->fp_valid) {
1400 		assert((get_cr0() & CR0_TS) == 0);
1401 		/* registers are in FPU */
1402 		ifps->fp_valid = TRUE;
1403 		fpu_store_registers(ifps, thread_is_64bit_addr(thr_act));
1404 	}
1405 }
1406 
1407 /*
1408  * Restore FPU state from PCB.
1409  *
1410  * Locking not needed; always called on the current thread.
1411  */
1412 
1413 void
fp_load(thread_t thr_act)1414 fp_load(
1415 	thread_t        thr_act)
1416 {
1417 	pcb_t pcb = THREAD_TO_PCB(thr_act);
1418 	struct x86_fx_thread_state *ifps = pcb->ifps;
1419 
1420 	assert(ifps);
1421 #if     DEBUG
1422 	if (ifps->fp_valid != FALSE && ifps->fp_valid != TRUE) {
1423 		panic("fp_load() invalid fp_valid: %u, fp_save_layout: %u",
1424 		    ifps->fp_valid, ifps->fp_save_layout);
1425 	}
1426 #endif
1427 
1428 	if (ifps->fp_valid == FALSE) {
1429 		fpinit();
1430 	} else {
1431 		fpu_load_registers(ifps);
1432 	}
1433 	ifps->fp_valid = FALSE;         /* in FPU */
1434 }
1435 
1436 /*
1437  * SSE arithmetic exception handling code.
1438  * Basically the same as the x87 exception handler with a different subtype
1439  */
1440 
1441 void
fpSSEexterrflt(void)1442 fpSSEexterrflt(void)
1443 {
1444 	thread_t        thr_act = current_thread();
1445 	boolean_t       intr;
1446 
1447 	intr = ml_set_interrupts_enabled(FALSE);
1448 
1449 	if (get_interrupt_level()) {
1450 		panic("SSE exception at interrupt context");
1451 	}
1452 	if (current_task() == kernel_task) {
1453 		panic("SSE exception in kernel thread context");
1454 	}
1455 
1456 	/*
1457 	 * Save the FPU state and turn off the FPU.
1458 	 */
1459 	fp_save(thr_act);
1460 	/* Set TS to ensure we catch attempts to use the FPU before returning from trap handling */
1461 	set_ts();
1462 
1463 	(void)ml_set_interrupts_enabled(intr);
1464 }
1465 
1466 
1467 /*
1468  * If a thread is using an AVX-sized savearea:
1469  * - allocate a new AVX512-sized  area,
1470  * - copy the 256-bit state into the 512-bit area,
1471  * - deallocate the smaller area
1472  * ASSUMES: thread is the current thread.
1473  */
1474 static void
fpu_savearea_promote_avx512(thread_t thread)1475 fpu_savearea_promote_avx512(thread_t thread)
1476 {
1477 	struct x86_avx_thread_state     *ifps = NULL;
1478 	struct x86_avx512_thread_state  *ifps512 = NULL;
1479 	pcb_t                           pcb = THREAD_TO_PCB(thread);
1480 	boolean_t                       do_avx512_alloc = FALSE;
1481 	boolean_t                       intr;
1482 
1483 	assert(thread == current_thread());
1484 
1485 	DBG("fpu_savearea_promote_avx512(%p)\n", thread);
1486 
1487 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1488 
1489 	ifps = pcb->ifps;
1490 	if (ifps == NULL) {
1491 		pcb->xstate = AVX512;
1492 		simple_unlock(&pcb->lock);
1493 		/*
1494 		 * Now that the PCB xstate has been promoted, set XCR0 so
1495 		 * that we don't re-trip #UD on the next AVX-512 instruction.
1496 		 *
1497 		 * Since this branch is taken when the first FP instruction
1498 		 * attempted by this thread is an AVX-512 instruction, we
1499 		 * call fpnoextflt() to allocate an appropriately-sized
1500 		 * AVX-512 save-area, thereby avoiding the overhead of another
1501 		 * fault that would be triggered immediately on return.
1502 		 */
1503 		intr = ml_set_interrupts_enabled(FALSE);
1504 		xsetbv(0, AVX512_XMASK);
1505 		current_cpu_datap()->cpu_xstate = AVX512;
1506 		(void)ml_set_interrupts_enabled(intr);
1507 
1508 		fpnoextflt();
1509 		return;
1510 	}
1511 
1512 	if (pcb->xstate != AVX512) {
1513 		do_avx512_alloc = TRUE;
1514 	}
1515 
1516 	simple_unlock(&pcb->lock);
1517 
1518 	if (do_avx512_alloc == TRUE) {
1519 		ifps512 = fp_state_alloc(AVX512);
1520 	}
1521 
1522 	simple_lock(&pcb->lock, LCK_GRP_NULL);
1523 
1524 	intr = ml_set_interrupts_enabled(FALSE);
1525 
1526 	clear_ts();
1527 	fp_save(thread);
1528 	clear_fpu();
1529 
1530 	xsetbv(0, AVX512_XMASK);
1531 	current_cpu_datap()->cpu_xstate = AVX512;
1532 	(void)ml_set_interrupts_enabled(intr);
1533 
1534 	assert(ifps->fp.fp_valid);
1535 
1536 	/* Allocate an AVX512 savearea and copy AVX state into it */
1537 	if (pcb->xstate != AVX512) {
1538 		__nochk_bcopy(ifps, ifps512, fp_state_size[AVX]);
1539 		pcb->ifps = ifps512;
1540 		pcb->xstate = AVX512;
1541 		ifps512 = NULL;
1542 	} else {
1543 		ifps = NULL;
1544 	}
1545 	/* The PCB lock is redundant in some scenarios given the higher level
1546 	 * thread mutex, but its pre-emption disablement is relied upon here
1547 	 */
1548 	simple_unlock(&pcb->lock);
1549 
1550 	if (ifps) {
1551 		fp_state_free(ifps, AVX);
1552 	}
1553 	if (ifps512) {
1554 		fp_state_free(ifps, AVX512);
1555 	}
1556 }
1557 
1558 /*
1559  * Upgrade the calling thread to AVX512.
1560  */
1561 boolean_t
fpu_thread_promote_avx512(thread_t thread)1562 fpu_thread_promote_avx512(thread_t thread)
1563 {
1564 	task_t          task = current_task();
1565 
1566 	if (thread != current_thread()) {
1567 		return FALSE;
1568 	}
1569 	if (!ml_fpu_avx512_enabled()) {
1570 		return FALSE;
1571 	}
1572 
1573 	fpu_savearea_promote_avx512(thread);
1574 
1575 	/* Racy but the task's xstate is only a hint */
1576 	task->xstate = AVX512;
1577 
1578 	return TRUE;
1579 }
1580 
1581 
1582 /*
1583  * Called from user_trap() when an invalid opcode fault is taken.
1584  * If the user is attempting an AVX512 instruction on a machine
1585  * that supports this, we switch the calling thread to use
1586  * a larger savearea, set its XCR0 bit mask to enable AVX512 and
1587  * return to user_trap() with a 0 return value.
1588  * Otherwise, simply return a nonzero value.
1589  */
1590 
1591 #define MAX_X86_INSN_LENGTH (15)
1592 int
fpUDflt(user_addr_t rip)1593 fpUDflt(user_addr_t rip)
1594 {
1595 	uint8_t         instruction_prefix;
1596 	boolean_t       is_AVX512_instruction = FALSE;
1597 	user_addr_t     original_rip = rip;
1598 
1599 	/*
1600 	 * If this thread's xstate is already AVX512, then this #UD is
1601 	 * a true #UD.
1602 	 */
1603 	if (thread_xstate(current_thread()) == AVX512) {
1604 		return 1;
1605 	}
1606 
1607 	do {
1608 		/* TODO: as an optimisation, copy up to the lesser of the
1609 		 * next page boundary or maximal prefix length in one pass
1610 		 * rather than issue multiple copyins
1611 		 */
1612 		if (copyin(rip, (char *) &instruction_prefix, 1)) {
1613 			return 1;
1614 		}
1615 		DBG("fpUDflt(0x%016llx) prefix: 0x%x\n",
1616 		    rip, instruction_prefix);
1617 		/* TODO: determine more specifically which prefixes
1618 		 * are sane possibilities for AVX512 insns
1619 		 */
1620 		switch (instruction_prefix) {
1621 		case 0x2E:      /* CS segment override */
1622 		case 0x36:      /* SS segment override */
1623 		case 0x3E:      /* DS segment override */
1624 		case 0x26:      /* ES segment override */
1625 		case 0x64:      /* FS segment override */
1626 		case 0x65:      /* GS segment override */
1627 		case 0x66:      /* Operand-size override */
1628 		case 0x67:      /* address-size override */
1629 			/* Skip optional prefixes */
1630 			rip++;
1631 			if ((rip - original_rip) > MAX_X86_INSN_LENGTH) {
1632 				return 1;
1633 			}
1634 			break;
1635 		case 0x62:      /* EVEX */
1636 		case 0xC5:      /* VEX 2-byte */
1637 		case 0xC4:      /* VEX 3-byte */
1638 			is_AVX512_instruction = TRUE;
1639 			break;
1640 		default:
1641 			return 1;
1642 		}
1643 	} while (!is_AVX512_instruction);
1644 
1645 	/* Here if we detect attempted execution of an AVX512 instruction */
1646 
1647 	/*
1648 	 * Fail if this machine doesn't support AVX512
1649 	 */
1650 	if (fpu_capability != AVX512) {
1651 		return 1;
1652 	}
1653 
1654 	assert(xgetbv(XCR0) == AVX_XMASK);
1655 
1656 	DBG("fpUDflt() switching xstate to AVX512\n");
1657 	(void) fpu_thread_promote_avx512(current_thread());
1658 
1659 	return 0;
1660 }
1661 
1662 void
fp_setvalid(boolean_t value)1663 fp_setvalid(boolean_t value)
1664 {
1665 	thread_t        thr_act = current_thread();
1666 	struct x86_fx_thread_state *ifps = thr_act->machine.ifps;
1667 
1668 	if (ifps) {
1669 		ifps->fp_valid = value;
1670 
1671 		if (value == TRUE) {
1672 			boolean_t istate = ml_set_interrupts_enabled(FALSE);
1673 			clear_fpu();
1674 			ml_set_interrupts_enabled(istate);
1675 		}
1676 	}
1677 }
1678 
1679 boolean_t
ml_fpu_avx_enabled(void)1680 ml_fpu_avx_enabled(void)
1681 {
1682 	return fpu_capability >= AVX;
1683 }
1684 
1685 boolean_t
ml_fpu_avx512_enabled(void)1686 ml_fpu_avx512_enabled(void)
1687 {
1688 	return fpu_capability == AVX512;
1689 }
1690 
1691 static xstate_t
thread_xstate(thread_t thread)1692 thread_xstate(thread_t thread)
1693 {
1694 	xstate_t xs = THREAD_TO_PCB(thread)->xstate;
1695 	if (xs != UNDEFINED) {
1696 		return xs;
1697 	} else if (startup_phase < STARTUP_SUB_EARLY_BOOT) {
1698 		return fpu_default;
1699 	} else {
1700 		return get_threadtask(thread)->xstate;
1701 	}
1702 }
1703 
1704 xstate_t
current_xstate(void)1705 current_xstate(void)
1706 {
1707 	return thread_xstate(current_thread());
1708 }
1709 
1710 /*
1711  * Called when exec'ing between bitnesses.
1712  * If valid FPU state exists, adjust the layout.
1713  */
1714 void
fpu_switch_addrmode(thread_t thread,boolean_t is_64bit)1715 fpu_switch_addrmode(thread_t thread, boolean_t is_64bit)
1716 {
1717 	struct x86_fx_thread_state *ifps = thread->machine.ifps;
1718 	mp_disable_preemption();
1719 
1720 	if (ifps && ifps->fp_valid) {
1721 		if (thread_xstate(thread) == FP) {
1722 			ifps->fp_save_layout = is_64bit ? FXSAVE64 : FXSAVE32;
1723 		} else {
1724 			ifps->fp_save_layout = is_64bit ? XSAVE64 : XSAVE32;
1725 		}
1726 	}
1727 	mp_enable_preemption();
1728 }
1729 
1730 #if DEBUG || DEVELOPMENT
/*
 * Count the set bits in a buffer of 'sz' bytes starting at 'ins'.
 * Processes 16-byte chunks, then 4-byte words, then single bytes.
 * NOTE(review): the 64/32-bit passes dereference through casted pointers;
 * callers pass naturally-aligned savearea fields, so alignment is assumed OK.
 */
static inline uint32_t
fpsimd_pop(uintptr_t ins, int sz)
{
	uint32_t rv = 0;


	while (sz >= 16) {
		uint32_t rv1, rv2;
		uint64_t *ins64 = (uint64_t *) ins;
		uint64_t *ins642 = (uint64_t *) (ins + 8);
		rv1 = __builtin_popcountll(*ins64);
		rv2 = __builtin_popcountll(*ins642);
		rv += rv1 + rv2;
		sz -= 16;
		ins += 16;
	}

	while (sz >= 4) {
		uint32_t *ins32 = (uint32_t *) ins;
		rv += __builtin_popcount(*ins32);
		sz -= 4;
		ins += 4;
	}

	while (sz > 0) {
		/*
		 * Fix: use unsigned char.  A plain char is signed on x86, so
		 * a byte with the high bit set sign-extends when promoted to
		 * int, making __builtin_popcount count the extension bits
		 * (e.g. 0xFF counted as 32 set bits instead of 8).
		 */
		unsigned char *ins8 = (unsigned char *)ins;
		rv += __builtin_popcount(*ins8);
		sz--;
		ins++;
	}
	return rv;
}
1763 
1764 bool
thread_fpsimd_hash_enabled(void)1765 thread_fpsimd_hash_enabled(void)
1766 {
1767 	return fpsimd_fault_popc ? true : false;
1768 }
1769 
/*
 * Compute a popcount-based hash of the thread's saved XMM register file.
 * Returns 0 when no savearea exists or no state can be read safely.
 */
uint32_t __attribute__((noinline))
thread_fpsimd_hash(thread_t ft)
{
	uint32_t prv = 0;
	boolean_t istate = ml_set_interrupts_enabled(FALSE);
	struct x86_fx_thread_state *pifps = THREAD_TO_PCB(ft)->ifps;

	if (pifps) {
		if (pifps->fp_valid) {
			/* State is already saved in memory: hash it directly. */
			prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
			    sizeof(pifps->fx_XMM_reg));
		} else {
			uintptr_t cr0 = get_cr0();
			/*
			 * The unusual case where the fp save area is not valid, yet TS is set,
			 * is used to perform a lazy-init of FP state, so for this specific case,
			 * assume that the popcount of the FP regs is 0.
			 */
			if (!(cr0 & CR0_TS)) {
				/* Registers are live: save, hash, then mark live again. */
				fp_save(ft);
				prv = fpsimd_pop((uintptr_t) &pifps->fx_XMM_reg[0][0],
				    sizeof(pifps->fx_XMM_reg));
				pifps->fp_valid = FALSE;
			}
		}
	}
	ml_set_interrupts_enabled(istate);
	return prv;
}
1799 #endif /* DEBUG || DEVELOPMENT */
1800