xref: /xnu-10002.61.3/bsd/skywalk/core/skywalk_common.h (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #ifndef _SKYWALK_COMMON_H_
30 #define _SKYWALK_COMMON_H_
31 
32 #if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE)
33 /*
34  * Routines common to kernel and userland.  This file is intended to
35  * be included by the Skywalk kernel and libsyscall code.
36  */
37 
38 #include <skywalk/os_skywalk_private.h>
39 
40 #ifndef KERNEL
41 #if defined(LIBSYSCALL_INTERFACE)
42 __BEGIN_DECLS
43 extern int fprintf_stderr(const char *format, ...);
44 __END_DECLS
45 
46 /* CSTYLED */
47 
48 #define SK_ABORT(msg) do {                                              \
49 	(void) fprintf_stderr("%s\n", msg);                             \
50 	__asm__(""); __builtin_trap();                                  \
51 } while (0)
52 
53 #define SK_ABORT_WITH_CAUSE(msg, cause) do {                            \
54 	(void) fprintf_stderr("%s: cause 0x%x\n", msg, cause);          \
55 	__asm__(""); __builtin_trap();                                  \
56 } while (0)
57 
58 #define SK_ABORT_DYNAMIC(msg)   SK_ABORT(msg)
59 
60 
61 #define VERIFY(EX) do {                                                 \
62 	if (__improbable(!(EX))) {                                      \
63 	        SK_ABORT("assertion failed: " #EX);                     \
64 	        /* NOTREACHED */                                        \
65 	        __builtin_unreachable();                                \
66 	}                                                               \
67 } while (0)
68 
69 #if (DEBUG || DEVELOPMENT)
70 #define ASSERT(EX)      VERIFY(EX)
71 #else /* !DEBUG && !DEVELOPMENT */
72 #define ASSERT(EX)      ((void)0)
73 #endif /* !DEBUG && !DEVELOPMENT */
74 #endif /* LIBSYSCALL_INTERFACE */
75 #endif /* !KERNEL */
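
/*
 * Illustrative example (not part of the original header): the userland
 * VERIFY()/ASSERT() macros defined above for libsyscall builds.  VERIFY()
 * traps on failure in every build configuration, while ASSERT() compiles
 * away unless DEBUG or DEVELOPMENT is set.  The function and field names
 * below are hypothetical.
 *
 *	static int
 *	example_ring_attach(struct example_ring *ring)
 *	{
 *		VERIFY(ring != NULL);           // traps even on RELEASE
 *		ASSERT(ring->er_size > 0);      // DEBUG/DEVELOPMENT only
 *		return 0;
 *	}
 */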
76 
77 #ifndef container_of
78 #define container_of(ptr, type, member) \
79 	((type*)(((uintptr_t)ptr) - offsetof(type, member)))
80 #endif
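
/*
 * Illustrative example (not part of the original header): container_of()
 * recovers a pointer to the enclosing structure from a pointer to one of
 * its members.  The structure and field names are hypothetical.
 *
 *	struct example_entry {
 *		uint32_t        ee_id;
 *		uint64_t        ee_data;
 *	};
 *
 *	uint64_t *dp = &ep0->ee_data;   // ep0 is a struct example_entry *
 *	struct example_entry *ep = container_of(dp, struct example_entry, ee_data);
 *	// ep == ep0
 */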
81 
82 /*
83  * Prefetch.
84  */
85 #define SK_PREFETCH(a, n) \
86 	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 0, 3)
87 #define SK_PREFETCHW(a, n) \
88 	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 1, 3)
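
/*
 * Illustrative example (not part of the original header): prefetch the next
 * element for reading while the current one is being processed; the array
 * name is hypothetical.
 *
 *	SK_PREFETCH(&slot[i], sizeof(slot[0]));	// i.e. prefetch &slot[i + 1]
 */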
89 
90 /*
91  * Slower roundup; use when "align" is not a power of 2 (otherwise use P2ROUNDUP).
92  */
93 #define SK_ROUNDUP(x, align)    \
94 	((((x) % (align)) == 0) ? (x) : ((x) + ((align) - ((x) % (align)))))
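
/*
 * Worked example (illustrative): SK_ROUNDUP(10, 6) == 12 and
 * SK_ROUNDUP(12, 6) == 12; P2ROUNDUP() is only valid for power-of-2
 * alignments, e.g. P2ROUNDUP(10, 8) == 16.
 */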
95 
96 /* compile time assert */
97 #ifndef _CASSERT
98 #define _CASSERT(x)     _Static_assert(x, "compile-time assertion failed")
99 #endif /* !_CASSERT */
100 
101 /* power of 2 address alignment */
102 #ifndef IS_P2ALIGNED
103 #define IS_P2ALIGNED(v, a)      \
104 	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
105 #endif /* IS_P2ALIGNED */
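
/*
 * Worked example (illustrative): IS_P2ALIGNED(0x1000, 16) is true and
 * IS_P2ALIGNED(0x1004, 16) is false; "a" must be a power of 2.
 */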
106 
107 #define __sk_aligned(a) __attribute__((__aligned__(a)))
108 #define __sk_packed     __attribute__((__packed__))
109 #define __sk_unused     __attribute__((__unused__))
110 
111 #ifdef KERNEL
112 #include <sys/sdt.h>
113 
114 /*
115  * Copy 8-bytes total, 64-bit aligned, scalar.
116  */
117 __attribute__((always_inline))
118 static inline void
119 __sk_copy64_8(uint64_t *src, uint64_t *dst)
120 {
121 	*dst = *src;            /* [#0*8] */
122 }
123 
124 /*
125  * Copy 8-bytes total, 32-bit aligned, scalar.
126  */
127 __attribute__((always_inline))
128 static inline void
129 __sk_copy32_8(uint32_t *src, uint32_t *dst)
130 {
131 #if defined(__x86_64__)
132 	/* use unaligned scalar move on x86_64 */
133 	__sk_copy64_8((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
134 #else
135 	*dst++ = *src++;                /* dw[0] */
136 	*dst = *src;                    /* dw[1] */
137 #endif
138 }
139 
140 /*
141  * Copy 16-bytes total, 64-bit aligned, scalar.
142  */
143 static inline void
144 __sk_copy64_16(uint64_t *src, uint64_t *dst)
145 {
146 	*dst++ = *src++;        /* [#0*8] */
147 	*dst = *src;            /* [#1*8] */
148 }
149 
150 /*
151  * Copy 16-bytes total, 64-bit aligned, SIMD (if available).
152  */
153 __attribute__((always_inline))
154 static inline void
155 __sk_vcopy64_16(uint64_t *src, uint64_t *dst)
156 {
157 #if defined(__arm64__)
158 	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
159 	/* BEGIN CSTYLED */
160 	__asm__ __volatile__ (
161                 "ldr	q0, [%[src]]		\n\t"
162                 "str	q0, [%[dst]]		\n\t"
163                 :
164                 : [src] "r" (src), [dst] "r" (dst)
165                 : "v0", "memory"
166         );
167 	/* END CSTYLED */
168 #else
169 	__sk_copy64_16(src, dst);
170 #endif
171 }
172 
173 /*
174  * Copy 16-bytes total, 32-bit aligned, scalar.
175  */
176 __attribute__((always_inline))
177 static inline void
178 __sk_copy32_16(uint32_t *src, uint32_t *dst)
179 {
180 	*dst++ = *src++;        /* [#0*4] */
181 	*dst++ = *src++;        /* [#1*4] */
182 	*dst++ = *src++;        /* [#2*4] */
183 	*dst = *src;            /* [#3*4] */
184 }
185 
186 /*
187  * Copy 16-bytes total, 32-bit aligned, SIMD (if available).
188  */
189 __attribute__((always_inline))
190 static inline void
191 __sk_vcopy32_16(uint32_t *src, uint32_t *dst)
192 {
193 #if defined(__arm64__)
194 	/* use SIMD unaligned move on arm64 */
195 	__sk_vcopy64_16((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
196 #else
197 	__sk_copy32_16(src, dst);
198 #endif
199 }
200 
201 /*
202  * Copy 20-bytes total, 64-bit aligned, scalar.
203  */
204 __attribute__((always_inline))
205 static inline void
206 __sk_copy64_20(uint64_t *src, uint64_t *dst)
207 {
208 	*dst++ = *src++;                        /* [#0*8] */
209 	*dst++ = *src++;                        /* [#1*8] */
210 	*(uint32_t *)dst = *(uint32_t *)src;    /* [#2*4] */
211 }
212 
213 /*
214  * Copy 20-bytes total, 64-bit aligned, SIMD (if available).
215  */
216 __attribute__((always_inline))
217 static inline void
218 __sk_vcopy64_20(uint64_t *src, uint64_t *dst)
219 {
220 #if defined(__arm64__)
221 	/*
222 	 * Load/store 16 + 4 bytes;
223 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
224 	 */
225 	/* BEGIN CSTYLED */
226 	__asm__ __volatile__ (
227                 "ldr	q0, [%[src]]		\n\t"
228                 "str	q0, [%[dst]]		\n\t"
229                 "ldr	s0, [%[src], #16]	\n\t"
230                 "str	s0, [%[dst], #16]	\n\t"
231                 :
232                 : [src] "r" (src), [dst] "r" (dst)
233                 : "v0", "memory"
234         );
235 	/* END CSTYLED */
236 #else
237 	__sk_copy64_20(src, dst);
238 #endif
239 }
240 
241 /*
242  * Copy 24-bytes total, 64-bit aligned, scalar.
243  */
244 __attribute__((always_inline))
245 static inline void
246 __sk_copy64_24(uint64_t *src, uint64_t *dst)
247 {
248 	*dst++ = *src++;        /* [#0*8] */
249 	*dst++ = *src++;        /* [#1*8] */
250 	*dst = *src;            /* [#2*8] */
251 }
252 
253 /*
254  * Copy 24-bytes total, 64-bit aligned, SIMD (if available).
255  */
256 __attribute__((always_inline))
257 static inline void
258 __sk_vcopy64_24(uint64_t *src, uint64_t *dst)
259 {
260 #if defined(__arm64__)
261 	/*
262 	 * Use 16-bytes load/store and 8-bytes load/store on arm64;
263 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
264 	 */
265 	/* BEGIN CSTYLED */
266 	__asm__ __volatile__ (
267                 "ldr	q0, [%[src]]		\n\t"
268                 "str	q0, [%[dst]]		\n\t"
269                 "ldr	d0, [%[src], #16]	\n\t"
270                 "str	d0, [%[dst], #16]	\n\t"
271                 :
272                 : [src] "r" (src), [dst] "r" (dst)
273                 : "v0", "memory"
274         );
275 	/* END CSTYLED */
276 #else
277 	__sk_copy64_24(src, dst);
278 #endif
279 }
280 
281 /*
282  * Copy 32-bytes total, 64-bit aligned, scalar.
283  */
284 __attribute__((always_inline))
285 static inline void
286 __sk_copy64_32(uint64_t *src, uint64_t *dst)
287 {
288 	*dst++ = *src++;        /* [#0*8] */
289 	*dst++ = *src++;        /* [#1*8] */
290 	*dst++ = *src++;        /* [#2*8] */
291 	*dst = *src;            /* [#3*8] */
292 }
293 
294 /*
295  * Copy 32-bytes total, 64-bit aligned, SIMD (if available).
296  */
297 __attribute__((always_inline))
298 static inline void
299 __sk_vcopy64_32(uint64_t *src, uint64_t *dst)
300 {
301 #if defined(__arm64__)
302 	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
303 	/* BEGIN CSTYLED */
304 	__asm__ __volatile__ (
305                 "ldp	q0, q1, [%[src]]	\n\t"
306                 "stp	q0, q1, [%[dst]]	\n\t"
307                 :
308                 : [src] "r" (src), [dst] "r" (dst)
309                 : "v0", "v1", "memory"
310         );
311 	/* END CSTYLED */
312 #else
313 	__sk_copy64_32(src, dst);
314 #endif
315 }
316 
317 /*
318  * Copy 32-bytes total, 32-bit aligned, scalar.
319  */
320 __attribute__((always_inline))
321 static inline void
322 __sk_copy32_32(uint32_t *src, uint32_t *dst)
323 {
324 	*dst++ = *src++;        /* [#0*4] */
325 	*dst++ = *src++;        /* [#1*4] */
326 	*dst++ = *src++;        /* [#2*4] */
327 	*dst++ = *src++;        /* [#3*4] */
328 	*dst++ = *src++;        /* [#4*4] */
329 	*dst++ = *src++;        /* [#5*4] */
330 	*dst++ = *src++;        /* [#6*4] */
331 	*dst = *src;            /* [#7*4] */
332 }
333 
334 /*
335  * Copy 32-bytes total, 32-bit aligned, SIMD (if available).
336  */
337 __attribute__((always_inline))
338 static inline void
339 __sk_vcopy32_32(uint32_t *src, uint32_t *dst)
340 {
341 #if defined(__arm64__)
342 	/* use SIMD unaligned move on arm64 */
343 	__sk_vcopy64_32((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
344 #else
345 	__sk_copy32_32(src, dst);
346 #endif
347 }
348 
349 /*
350  * Copy 40-bytes total, 64-bit aligned, scalar.
351  */
352 __attribute__((always_inline))
353 static inline void
354 __sk_copy64_40(uint64_t *src, uint64_t *dst)
355 {
356 	*dst++ = *src++;        /* [#0*8] */
357 	*dst++ = *src++;        /* [#1*8] */
358 	*dst++ = *src++;        /* [#2*8] */
359 	*dst++ = *src++;        /* [#3*8] */
360 	*dst = *src;            /* [#4*8] */
361 }
362 
363 /*
364  * Copy 40-bytes total, 64-bit aligned, SIMD (if available).
365  */
366 __attribute__((always_inline))
367 static inline void
368 __sk_vcopy64_40(uint64_t *src, uint64_t *dst)
369 {
370 #if defined(__arm64__)
371 	/*
372 	 * Use 32-bytes load/store pair and 8-bytes load/store on arm64;
373 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
374 	 */
375 	/* BEGIN CSTYLED */
376 	__asm__ __volatile__ (
377                 "ldp	q0, q1, [%[src]]	\n\t"
378                 "stp	q0, q1, [%[dst]]	\n\t"
379                 "ldr	d0, [%[src], #32]	\n\t"
380                 "str	d0, [%[dst], #32]	\n\t"
381                 :
382                 : [src] "r" (src), [dst] "r" (dst)
383                 : "v0", "v1", "memory"
384         );
385 	/* END CSTYLED */
386 #else
387 	__sk_copy64_40(src, dst);
388 #endif
389 }
390 
391 #if defined(__arm64__)
392 /*
393  * On arm64, the following fixed-length inline assembly routines take
394  * fewer clock cycles than bzero().  We can use vector registers directly,
395  * without saving/restoring them, unlike on x86_64/arm32.
396  */
397 
398 /*
399  * Zero 16-bytes total, SIMD.
400  */
401 __attribute__((always_inline))
402 static inline void
403 __sk_zero_16(void *p)
404 {
405 	/*
406 	 * Use 16-bytes store pair using 64-bit zero register on arm64;
407 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
408 	 */
409 	/* BEGIN CSTYLED */
410 	__asm__ __volatile__ (
411                 "stp	xzr, xzr, [%[p]]	\n\t"
412                 :
413                 : [p] "r" (p)
414                 : "memory"
415         );
416 	/* END CSTYLED */
417 }
418 
419 /*
420  * Zero 32-bytes total, SIMD.
421  */
422 __attribute__((always_inline))
423 static inline void
424 __sk_zero_32(void *p)
425 {
426 	/*
427 	 * Use 32-bytes store pair using zeroed v0 register on arm64;
428 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
429 	 */
430 	/* BEGIN CSTYLED */
431 	__asm__ __volatile__ (
432                 "eor.16b v0, v0, v0		\n\t"
433                 "stp	 q0, q0, [%[p]]		\n\t"
434                 :
435                 : [p] "r" (p)
436                 : "v0", "memory", "cc"
437         );
438 	/* END CSTYLED */
439 }
440 
441 /*
442  * Zero 48-bytes total, SIMD.
443  */
444 __attribute__((always_inline))
445 static inline void
446 __sk_zero_48(void *p)
447 {
448 	/*
449 	 * Use 32-bytes store pair and 16-byte store using zeroed v0
450 	 * register on arm64; no need to save/restore registers on
451 	 * arm64 (SPILL_REGISTERS).
452 	 */
453 	/* BEGIN CSTYLED */
454 	__asm__ __volatile__ (
455                 "eor.16b v0, v0, v0		\n\t"
456                 "stp	 q0, q0, [%[p]]		\n\t"
457                 "str	 q0, [%[p], #32]	\n\t"
458                 :
459                 : [p] "r" (p)
460                 : "v0", "memory", "cc"
461         );
462 	/* END CSTYLED */
463 }
464 
465 /*
466  * Zero 128-bytes total, SIMD.
467  */
468 __attribute__((always_inline))
469 static inline void
470 __sk_zero_128(void *p)
471 {
472 	/*
473 	 * Use 4x 32-bytes store pairs using zeroed v0 register on arm64;
474 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
475 	 *
476 	 * Note that we could optimize this routine by utilizing "dc zva"
477 	 * which zeroes the entire cache line.  However, that requires
478 	 * us to guarantee that the address is cache line aligned which
479 	 * we cannot (at the moment).
480 	 */
481 	/* BEGIN CSTYLED */
482 	__asm__ __volatile__ (
483                 "eor.16b v0, v0, v0		\n\t"
484                 "stp	 q0, q0, [%[p]]		\n\t"
485                 "stp	 q0, q0, [%[p], #32]	\n\t"
486                 "stp	 q0, q0, [%[p], #64]	\n\t"
487                 "stp	 q0, q0, [%[p], #96]	\n\t"
488                 :
489                 : [p] "r" (p)
490                 : "v0", "memory", "cc"
491         );
492 	/* END CSTYLED */
493 }
494 #else /* !__arm64__ */
495 /*
496  * Just use bzero() for simplicity.  On x86_64, "rep stosb" microcoded
497  * implementation already uses wider stores and can go much faster than
498  * one byte per clock cycle.  For arm32, bzero() is also good enough.
499  */
500 #define __sk_zero_16(_p)        bzero(_p, 16)
501 #define __sk_zero_32(_p)        bzero(_p, 32)
502 #define __sk_zero_48(_p)        bzero(_p, 48)
503 #define __sk_zero_128(_p)       bzero(_p, 128)
504 #endif /* !__arm64__ */
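
/*
 * Illustrative usage sketch (not part of the original header): the zeroing
 * routines above are fixed-length, so the length baked into the name must
 * match the object exactly.  The structure name is hypothetical.
 *
 *	struct example_md {
 *		uint64_t        emd_words[16];  // 128 bytes
 *	} __sk_aligned(16) md;
 *
 *	__sk_zero_128(&md);                     // equivalent to bzero(&md, 128)
 */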
505 
506 /*
507  * The following optimized routines are faster than memcpy(), but they rely
508  * on the caller rounding up the source and destination buffers to multiples
509  * of 4, 8 or 64 bytes, and keeping them 64-bit aligned.
510  *
511  * Note: they do not support overlapping ranges.
512  */
513 
514 /*
515  * Threshold as to when we use memcpy() rather than unrolled copy.
516  */
517 #if defined(__x86_64__)
518 #define SK_COPY_THRES 2048
519 #elif defined(__arm64__)
520 #define SK_COPY_THRES 1024
521 #else /* !__x86_64__ && !__arm64__ */
522 #define SK_COPY_THRES 1024
523 #endif /* !__x86_64__ && !__arm64__ */
524 
525 #if (DEVELOPMENT || DEBUG)
526 extern size_t sk_copy_thres;
527 #endif /* (DEVELOPMENT || DEBUG) */
528 
529 /*
530  * Scalar version, 4-bytes multiple.
531  */
532 __attribute__((always_inline))
533 static inline void
534 sk_copy64_4x(uint32_t *src, uint32_t *dst, size_t l)
535 {
536 #if (DEVELOPMENT || DEBUG)
537 	if (__probable(l <= sk_copy_thres)) {
538 #else
539 	if (__probable(l <= SK_COPY_THRES)) {
540 #endif /* (!DEVELOPMENT && !DEBUG) */
541 		while ((ssize_t)(l -= 4) >= 0) {
542 			*dst++ = *src++;        /* [#n*4] */
543 		}
544 	} else {
545 		(void) memcpy((void *)dst, (void *)src, l);
546 	}
547 }
548 
549 /*
550  * Scalar version, 8-bytes multiple.
551  */
552 __attribute__((always_inline))
553 static inline void
554 sk_copy64_8x(uint64_t *src, uint64_t *dst, size_t l)
555 {
556 #if (DEVELOPMENT || DEBUG)
557 	if (__probable(l <= sk_copy_thres)) {
558 #else
559 	if (__probable(l <= SK_COPY_THRES)) {
560 #endif /* (!DEVELOPMENT && !DEBUG) */
561 		while ((ssize_t)(l -= 8) >= 0) {
562 			*dst++ = *src++;        /* [#n*8] */
563 		}
564 	} else {
565 		(void) memcpy((void *)dst, (void *)src, l);
566 	}
567 }
568 
569 /*
570  * Scalar version (usually faster than SIMD), 32-bytes multiple.
571  */
572 __attribute__((always_inline))
573 static inline void
574 sk_copy64_32x(uint64_t *src, uint64_t *dst, size_t l)
575 {
576 #if (DEVELOPMENT || DEBUG)
577 	if (__probable(l <= sk_copy_thres)) {
578 #else
579 	if (__probable(l <= SK_COPY_THRES)) {
580 #endif /* (!DEVELOPMENT && !DEBUG) */
581 		while ((ssize_t)(l -= 32) >= 0) {
582 			*dst++ = *src++;        /* [#0*8] */
583 			*dst++ = *src++;        /* [#1*8] */
584 			*dst++ = *src++;        /* [#2*8] */
585 			*dst++ = *src++;        /* [#3*8] */
586 		}
587 	} else {
588 		(void) memcpy((void *)dst, (void *)src, l);
589 	}
590 }
591 
592 /*
593  * Scalar version (usually faster than SIMD), 64-bytes multiple.
594  */
595 __attribute__((always_inline))
596 static inline void
597 sk_copy64_64x(uint64_t *src, uint64_t *dst, size_t l)
598 {
599 #if (DEVELOPMENT || DEBUG)
600 	if (__probable(l <= sk_copy_thres)) {
601 #else
602 	if (__probable(l <= SK_COPY_THRES)) {
603 #endif /* (!DEVELOPMENT && !DEBUG) */
604 		while ((ssize_t)(l -= 64) >= 0) {
605 			*dst++ = *src++;        /* [#0*8] */
606 			*dst++ = *src++;        /* [#1*8] */
607 			*dst++ = *src++;        /* [#2*8] */
608 			*dst++ = *src++;        /* [#3*8] */
609 			*dst++ = *src++;        /* [#4*8] */
610 			*dst++ = *src++;        /* [#5*8] */
611 			*dst++ = *src++;        /* [#6*8] */
612 			*dst++ = *src++;        /* [#7*8] */
613 		}
614 	} else {
615 		(void) memcpy((void *)dst, (void *)src, l);
616 	}
617 }
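
/*
 * Illustrative usage sketch (not part of the original header): the variable
 * length copiers above expect "l" to be a multiple of the unroll width and
 * the buffers to be suitably aligned and non-overlapping.  The buffer names
 * are hypothetical.
 *
 *	uint64_t src_buf[32], dst_buf[32];      // 256 bytes, 64-bit aligned
 *	sk_copy64_64x(src_buf, dst_buf, sizeof(src_buf));
 */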
618 
619 /*
620  * Use scalar or SIMD based on platform/size.
621  */
622 #if defined(__x86_64__)
623 #define sk_copy64_8     __sk_copy64_8           /* scalar only */
624 #define sk_copy32_8     __sk_copy32_8           /* scalar only */
625 #define sk_copy64_16    __sk_copy64_16          /* scalar */
626 #define sk_copy32_16    __sk_copy32_16          /* scalar */
627 #define sk_copy64_20    __sk_copy64_20          /* scalar */
628 #define sk_copy64_24    __sk_copy64_24          /* scalar */
629 #define sk_copy64_32    __sk_copy64_32          /* scalar */
630 #define sk_copy32_32    __sk_copy32_32          /* scalar */
631 #define sk_copy64_40    __sk_copy64_40          /* scalar */
632 #define sk_zero_16      __sk_zero_16            /* scalar */
633 #define sk_zero_32      __sk_zero_32            /* scalar */
634 #define sk_zero_48      __sk_zero_48            /* scalar */
635 #define sk_zero_128     __sk_zero_128           /* scalar */
636 #elif defined(__arm64__)
637 #define sk_copy64_8     __sk_copy64_8           /* scalar only */
638 #define sk_copy32_8     __sk_copy32_8           /* scalar only */
639 #define sk_copy64_16    __sk_vcopy64_16         /* SIMD */
640 #define sk_copy32_16    __sk_vcopy32_16         /* SIMD */
641 #define sk_copy64_20    __sk_vcopy64_20         /* SIMD */
642 #define sk_copy64_24    __sk_vcopy64_24         /* SIMD */
643 #define sk_copy64_32    __sk_vcopy64_32         /* SIMD */
644 #define sk_copy32_32    __sk_vcopy32_32         /* SIMD */
645 #define sk_copy64_40    __sk_vcopy64_40         /* SIMD */
646 #define sk_zero_16      __sk_zero_16            /* SIMD */
647 #define sk_zero_32      __sk_zero_32            /* SIMD */
648 #define sk_zero_48      __sk_zero_48            /* SIMD */
649 #define sk_zero_128     __sk_zero_128           /* SIMD */
650 #else
651 #define sk_copy64_8     __sk_copy64_8           /* scalar only */
652 #define sk_copy32_8     __sk_copy32_8           /* scalar only */
653 #define sk_copy64_16    __sk_copy64_16          /* scalar */
654 #define sk_copy32_16    __sk_copy32_16          /* scalar */
655 #define sk_copy64_20    __sk_copy64_20          /* scalar */
656 #define sk_copy64_24    __sk_copy64_24          /* scalar */
657 #define sk_copy64_32    __sk_copy64_32          /* scalar */
658 #define sk_copy32_32    __sk_copy32_32          /* scalar */
659 #define sk_copy64_40    __sk_copy64_40          /* scalar */
660 #define sk_zero_16      __sk_zero_16            /* scalar */
661 #define sk_zero_32      __sk_zero_32            /* scalar */
662 #define sk_zero_48      __sk_zero_48            /* scalar */
663 #define sk_zero_128     __sk_zero_128           /* scalar */
664 #endif
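
/*
 * Illustrative example (not part of the original header): the fixed-size
 * wrappers above are intended for small objects of known size, e.g. copying
 * a 16-byte header; the variable names are hypothetical.
 *
 *	uint64_t hdr_src[2], hdr_dst[2];        // 16 bytes each, 64-bit aligned
 *	sk_copy64_16(hdr_src, hdr_dst);         // SIMD on arm64, scalar elsewhere
 */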
665 
666 /*
667  * Do not use these directly.
668  * Use the skn_ variants if you need custom probe names.
669  */
670 #define _sk_alloc_type(probename, type, flags, name)                    \
671 ({                                                                      \
672 	void *ret;                                                      \
673                                                                         \
674 	/* XXX Modify this to use KT_PRIV_ACCT later  */                \
675 	ret = kalloc_type_tag(type, Z_ZERO | (flags), (name)->tag);     \
676 	DTRACE_SKYWALK3(probename, char *, #type, int, (flags),         \
677 	    void *, ret);                                               \
678 	ret;                                                            \
679 })
680 
681 #define _sk_alloc_type_array(probename, type, count, flags, name)       \
682 ({                                                                      \
683 	void *ret;                                                      \
684                                                                         \
685 	ret = kalloc_type_tag(type, (count), Z_ZERO | (flags),          \
686 	    (name)->tag);                                               \
687 	DTRACE_SKYWALK4(probename, char *, #type, size_t, (count),      \
688 	    int, (flags), void *, ret);                                 \
689 	ret;                                                            \
690 })
691 
692 #define _sk_alloc_type_hash(probename, heap, size, flags, name)         \
693 ({                                                                      \
694 	void *ret;                                                      \
695                                                                         \
696 	ret = kalloc_type_var_impl((heap), (size),                      \
697 	    __zone_flags_mix_tag((flags) | Z_ZERO, (name)->tag), NULL); \
698 	DTRACE_SKYWALK4(probename, char *, (heap)->kt_name + 5,         \
699 	    size_t, (size), int, (flags), void *, ret);                 \
700 	ret;                                                            \
701 })
702 
703 #define _sk_realloc_type_array(probename, type, oldcount, newcount, elem, flags, name) \
704 ({                                                                      \
705 	void *ret;                                                      \
706                                                                         \
707 	ret = krealloc_type_tag(type, (oldcount), (newcount), (elem),   \
708 	    Z_ZERO | (flags), (name)->tag);                             \
709 	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldcount),  \
710 	    size_t, (newcount), int, (flags), void *, ret);             \
711 	ret;                                                            \
712 })
713 
714 #define _sk_alloc_type_header_array(probename, htype, type, count, flags, name) \
715 ({                                                                      \
716 	void *ret;                                                      \
717                                                                         \
718 	ret = kalloc_type_tag(htype, type, (count), Z_ZERO | (flags),   \
719 	    (name)->tag);                                               \
720 	DTRACE_SKYWALK5(probename, char *, #htype, char *, #type,       \
721 	    size_t, (count), int, (flags), void *, ret);                \
722 	ret;                                                            \
723 })
724 
725 #define _sk_free_type(probename, type, elem)                            \
726 {                                                                       \
727 	DTRACE_SKYWALK2(probename, char *, #type, void *, (elem));      \
728 	kfree_type(type, (elem));                                       \
729 }
730 
731 #define _sk_free_type_array(probename, type, count, elem)               \
732 {                                                                       \
733 	DTRACE_SKYWALK3(probename, char *, #type, size_t, (count),      \
734 	    void *, (elem));                                            \
735 	kfree_type(type, (count), (elem));                              \
736 }
737 
738 #define _sk_free_type_hash(probename, heap, size, elem)                 \
739 {                                                                       \
740 	DTRACE_SKYWALK3(probename, char *, (heap)->kt_name + 5,         \
741 	    size_t, (size), void *, (elem));                            \
742 	kfree_type_var_impl((heap), (elem), (size));                    \
743 }
744 
745 #define _sk_free_type_header_array(probename, htype, type, count, elem) \
746 {                                                                       \
747 	DTRACE_SKYWALK4(probename, char *, #htype, char *, #type,       \
748 	    size_t, (count), void *, (elem));                           \
749 	kfree_type(htype, type, (count), (elem));                       \
750 }
751 
752 #define _sk_alloc_data(probename, size, flags, name)                    \
753 ({                                                                      \
754 	void *ret;                                                      \
755                                                                         \
756 	ret = kalloc_data_tag((size), Z_ZERO | (flags), (name)->tag);   \
757 	DTRACE_SKYWALK3(probename, size_t, (size), int, (flags),        \
758 	    void *, ret);                                               \
759 	ret;                                                            \
760 })
761 
762 #define _sk_realloc_data(probename, elem, oldsize, newsize, flags, name) \
763 ({                                                                      \
764 	void *ret;                                                      \
765                                                                         \
766 	ret = krealloc_data_tag((elem), (oldsize), (newsize),           \
767 	    Z_ZERO | (flags), (name)->tag);                             \
768 	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldsize),   \
769 	    size_t, (newsize), int, (flags), void *, ret);              \
770 	ret;                                                            \
771 })
772 
773 #define _sk_free_data(probename, elem, size)                            \
774 {                                                                       \
775 	DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size));     \
776 	kfree_data((elem), (size));                                     \
777 }
778 
779 #define sk_alloc_type(type, flags, tag)                                 \
780 	_sk_alloc_type(sk_alloc_type, type, flags, tag)
781 
782 #define sk_alloc_type_array(type, count, flags, tag)                    \
783 	_sk_alloc_type_array(sk_alloc_type_array, type, count, flags, tag)
784 
785 #define sk_alloc_type_hash(heap, size, flags, tag)                      \
786 	_sk_alloc_type_hash(sk_alloc_type_hash, heap, size, flags, tag)
787 
788 #define sk_alloc_type_header_array(htype, type, count, flags, tag)      \
789 	_sk_alloc_type_header_array(sk_alloc_type_header_array, htype,  \
790 	type, count, flags, tag)
791 
792 #define sk_realloc_type_array(type, oldsize, newsize, elem, flags, tag) \
793 	_sk_realloc_type_array(sk_realloc_type_array, type,             \
794 	oldsize, newsize, elem, flags, tag)
795 
796 #define sk_free_type(type, elem)                                        \
797 	_sk_free_type(sk_free_type, type, elem)
798 
799 #define sk_free_type_array(type, count, elem)                           \
800 	_sk_free_type_array(sk_free_type_array, type, count, elem)
801 
802 #define sk_free_type_hash(heap, size, elem)                             \
803 	_sk_free_type_hash(sk_free_type_hash, heap, size, elem)
804 
805 #define sk_free_type_header_array(htype, type, count, elem)             \
806 	_sk_free_type_header_array(sk_free_type_header_array, htype,    \
807 	type, count, elem)
808 
809 #define sk_alloc_data(size, flags, tag)                                 \
810 	_sk_alloc_data(sk_alloc_data, size, flags, tag)
811 
812 #define sk_realloc_data(elem, oldsize, newsize, flags, tag)             \
813 	_sk_realloc_data(sk_realloc_data, elem, oldsize, newsize,       \
814 	flags, tag)
815 
816 #define sk_free_data(elem, size)                                        \
817 	_sk_free_data(sk_free_data, elem, size)
818 
819 /*
820  * The skn_ variants are meant to be used if you need to use two or more
821  * of the same call within the same function and you want the dtrace
822  * probename to be different at each callsite.
823  */
824 #define skn_realloc(name, elem, oldsize, newsize, flags, tag)           \
825 	_sk_realloc(sk_realloc_ ## name, elem, oldsize, newsize, flags, \
826 	tag)
827 
828 #define skn_alloc_type(name, type, flags, tag)                          \
829 	_sk_alloc_type(sk_alloc_type_ ## name, type, flags, tag)
830 
831 #define skn_alloc_type_array(name, type, count, flags, tag)             \
832 	_sk_alloc_type_array(sk_alloc_type_array_ ## name, type, count, \
833 	flags, tag)
834 
835 #define skn_alloc_type_hash(name, heap, size, flags, tag)               \
836 	_sk_alloc_type_hash(sk_alloc_type_hash_ ## name, heap, size,    \
837 	flags, tag)
838 
839 #define skn_alloc_type_header_array(name, htype, type, count, flags, tag) \
840 	_sk_alloc_type_header_array(sk_alloc_type_header_array_ ## name, \
841 	htype, type, count, flags, tag)
842 
843 #define skn_free_type(name, type, elem)                                 \
844 	_sk_free_type(sk_free_type_ ## name, type, elem)
845 
846 #define skn_free_type_array(name, type, count, elem)                    \
847 	_sk_free_type_array(sk_free_type_array_ ## name, type, count,   \
848 	elem)
849 
850 #define skn_free_type_hash(name, heap, size, elem)                      \
851 	_sk_free_type_hash(sk_free_type_hash_ ## name, heap, size, elem)
852 
853 #define skn_free_type_header_array(name, htype, type, count, elem)      \
854 	_sk_free_type_header_array(sk_free_type_header_array_ ## name,  \
855 	htype, type, count, elem)
856 
857 #define skn_alloc_data(name, size, flags, tag)                          \
858 	_sk_alloc_data(sk_alloc_data_ ## name, size, flags, tag)
859 
860 #define skn_realloc_data(name, elem, oldsize, newsize, flags, tag)      \
861 	_sk_realloc_data(sk_realloc_data_ ## name, elem, oldsize, newsize,\
862 	flags, tag)
863 
864 #define skn_free_data(name, elem, size)                                 \
865 	_sk_free_data(sk_free_data_ ## name, elem, size)
866 
867 struct sk_tag_spec {
868 	kern_allocation_name_t *skt_var;
869 	const char             *skt_name;
870 };
871 
872 extern void __sk_tag_make(const struct sk_tag_spec *spec);
873 
874 #define SKMEM_TAG_DEFINE(var, name) \
875 	SECURITY_READ_ONLY_LATE(kern_allocation_name_t) var;            \
876 	__startup_data struct sk_tag_spec __sktag_##var = {             \
877 	    .skt_var = &var, .skt_name = name,                          \
878 	};                                                              \
879 	STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, __sk_tag_make, &__sktag_##var)
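
/*
 * Illustrative usage sketch (not part of the original header): a tag is
 * defined once with SKMEM_TAG_DEFINE() and then passed to the sk_alloc/
 * sk_free wrappers above.  The tag, structure and count below are
 * hypothetical.
 *
 *	SKMEM_TAG_DEFINE(example_tag, "com.apple.skywalk.example");
 *
 *	struct example_obj *array;
 *	array = sk_alloc_type_array(struct example_obj, 8, Z_WAITOK, example_tag);
 *	if (array != NULL) {
 *		...
 *		sk_free_type_array(struct example_obj, 8, array);
 *	}
 */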
880 
881 /*!
882  *  @abstract Compare two byte buffers, src1 and src2, each n bytes long,
883  *  applying the byte mask to the input data before comparison.  (Scalar version)
884  *
885  *  @discussion
886  *  Returns zero if the two buffers are identical after applying the byte
887  *  masks, otherwise non-zero.
888  *  Zero-length buffers are always identical.
889  *
890  *  @param src1 first input buffer of n bytes long
891  *  @param src2 second input buffer of n bytes long
892  *  @param byte_mask byte mask of n bytes long, applied before comparison
893  *  @param n number of bytes
894  */
895 static inline int
896 __sk_memcmp_mask_scalar(const uint8_t *src1, const uint8_t *src2,
897     const uint8_t *byte_mask, size_t n)
898 {
899 	uint32_t result = 0;
900 	for (size_t i = 0; i < n; i++) {
901 		result |= (src1[i] ^ src2[i]) & byte_mask[i];
902 	}
903 	return result;
904 }
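
/*
 * Worked example (illustrative): with src1 = { 0x12, 0x34 },
 * src2 = { 0x12, 0x3f } and byte_mask = { 0xff, 0xf0 }, only the high
 * nibble of the second byte is compared, so the buffers are considered
 * identical and 0 is returned; with byte_mask = { 0xff, 0xff } the result
 * is non-zero.
 */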
905 
906 static inline int
907 __sk_memcmp_mask_16B_scalar(const uint8_t *src1, const uint8_t *src2,
908     const uint8_t *byte_mask)
909 {
910 	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 16);
911 }
912 
913 static inline int
914 __sk_memcmp_mask_32B_scalar(const uint8_t *src1, const uint8_t *src2,
915     const uint8_t *byte_mask)
916 {
917 	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 32);
918 }
919 
920 static inline int
921 __sk_memcmp_mask_48B_scalar(const uint8_t *src1, const uint8_t *src2,
922     const uint8_t *byte_mask)
923 {
924 	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 48);
925 }
926 
927 static inline int
928 __sk_memcmp_mask_64B_scalar(const uint8_t *src1, const uint8_t *src2,
929     const uint8_t *byte_mask)
930 {
931 	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 64);
932 }
933 
934 static inline int
935 __sk_memcmp_mask_80B_scalar(const uint8_t *src1, const uint8_t *src2,
936     const uint8_t *byte_mask)
937 {
938 	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 80);
939 }
940 
941 #if defined(__arm64__) || defined(__arm__) || defined(__x86_64__)
942 extern int os_memcmp_mask_16B(const uint8_t *src1, const uint8_t *src2,
943     const uint8_t *byte_mask);
944 extern int os_memcmp_mask_32B(const uint8_t *src1, const uint8_t *src2,
945     const uint8_t *byte_mask);
946 extern int os_memcmp_mask_48B(const uint8_t *src1, const uint8_t *src2,
947     const uint8_t *byte_mask);
948 extern int os_memcmp_mask_64B(const uint8_t *src1, const uint8_t *src2,
949     const uint8_t *byte_mask);
950 extern int os_memcmp_mask_80B(const uint8_t *src1, const uint8_t *src2,
951     const uint8_t *byte_mask);
952 
953 /*
954  * Use SIMD variants on arm64, arm and x86_64.
955  */
956 #define sk_memcmp_mask                  __sk_memcmp_mask
957 #define sk_memcmp_mask_16B              os_memcmp_mask_16B
958 #define sk_memcmp_mask_32B              os_memcmp_mask_32B
959 #define sk_memcmp_mask_48B              os_memcmp_mask_48B
960 #define sk_memcmp_mask_64B              os_memcmp_mask_64B
961 #define sk_memcmp_mask_80B              os_memcmp_mask_80B
962 
963 /*!
964  *  @abstract Compare two byte buffers, src1 and src2, each n bytes long,
965  *  applying the byte mask to the input data before comparison.  (SIMD version)
966  *
967  *  @discussion
968  *  Returns zero if the two buffers are identical after applying the byte
969  *  masks, otherwise non-zero.
970  *  Zero-length buffers are always identical.
971  *
972  *  @param src1 first input buffer of n bytes long
973  *  @param src2 second input buffer of n bytes long
974  *  @param byte_mask byte mask of n bytes long, applied before comparison
975  *  @param n number of bytes
976  */
977 static inline int
978 __sk_memcmp_mask(const uint8_t *src1, const uint8_t *src2,
979     const uint8_t *byte_mask, size_t n)
980 {
981 	uint32_t result = 0;
982 	size_t i = 0;
983 	for (; i + 64 <= n; i += 64) {
984 		result |= sk_memcmp_mask_64B(src1 + i, src2 + i,
985 		    byte_mask + i);
986 	}
987 	for (; i + 32 <= n; i += 32) {
988 		result |= sk_memcmp_mask_32B(src1 + i, src2 + i,
989 		    byte_mask + i);
990 	}
991 	for (; i + 16 <= n; i += 16) {
992 		result |= sk_memcmp_mask_16B(src1 + i, src2 + i,
993 		    byte_mask + i);
994 	}
995 	if (i < n) {
996 		if (n >= 16) {
997 			/* Compare the last 16 bytes with vector code. */
998 			result |= sk_memcmp_mask_16B(src1 + n - 16,
999 			    src2 + n - 16, byte_mask + n - 16);
1000 		} else {
1001 			/* Use scalar code if n < 16. */
1002 			for (; i < n; i++) {
1003 				result |= (src1[i] ^ src2[i]) & byte_mask[i];
1004 			}
1005 		}
1006 	}
1007 	return result;
1008 }
1009 #else /* !(__arm64__ || __arm__ || __x86_64__) */
1010 /*
1011  * Use scalar variants elsewhere.
1012  */
1013 #define sk_memcmp_mask                  __sk_memcmp_mask_scalar
1014 #define sk_memcmp_mask_16B              __sk_memcmp_mask_16B_scalar
1015 #define sk_memcmp_mask_32B              __sk_memcmp_mask_32B_scalar
1016 #define sk_memcmp_mask_48B              __sk_memcmp_mask_48B_scalar
1017 #define sk_memcmp_mask_64B              __sk_memcmp_mask_64B_scalar
1018 #define sk_memcmp_mask_80B              __sk_memcmp_mask_80B_scalar
1019 #endif /* !(__arm64__ || __arm__ || __x86_64__) */
1020 
1021 /*
1022  * Scalar variants are available on all platforms if needed.
1023  */
1024 #define sk_memcmp_mask_scalar           __sk_memcmp_mask_scalar
1025 #define sk_memcmp_mask_16B_scalar       __sk_memcmp_mask_16B_scalar
1026 #define sk_memcmp_mask_32B_scalar       __sk_memcmp_mask_32B_scalar
1027 #define sk_memcmp_mask_48B_scalar       __sk_memcmp_mask_48B_scalar
1028 #define sk_memcmp_mask_64B_scalar       __sk_memcmp_mask_64B_scalar
1029 #define sk_memcmp_mask_80B_scalar       __sk_memcmp_mask_80B_scalar
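
/*
 * Illustrative usage sketch (not part of the original header): comparing two
 * lookup keys while ignoring the bytes cleared in the mask; the names and
 * sizes are hypothetical.
 *
 *	uint8_t key_a[32], key_b[32], key_mask[32];
 *	...
 *	if (sk_memcmp_mask(key_a, key_b, key_mask, sizeof(key_a)) == 0) {
 *		// keys match on every byte selected by key_mask
 *	}
 */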
1030 
1031 #endif /* KERNEL */
1032 #endif /* PRIVATE || BSD_KERNEL_PRIVATE */
1033 #endif /* !_SKYWALK_COMMON_H_ */
1034