xref: /xnu-11417.140.69/bsd/skywalk/core/skywalk_common.h (revision 43a90889846e00bfb5cf1d255cdc0a701a1e05a4)
1 /*
2  * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #ifndef _SKYWALK_COMMON_H_
30 #define _SKYWALK_COMMON_H_
31 
32 #if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE)
33 /*
34  * Routines common to kernel and userland.  This file is intended to
35  * be included by the Skywalk kernel and libsyscall code.
36  */
37 
38 #include <skywalk/os_skywalk_private.h>
39 
40 #ifndef KERNEL
41 #if defined(LIBSYSCALL_INTERFACE)
42 __BEGIN_DECLS
43 extern int fprintf_stderr(const char *format, ...);
44 __END_DECLS
45 
46 /* CSTYLED */
47 
/*
 * Userland abort: hand `msg` plus a newline to fprintf_stderr(), then
 * execute an empty asm statement followed by __builtin_trap().
 */
#define SK_ABORT(msg)                                                   \
	do {                                                            \
	        (void) fprintf_stderr("%s\n", msg);                     \
	        __asm__("");                                            \
	        __builtin_trap();                                       \
	} while (0)
52 
/*
 * Same as SK_ABORT(), but the message is followed by `cause`, which is
 * formatted with "%x".
 */
#define SK_ABORT_WITH_CAUSE(msg, cause)                                 \
	do {                                                            \
	        (void) fprintf_stderr("%s: cause 0x%x\n", msg, cause);  \
	        __asm__("");                                            \
	        __builtin_trap();                                       \
	} while (0)

/* In this (userland) configuration a dynamic message is a plain SK_ABORT(). */
#define SK_ABORT_DYNAMIC(msg)   SK_ABORT(msg)
59 
60 
/*
 * Always-on assertion: when EX evaluates to false, abort with the
 * stringified expression appended to "assertion failed: ".
 */
#define VERIFY(EX)                                                      \
	do {                                                            \
	        if (__improbable(!(EX))) {                              \
	                SK_ABORT("assertion failed: " #EX);             \
	                /* NOTREACHED */                                \
	                __builtin_unreachable();                        \
	        }                                                       \
	} while (0)
68 
/*
 * ASSERT() is VERIFY() on DEBUG/DEVELOPMENT builds; otherwise it expands
 * to a no-op and EX is not evaluated.
 */
#if !(DEBUG || DEVELOPMENT)
#define ASSERT(EX)      ((void)0)
#else /* DEBUG || DEVELOPMENT */
#define ASSERT(EX)      VERIFY(EX)
#endif /* DEBUG || DEVELOPMENT */
#endif /* LIBSYSCALL_INTERFACE */
75 #endif /* !KERNEL */
76 
#ifndef container_of
/*
 * Given `ptr`, the address of the `member` field of some `type` object,
 * yield a pointer to that enclosing object by subtracting the member's
 * byte offset from the pointer value.
 *
 * `ptr` is parenthesized before the cast so that an expression argument
 * (e.g. `base + 1`) is cast as a whole rather than only its first operand.
 */
#define container_of(ptr, type, member) \
	((type *)(((uintptr_t)(ptr)) - offsetof(type, member)))
#endif
81 
/*
 * Prefetch.
 *
 * Both macros issue __builtin_prefetch() for the address `a` plus `n`
 * bytes with locality 3; SK_PREFETCH passes rw = 0 (read) and
 * SK_PREFETCHW passes rw = 1 (write).
 */
#define __sk_prefetch(a, n, rw) \
	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), rw, 3)
#define SK_PREFETCH(a, n)       __sk_prefetch(a, n, 0)
#define SK_PREFETCHW(a, n)      __sk_prefetch(a, n, 1)
89 
/*
 * Round "x" up to the next multiple of "align".  This is the slower,
 * modulo-based form for when "align" is not a power of 2 (otherwise use
 * P2ROUNDUP).  Both arguments are evaluated more than once.
 */
#define SK_ROUNDUP(x, align)    \
	((((x) % (align)) != 0) ? ((x) + ((align) - ((x) % (align)))) : (x))
95 
/* compile time assert; only defined here if nobody provided one already */
#if !defined(_CASSERT)
#define _CASSERT(x)     _Static_assert(x, "compile-time assertion failed")
#endif /* !_CASSERT */
100 
/*
 * power of 2 address alignment: non-zero when none of the bits of `v`
 * below the power-of-2 `a` are set
 */
#if !defined(IS_P2ALIGNED)
#define IS_P2ALIGNED(v, a)      \
	(!(((uintptr_t)(v)) & (((uintptr_t)(a)) - 1)))
#endif /* IS_P2ALIGNED */
106 
/* shorthands for commonly used compiler attributes */
#define __sk_aligned(a) __attribute__((__aligned__(a)))        /* alignment of `a` bytes */
#define __sk_packed     __attribute__((__packed__))            /* no padding */
#define __sk_unused     __attribute__((__unused__))            /* may be unused */
110 
111 #ifdef KERNEL
112 #include <sys/sdt.h>
113 
/*
 * Scalar copy of a single 64-bit word (8 bytes) from src to dst; both
 * pointers are expected to be 64-bit aligned.
 */
static inline __attribute__((always_inline)) void
__sk_copy64_8(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];        /* [#0*8] */
}
123 
124 /*
125  * Copy 8-bytes total, 32-bit aligned, scalar.
126  */
127 __attribute__((always_inline))
128 static inline void
129 __sk_copy32_8(uint32_t *__counted_by(2)src, uint32_t *__counted_by(2)dst)
130 {
131 #if defined(__x86_64__)
132 	/* use unaligned scalar move on x86_64 */
133 	__sk_copy64_8((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
134 #else
135 	dst[0] = src[0]; /* dw[0] */
136 	dst[1] = src[1]; /* dw[1] */
137 #endif
138 }
139 
140 /*
141  * Copy 16-bytes total, 64-bit aligned, scalar.
142  */
143 static inline void
144 __sk_copy64_16(uint64_t *__counted_by(2) src, uint64_t *__counted_by(2) dst)
145 {
146 	dst[0] = src[0]; /* [#0*8] */
147 	dst[1] = src[1]; /* [#1*8] */
148 }
149 
150 /*
151  * Copy 16-bytes total, 32-bit aligned, scalar.
152  */
153 __attribute__((always_inline))
154 static inline void
155 __sk_copy32_16(uint32_t *__counted_by(4) src, uint32_t *__counted_by(4) dst)
156 {
157 	dst[0] = src[0]; /* [#0*4] */
158 	dst[1] = src[1]; /* [#1*4] */
159 	dst[2] = src[2]; /* [#2*4] */
160 	dst[3] = src[3]; /* [#3*4] */
161 }
162 
163 /*
164  * Copy 20-bytes total, 64-bit aligned, scalar.
165  */
166 __attribute__((always_inline))
167 static inline void
168 __sk_copy64_20(uint64_t *__sized_by(20) src, uint64_t *__sized_by(20) dst)
169 {
170 	dst[0] = src[0]; /* [#0*8] */
171 	dst[1] = src[1]; /* [#1*8] */
172 	*(uint32_t *)(dst + 2) = *(uint32_t *)(src + 2); /* [#2*4] */
173 }
174 
175 /*
176  * Copy 24-bytes total, 64-bit aligned, scalar.
177  */
178 __attribute__((always_inline))
179 static inline void
180 __sk_copy64_24(uint64_t *__counted_by(3) src, uint64_t *__counted_by(3) dst)
181 {
182 	dst[0] = src[0]; /* [#0*8] */
183 	dst[1] = src[1]; /* [#1*8] */
184 	dst[2] = src[2]; /* [#2*8] */
185 }
186 
187 /*
188  * Copy 32-bytes total, 64-bit aligned, scalar.
189  */
190 __attribute__((always_inline))
191 static inline void
192 __sk_copy64_32(uint64_t *__counted_by(4) src, uint64_t *__counted_by(4) dst)
193 {
194 	dst[0] = src[0]; /* [#0*8] */
195 	dst[1] = src[1]; /* [#1*8] */
196 	dst[2] = src[2]; /* [#2*8] */
197 	dst[3] = src[3]; /* [#3*8] */
198 }
199 
200 /*
201  * Copy 32-bytes total, 32-bit aligned, scalar.
202  */
203 __attribute__((always_inline))
204 static inline void
205 __sk_copy32_32(uint32_t *__counted_by(8) src, uint32_t *__counted_by(8) dst)
206 {
207 	dst[0] = src[0]; /* [#0*4] */
208 	dst[1] = src[1]; /* [#1*4] */
209 	dst[2] = src[2]; /* [#2*4] */
210 	dst[3] = src[3]; /* [#3*4] */
211 	dst[4] = src[4]; /* [#4*4] */
212 	dst[5] = src[5]; /* [#5*4] */
213 	dst[6] = src[6]; /* [#6*4] */
214 	dst[7] = src[7]; /* [#7*4] */
215 }
216 
217 /*
218  * Copy 40-bytes total, 64-bit aligned, scalar.
219  */
220 __attribute__((always_inline))
221 static inline void
222 __sk_copy64_40(uint64_t *__sized_by(40) src, uint64_t *__sized_by(40) dst)
223 {
224 	dst[0] = src[0]; /* [#0*8] */
225 	dst[1] = src[1]; /* [#1*8] */
226 	dst[2] = src[2]; /* [#2*8] */
227 	dst[3] = src[3]; /* [#3*8] */
228 	dst[4] = src[4]; /* [#4*8] */
229 }
230 
231 #if defined(__arm64__)
232 /*
233  * Copy 16-bytes total, 64-bit aligned, SIMD (if available).
234  */
235 __attribute__((always_inline))
236 static inline void
237 __sk_vcopy64_16(uint64_t *__counted_by(2) src, uint64_t *__counted_by(2) dst)
238 {
239 	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
240 	/* BEGIN CSTYLED */
241 	__asm__ __volatile__ (
242                 "ldr	q0, [%[src]]		\n\t"
243                 "str	q0, [%[dst]]		\n\t"
244                 :
245                 : [src] "r" ((uint64_t *__unsafe_indexable)src), [dst] "r" ((uint64_t *__unsafe_indexable)dst)
246                 : "v0", "memory"
247         );
248 	/* END CSTYLED */
249 }
250 
251 /*
252  * Copy 16-bytes total, 32-bit aligned, SIMD (if available).
253  */
254 __attribute__((always_inline))
255 static inline void
256 __sk_vcopy32_16(uint32_t *__counted_by(4) src, uint32_t *__counted_by(4) dst)
257 {
258 	/* use SIMD unaligned move on arm64 */
259 	__sk_vcopy64_16((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
260 }
261 
262 /*
263  * Copy 20-bytes total, 64-bit aligned, SIMD (if available).
264  */
265 __attribute__((always_inline))
266 static inline void
267 __sk_vcopy64_20(uint64_t *__sized_by(20) src, uint64_t *__sized_by(20) dst)
268 {
269 	/*
270 	 * Load/store 16 + 4 bytes;
271 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
272 	 */
273 	/* BEGIN CSTYLED */
274 	__asm__ __volatile__ (
275                 "ldr	q0, [%[src]]		\n\t"
276                 "str	q0, [%[dst]]		\n\t"
277                 "ldr	s0, [%[src], #16]	\n\t"
278                 "str	s0, [%[dst], #16]	\n\t"
279                 :
280                 : [src] "r" ((uint64_t *__unsafe_indexable)src), [dst] "r" ((uint64_t *__unsafe_indexable)dst)
281                 : "v0", "memory"
282         );
283 	/* END CSTYLED */
284 }
285 
286 /*
287  * Copy 24-bytes total, 64-bit aligned, SIMD (if available).
288  */
289 __attribute__((always_inline))
290 static inline void
291 __sk_vcopy64_24(uint64_t *__counted_by(3) src, uint64_t *__counted_by(3) dst)
292 {
293 	/*
294 	 * Use 16-bytes load/store and 8-bytes load/store on arm64;
295 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
296 	 */
297 	/* BEGIN CSTYLED */
298 	__asm__ __volatile__ (
299                 "ldr	q0, [%[src]]		\n\t"
300                 "str	q0, [%[dst]]		\n\t"
301                 "ldr	d0, [%[src], #16]	\n\t"
302                 "str	d0, [%[dst], #16]	\n\t"
303                 :
304                 : [src] "r" ((uint64_t *__unsafe_indexable)src), [dst] "r" ((uint64_t *__unsafe_indexable)dst)
305                 : "v0", "memory"
306         );
307 	/* END CSTYLED */
308 }
309 
310 /*
311  * Copy 32-bytes total, 64-bit aligned, SIMD (if available).
312  */
313 __attribute__((always_inline))
314 static inline void
315 __sk_vcopy64_32(uint64_t *__counted_by(4) src, uint64_t *__counted_by(4) dst)
316 {
317 	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
318 	/* BEGIN CSTYLED */
319 	__asm__ __volatile__ (
320                 "ldp	q0, q1, [%[src]]	\n\t"
321                 "stp	q0, q1, [%[dst]]	\n\t"
322                 :
323                 : [src] "r" ((uint64_t *__unsafe_indexable)src), [dst] "r" ((uint64_t *__unsafe_indexable)dst)
324                 : "v0", "v1", "memory"
325         );
326 	/* END CSTYLED */
327 }
328 
329 /*
330  * Copy 32-bytes total, 32-bit aligned, SIMD (if available).
331  */
332 __attribute__((always_inline))
333 static inline void
334 __sk_vcopy32_32(uint32_t *__counted_by(8) src, uint32_t *__counted_by(8) dst)
335 {
336 	/* use SIMD unaligned move on arm64 */
337 	__sk_vcopy64_32((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
338 }
339 
340 /*
341  * Copy 40-bytes total, 64-bit aligned, SIMD (if available).
342  */
343 __attribute__((always_inline))
344 static inline void
345 __sk_vcopy64_40(uint64_t *__sized_by(40) src, uint64_t *__sized_by(40) dst)
346 {
347 	/*
348 	 * Use 32-bytes load/store pair and 8-bytes load/store on arm64;
349 	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
350 	 */
351 	/* BEGIN CSTYLED */
352 	__asm__ __volatile__ (
353                 "ldp	q0, q1, [%[src]]	\n\t"
354                 "stp	q0, q1, [%[dst]]	\n\t"
355                 "ldr	d0, [%[src], #32]	\n\t"
356                 "str	d0, [%[dst], #32]	\n\t"
357                 :
358                 : [src] "r" ((uint64_t *__unsafe_indexable)src), [dst] "r" ((uint64_t *__unsafe_indexable)dst)
359                 : "v0", "v1", "memory"
360         );
361 	/* END CSTYLED */
362 }
363 
364 /*
365  * On arm64, the following inline assembly fixed-length routines have
366  * fewer clock cycles than bzero().  We can directly use vector registers
367  * without saving/restoring them unlike on x86_64/arm32.
368  */
369 
/*
 * Zero 16 bytes at p with a single store pair of the 64-bit zero
 * register (xzr); no vector register is involved in this variant.
 */
static inline __attribute__((always_inline)) void
__sk_zero_16(void *p)
{
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
                "stp	xzr, xzr, [%[p]]	\n\t"
                :
                : [p] "r" (p)
                : "memory"
        );
	/* END CSTYLED */
}
390 
/*
 * Zero 32 bytes at p: v0 is cleared by XOR-ing it with itself and then
 * stored twice with one 32-byte store pair.
 */
static inline __attribute__((always_inline)) void
__sk_zero_32(void *p)
{
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
                "eor.16b v0, v0, v0		\n\t"
                "stp	 q0, q0, [%[p]]		\n\t"
                :
                : [p] "r" (p)
                : "v0", "memory", "cc"
        );
	/* END CSTYLED */
}
412 
/*
 * Zero 48 bytes at p: v0 is cleared by XOR-ing it with itself, then a
 * 32-byte store pair covers offsets 0..31 and a 16-byte store covers
 * offsets 32..47.
 */
static inline __attribute__((always_inline)) void
__sk_zero_48(void *p)
{
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
                "eor.16b v0, v0, v0		\n\t"
                "stp	 q0, q0, [%[p]]		\n\t"
                "str	 q0, [%[p], #32]	\n\t"
                :
                : [p] "r" (p)
                : "v0", "memory", "cc"
        );
	/* END CSTYLED */
}
436 
/*
 * Zero 128 bytes at p: v0 is cleared by XOR-ing it with itself, then
 * four 32-byte store pairs cover offsets 0, 32, 64 and 96.
 */
static inline __attribute__((always_inline)) void
__sk_zero_128(void *p)
{
	/*
	 * No need to save/restore registers on arm64 (SPILL_REGISTERS).
	 *
	 * "dc zva", which zeroes an entire cache line, would be an
	 * optimization here, but it requires a guarantee that the address
	 * is cache line aligned, which we cannot give (at the moment).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
                "eor.16b v0, v0, v0		\n\t"
                "stp	 q0, q0, [%[p]]		\n\t"
                "stp	 q0, q0, [%[p], #32]	\n\t"
                "stp	 q0, q0, [%[p], #64]	\n\t"
                "stp	 q0, q0, [%[p], #96]	\n\t"
                :
                : [p] "r" (p)
                : "v0", "memory", "cc"
        );
	/* END CSTYLED */
}
466 #else /* !__arm64__ */
/*
 * Everything that is not arm64 simply calls bzero() with the fixed
 * length.  On x86_64, the "rep stosb" microcoded implementation already
 * uses wider stores and can go much faster than one byte per clock
 * cycle.  For arm32, bzero() is also good enough.
 */
#define __sk_zero_16(_p)        bzero((_p), 16)
#define __sk_zero_32(_p)        bzero((_p), 32)
#define __sk_zero_48(_p)        bzero((_p), 48)
#define __sk_zero_128(_p)       bzero((_p), 128)
476 #endif /* !__arm64__ */
477 
/*
 * The following are optimized routines which rely on the caller having
 * rounded up the length of the source and destination buffers to a
 * multiple of 4, 8, 32 or 64 bytes (depending on the routine), and on
 * both buffers being 64-bit aligned; they are faster than memcpy().
 *
 * Note: they do not support overlapping ranges.
 */
485 
/*
 * Length threshold (in bytes) above which memcpy() is used instead of
 * the unrolled copy: 2048 on x86_64, 1024 on arm64 and everything else.
 */
#if defined(__x86_64__)
#define SK_COPY_THRES 2048
#else /* !__x86_64__ (arm64 and all others) */
#define SK_COPY_THRES 1024
#endif /* !__x86_64__ */
496 
497 #if (DEVELOPMENT || DEBUG)
498 extern size_t sk_copy_thres;
499 #endif /* (DEVELOPMENT || DEBUG) */
500 
501 /*
502  * Scalar version, 4-bytes multiple.
503  */
504 __attribute__((always_inline))
505 static inline void
sk_copy64_4x(uint32_t * __sized_by (l)src,uint32_t * __sized_by (l)dst,size_t l)506 sk_copy64_4x(uint32_t *__sized_by(l)src, uint32_t *__sized_by(l)dst, size_t l)
507 {
508 #if (DEVELOPMENT || DEBUG)
509 	if (__probable(l <= sk_copy_thres)) {
510 #else
511 	if (__probable(l <= SK_COPY_THRES)) {
512 #endif /* (!DEVELOPMENT && !DEBUG! */
513 		int i;
514 
515 		/*
516 		 * Clang is unable to optimize away bounds checks in the presence of
517 		 * divisions in the loop bound at this time. However, the caller
518 		 * already bounds-checked that each of `src` and `dst` have `l` bytes
519 		 * at them each. It's therefore safe to copy that many bytes.
520 		 */
521 		uint32_t *__unsafe_indexable src_unsafe = src;
522 		uint32_t *__unsafe_indexable dst_unsafe = dst;
523 		for (i = 0; i < l / 4; i++) {
524 			dst_unsafe[i] = src_unsafe[i]; /* [#i*4] */
525 		}
526 	} else {
527 		(void) memcpy((void *)dst, (void *)src, l);
528 	}
529 }
530 
531 /*
532  * Scalar version, 8-bytes multiple.
533  */
534 __attribute__((always_inline))
535 static inline void
536 sk_copy64_8x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
537 {
538 #if (DEVELOPMENT || DEBUG)
539 	if (__probable(l <= sk_copy_thres)) {
540 #else
541 	if (__probable(l <= SK_COPY_THRES)) {
542 #endif /* (!DEVELOPMENT && !DEBUG! */
543 		int i;
544 
545 		/*
546 		 * Clang is unable to optimize away bounds checks in the presence of
547 		 * divisions in the loop bound at this time. However, the caller
548 		 * already bounds-checked that each of `src` and `dst` have `l` bytes
549 		 * at them each. It's therefore safe to copy that many bytes.
550 		 */
551 		uint64_t *__unsafe_indexable src_unsafe = src;
552 		uint64_t *__unsafe_indexable dst_unsafe = dst;
553 		for (i = 0; i < l / 8; i++) {
554 			dst_unsafe[i] = src_unsafe[i]; /* [#i*8] */
555 		}
556 	} else {
557 		(void) memcpy((void *)dst, (void *)src, l);
558 	}
559 }
560 
561 /*
562  * Scalar version (usually faster than SIMD), 32-bytes multiple.
563  */
564 __attribute__((always_inline))
565 static inline void
566 sk_copy64_32x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
567 {
568 #if (DEVELOPMENT || DEBUG)
569 	if (__probable(l <= sk_copy_thres)) {
570 #else
571 	if (__probable(l <= SK_COPY_THRES)) {
572 #endif /* (!DEVELOPMENT && !DEBUG! */
573 		int n, i;
574 
575 		/*
576 		 * Clang is unable to optimize away bounds checks in the presence of
577 		 * divisions in the loop bound at this time. However, the caller
578 		 * already bounds-checked that each of `src` and `dst` have `l` bytes
579 		 * at them each. It's therefore safe to copy that many bytes.
580 		 */
581 		uint64_t *__unsafe_indexable src_unsafe = src;
582 		uint64_t *__unsafe_indexable dst_unsafe = dst;
583 		for (n = 0; n < l / 32; n++) {
584 			i = n * 4;
585 			dst_unsafe[i] = src_unsafe[i];         /* [#(i+0)*8] */
586 			dst_unsafe[i + 1] = src_unsafe[i + 1]; /* [#(i+1)*8] */
587 			dst_unsafe[i + 2] = src_unsafe[i + 2]; /* [#(i+2)*8] */
588 			dst_unsafe[i + 3] = src_unsafe[i + 3]; /* [#(i+3)*8] */
589 		}
590 	} else {
591 		(void) memcpy((void *)dst, (void *)src, l);
592 	}
593 }
594 
595 /*
596  * Scalar version (usually faster than SIMD), 64-bytes multiple.
597  */
598 __attribute__((always_inline))
599 static inline void
600 sk_copy64_64x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
601 {
602 #if (DEVELOPMENT || DEBUG)
603 	if (__probable(l <= sk_copy_thres)) {
604 #else
605 	if (__probable(l <= SK_COPY_THRES)) {
606 #endif /* (!DEVELOPMENT && !DEBUG! */
607 		int n, i;
608 
609 		/*
610 		 * Clang is unable to optimize away bounds checks in the presence of
611 		 * divisions in the loop bound at this time. However, the caller
612 		 * already bounds-checked that each of `src` and `dst` have `l` bytes
613 		 * at them each. It's therefore safe to copy that many bytes.
614 		 */
615 		uint64_t *__unsafe_indexable src_unsafe = src;
616 		uint64_t *__unsafe_indexable dst_unsafe = dst;
617 		for (n = 0; n < l / 64; n++) {
618 			i = n * 8;
619 			dst_unsafe[i] = src_unsafe[i];         /* [#(i+0)*8] */
620 			dst_unsafe[i + 1] = src_unsafe[i + 1]; /* [#(i+1)*8] */
621 			dst_unsafe[i + 2] = src_unsafe[i + 2]; /* [#(i+2)*8] */
622 			dst_unsafe[i + 3] = src_unsafe[i + 3]; /* [#(i+3)*8] */
623 			dst_unsafe[i + 4] = src_unsafe[i + 4]; /* [#(i+4)*8] */
624 			dst_unsafe[i + 5] = src_unsafe[i + 5]; /* [#(i+5)*8] */
625 			dst_unsafe[i + 6] = src_unsafe[i + 6]; /* [#(i+6)*8] */
626 			dst_unsafe[i + 7] = src_unsafe[i + 7]; /* [#(i+7)*8] */
627 		}
628 	} else {
629 		(void) memcpy((void *)dst, (void *)src, l);
630 	}
631 }
632 
/*
 * Public names for the fixed-size copy and zero helpers.
 *
 * The 8-byte copies and the zero helpers map to the same __sk_ name on
 * every platform.  For the 16..40 byte copies, arm64 selects the
 * __sk_vcopy SIMD variants while x86_64 and all other platforms select
 * the scalar __sk_copy variants.
 */
#define sk_copy64_8     __sk_copy64_8           /* scalar only */
#define sk_copy32_8     __sk_copy32_8           /* scalar only */

#if defined(__arm64__)
#define sk_copy64_16    __sk_vcopy64_16         /* SIMD */
#define sk_copy32_16    __sk_vcopy32_16         /* SIMD */
#define sk_copy64_20    __sk_vcopy64_20         /* SIMD */
#define sk_copy64_24    __sk_vcopy64_24         /* SIMD */
#define sk_copy64_32    __sk_vcopy64_32         /* SIMD */
#define sk_copy32_32    __sk_vcopy32_32         /* SIMD */
#define sk_copy64_40    __sk_vcopy64_40         /* SIMD */
#else /* !__arm64__ (x86_64 and all others) */
#define sk_copy64_16    __sk_copy64_16          /* scalar */
#define sk_copy32_16    __sk_copy32_16          /* scalar */
#define sk_copy64_20    __sk_copy64_20          /* scalar */
#define sk_copy64_24    __sk_copy64_24          /* scalar */
#define sk_copy64_32    __sk_copy64_32          /* scalar */
#define sk_copy32_32    __sk_copy32_32          /* scalar */
#define sk_copy64_40    __sk_copy64_40          /* scalar */
#endif /* !__arm64__ */

#define sk_zero_16      __sk_zero_16
#define sk_zero_32      __sk_zero_32
#define sk_zero_48      __sk_zero_48
#define sk_zero_128     __sk_zero_128
679 
680 /*
681  * Do not use these directly.
682  * Use the skn_ variants if you need custom probe names.
683  */
/*
 * Allocate one `type` object through kalloc_type_tag() with Z_ZERO added
 * to `flags` and the tag taken from `(name)->tag`, fire the `probename`
 * probe with the type name, the flags and the result, and evaluate to
 * the result.  `flags` is evaluated twice.
 */
#define _sk_alloc_type(probename, type, flags, name)                    \
({                                                                      \
	/* XXX Modify this to use KT_PRIV_ACCT later  */                \
	__auto_type _sk_p = kalloc_type_tag(type, Z_ZERO | (flags),     \
	    (name)->tag);                                               \
	DTRACE_SKYWALK3(probename, char *, #type, int, (flags),         \
	    void *, _sk_p);                                             \
	_sk_p;                                                          \
})
693 
/*
 * Allocate `count` elements of `type` through kalloc_type_tag() with
 * Z_ZERO added to `flags` and the tag taken from `(name)->tag`, fire the
 * `probename` probe with the type name, count, flags and result, and
 * evaluate to the result.  `count` and `flags` are evaluated twice.
 */
#define _sk_alloc_type_array(probename, type, count, flags, name)       \
({                                                                      \
	__auto_type _sk_p = kalloc_type_tag(type, (count),              \
	    Z_ZERO | (flags), (name)->tag);                             \
	DTRACE_SKYWALK4(probename, char *, #type, size_t, (count),      \
	    int, (flags), void *, _sk_p);                               \
	_sk_p;                                                          \
})
702 
/*
 * Allocate `size` bytes from the variable-size type view `heap` through
 * kalloc_type_var_impl(), with Z_ZERO added to `flags` and mixed with
 * `(name)->tag`, fire the `probename` probe, and evaluate to the result.
 * The probe reports the heap name starting 5 characters into kt_name
 * (NOTE(review): presumably skipping a fixed prefix -- confirm).
 */
#define _sk_alloc_type_hash(probename, heap, size, flags, name)         \
({                                                                      \
	__auto_type _sk_p = kalloc_type_var_impl((heap), (size),        \
	    __zone_flags_mix_tag((flags) | Z_ZERO, (name)->tag), NULL); \
	DTRACE_SKYWALK4(probename, char *, (heap)->kt_name + 5,         \
	    size_t, (size), int, (flags), void *, _sk_p);               \
	_sk_p;                                                          \
})
711 
/*
 * Resize the `type` array `elem` from `oldcount` to `newcount` elements
 * through krealloc_type_tag() with Z_ZERO added to `flags` and the tag
 * taken from `(name)->tag`; the `probename` probe fires after the call
 * and the macro evaluates to the result of krealloc_type_tag().
 */
#define _sk_realloc_type_array(probename, type, oldcount, newcount, elem, flags, name) \
({                                                                      \
	__auto_type _sk_p = krealloc_type_tag(type, (oldcount),         \
	    (newcount), (elem), Z_ZERO | (flags), (name)->tag);         \
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldcount),  \
	    size_t, (newcount), int, (flags), void *, _sk_p);           \
	_sk_p;                                                          \
})
720 
/*
 * Allocate an `htype` header followed by `count` elements of `type`
 * through the (htype, type, count) form of kalloc_type_tag(), with
 * Z_ZERO added to `flags` and the tag taken from `(name)->tag`; fire the
 * `probename` probe and evaluate to the result.
 */
#define _sk_alloc_type_header_array(probename, htype, type, count, flags, name) \
({                                                                      \
	__auto_type _sk_p = kalloc_type_tag(htype, type, (count),       \
	    Z_ZERO | (flags), (name)->tag);                             \
	DTRACE_SKYWALK5(probename, char *, #htype, char *, #type,       \
	    size_t, (count), int, (flags), void *, _sk_p);              \
	_sk_p;                                                          \
})
729 
/*
 * Fire the `probename` probe with the type name and `elem`, then release
 * `elem` through kfree_type().  The expansion is a bare brace block, so
 * use it as a complete statement of its own.
 */
#define _sk_free_type(probename, type, elem)                            \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK2(probename, char *, #type, void *, (elem));      \
	kfree_type(type, (elem));                                       \
}
735 
/*
 * Fire the `probename` probe with the type name, `count` and `elem`,
 * then release the `count`-element array through kfree_type().  The
 * expansion is a bare brace block, so use it as a complete statement of
 * its own.
 */
#define _sk_free_type_array(probename, type, count, elem)               \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK3(probename, char *, #type, size_t, (count),      \
	    void *, (elem));                                            \
	kfree_type(type, (count), (elem));                              \
}
742 
/*
 * Same as _sk_free_type_array(), except that the array is released
 * through kfree_type_counted_by().  The expansion is a bare brace block,
 * so use it as a complete statement of its own.
 */
#define _sk_free_type_array_counted_by(probename, type, count, elem)    \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK3(probename, char *, #type, size_t, (count),      \
	    void *, (elem));                                            \
	kfree_type_counted_by(type, (count), (elem));                   \
}
749 
/*
 * Fire the `probename` probe with the heap name (starting 5 characters
 * into kt_name), `size` and `elem`, then release `elem` through
 * kfree_type_var_impl().  The expansion is a bare brace block, so use it
 * as a complete statement of its own.
 */
#define _sk_free_type_hash(probename, heap, size, elem)                 \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK3(probename, char *, (heap)->kt_name + 5,         \
	    size_t, (size), void *, (elem));                            \
	kfree_type_var_impl((heap), (elem), (size));                    \
}
756 
/*
 * Fire the `probename` probe with both type names, `count` and `elem`,
 * then release the header-plus-array allocation through the
 * (htype, type, count, elem) form of kfree_type().  The expansion is a
 * bare brace block, so use it as a complete statement of its own.
 */
#define _sk_free_type_header_array(probename, htype, type, count, elem) \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK4(probename, char *, #htype, char *, #type,       \
	    size_t, (count), void *, (elem));                           \
	kfree_type(htype, type, (count), (elem));                       \
}
763 
/*
 * Allocate `size` bytes through kalloc_data_tag() with Z_ZERO added to
 * `flags` and the tag taken from `(name)->tag`, fire the `probename`
 * probe with the size, flags and result, and evaluate to the result
 * (a void pointer).  `size` and `flags` are evaluated twice.
 */
#define _sk_alloc_data(probename, size, flags, name)                    \
({                                                                      \
	void *_sk_p = kalloc_data_tag((size), Z_ZERO | (flags),         \
	    (name)->tag);                                               \
	DTRACE_SKYWALK3(probename, size_t, (size), int, (flags),        \
	    void *, _sk_p);                                             \
	_sk_p;                                                          \
})
773 
/*
 * Resize the data buffer `elem` from `oldsize` to `newsize` bytes
 * through krealloc_data_tag() with Z_ZERO added to `flags` and the tag
 * taken from `(name)->tag`; the `probename` probe fires after the call
 * and the macro evaluates to the result (a void pointer).
 */
#define _sk_realloc_data(probename, elem, oldsize, newsize, flags, name) \
({                                                                      \
	void *_sk_p = krealloc_data_tag((elem), (oldsize), (newsize),   \
	    Z_ZERO | (flags), (name)->tag);                             \
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldsize),   \
	    size_t, (newsize), int, (flags), void *, _sk_p);            \
	_sk_p;                                                          \
})
784 
/*
 * Fire the `probename` probe with `elem` and `size`, then release the
 * buffer through kfree_data().  The expansion is a bare brace block, so
 * use it as a complete statement of its own.
 */
#define _sk_free_data(probename, elem, size)                            \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size));     \
	kfree_data((elem), (size));                                     \
}
790 
/*
 * Same as _sk_free_data(), except that the buffer is released through
 * kfree_data_sized_by().  The expansion is a bare brace block, so use it
 * as a complete statement of its own.
 */
#define _sk_free_data_sized_by(probename, elem, size)                   \
{                                                                       \
	/* probe first, then free */                                    \
	DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size));     \
	kfree_data_sized_by((elem), (size));                            \
}
796 
797 #define sk_alloc_type(type, flags, tag)                                 \
798 	_sk_alloc_type(sk_alloc_type, type, flags, tag)
799 
800 #define sk_alloc_type_array(type, count, flags, tag)                    \
801 	_sk_alloc_type_array(sk_alloc_type_array, type, count, flags, tag)
802 
803 #define sk_alloc_type_hash(heap, size, flags, tag)                      \
804 	_sk_alloc_type_hash(sk_alloc_type_hash, heap, size, flags, tag)
805 
806 #define sk_alloc_type_header_array(htype, type, count, flags, tag)      \
807 	_sk_alloc_type_header_array(sk_alloc_type_header_array, htype,  \
808 	type, count, flags, tag)
809 
810 #define sk_realloc_type_array(type, oldsize, newsize, elem, flags, tag) \
811 	_sk_realloc_type_array(sk_realloc_type_array, type,             \
812 	oldsize, newsize, elem, flags, tag)
813 
/*
 * Typed free entry points.  Each macro forwards its arguments unchanged
 * to the matching _sk_free_type* helper and passes its own name as the
 * leading probename argument.
 */
#define sk_free_type(type, elem)                                        \
	_sk_free_type(sk_free_type, type, elem)

#define sk_free_type_array(type, count, elem)                           \
	_sk_free_type_array(sk_free_type_array, type, count, elem)

#define sk_free_type_array_counted_by(type, count, elem)                \
	_sk_free_type_array_counted_by(sk_free_type_array_counted_by, type, count, elem)

#define sk_free_type_hash(heap, size, elem)                             \
	_sk_free_type_hash(sk_free_type_hash, heap, size, elem)

#define sk_free_type_header_array(htype, type, count, elem)             \
	_sk_free_type_header_array(sk_free_type_header_array, htype,    \
	type, count, elem)
829 
/*
 * Untyped (data buffer) allocation, reallocation and free entry points.
 * Each macro forwards its arguments unchanged to the matching _sk_*
 * helper and passes its own name as the leading probename argument.
 */
#define sk_alloc_data(size, flags, tag)                                 \
	_sk_alloc_data(sk_alloc_data, size, flags, tag)

#define sk_realloc_data(elem, oldsize, newsize, flags, tag)             \
	_sk_realloc_data(sk_realloc_data, elem, oldsize, newsize,       \
	flags, tag)

#define sk_free_data(elem, size)                                        \
	_sk_free_data(sk_free_data, elem, size)

#define sk_free_data_sized_by(elem, size)                               \
	_sk_free_data_sized_by(sk_free_data_sized_by, elem, size)
842 
/*
 * The skn_ variants take an extra leading `name` argument that is
 * token-pasted onto the end of the DTrace probe name.  Use them when the
 * same call appears two or more times within one function and each
 * callsite needs a distinct probe name.
 */
#define skn_realloc(name, elem, oldsize, newsize, flags, tag)           \
	_sk_realloc(sk_realloc_ ## name, elem, oldsize, newsize, flags, \
	tag)
851 
/*
 * Named-probe forms of the typed allocation wrappers.  The probe name
 * handed to the _sk_alloc_type* helper is the corresponding sk_ wrapper
 * name with "_<name>" appended; all other arguments pass through
 * unchanged.
 */
#define skn_alloc_type(name, type, flags, tag)                          \
	_sk_alloc_type(sk_alloc_type_ ## name, type, flags, tag)

#define skn_alloc_type_array(name, type, count, flags, tag)             \
	_sk_alloc_type_array(sk_alloc_type_array_ ## name, type, count, \
	flags, tag)

#define skn_alloc_type_hash(name, heap, size, flags, tag)               \
	_sk_alloc_type_hash(sk_alloc_type_hash_ ## name, heap, size,    \
	flags, tag)

#define skn_alloc_type_header_array(name, htype, type, count, flags, tag) \
	_sk_alloc_type_header_array(sk_alloc_type_header_array_ ## name, \
	htype, type, count, flags, tag)
866 
/*
 * Named-probe forms of the typed free wrappers.  `name` is pasted onto
 * the probe-name prefix given in each macro; all other arguments pass
 * through unchanged to the _sk_free_type* helper.
 */
#define skn_free_type(name, type, elem)                                 \
	_sk_free_type(sk_free_type_ ## name, type, elem)

#define skn_free_type_array(name, type, count, elem)                    \
	_sk_free_type_array(sk_free_type_array_ ## name, type, count,   \
	elem)

/*
 * NOTE(review): the probe-name prefix here is sk_free_type_array_, the
 * same prefix skn_free_type_array() uses, rather than
 * sk_free_type_array_counted_by_.  The two macros therefore produce the
 * same probe name for a given `name`.  Confirm whether that is intended
 * before relying on the probe name to tell them apart.
 */
#define skn_free_type_array_counted_by(name, type, count, elem)         \
	_sk_free_type_array_counted_by(sk_free_type_array_ ## name, type, count,   \
	elem)

#define skn_free_type_hash(name, heap, size, elem)                      \
	_sk_free_type_hash(sk_free_type_hash_ ## name, heap, size, elem)

#define skn_free_type_header_array(name, htype, type, count, elem)      \
	_sk_free_type_header_array(sk_free_type_header_array_ ## name,  \
	htype, type, count, elem)
884 
/*
 * Named-probe forms of the untyped data wrappers.  The probe name handed
 * to the _sk_*_data helper is the corresponding sk_ wrapper name with
 * "_<name>" appended.
 * NOTE(review): sk_free_data_sized_by() has no skn_ counterpart in this
 * part of the file.
 */
#define skn_alloc_data(name, size, flags, tag)                          \
	_sk_alloc_data(sk_alloc_data_ ## name, size, flags, tag)

#define skn_realloc_data(name, elem, oldsize, newsize, flags, tag)      \
	_sk_realloc_data(sk_realloc_data_ ## name, elem, oldsize, newsize,\
	flags, tag)

#define skn_free_data(name, elem, size)                                 \
	_sk_free_data(sk_free_data_ ## name, elem, size)
894 
/*
 * Descriptor for one allocation tag, as consumed by __sk_tag_make().
 * SKMEM_TAG_DEFINE() emits one instance per tag it defines.
 */
struct sk_tag_spec {
	kern_allocation_name_t *skt_var;        /* address of the tag variable */
	const char             *skt_name;       /* name string for the tag */
};
899 
/*
 * Takes a tag spec; SKMEM_TAG_DEFINE() hands this function and the spec
 * to STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, ...).  Defined outside this
 * header.  NOTE(review): presumably it creates the allocation name and
 * stores it through spec->skt_var -- confirm against the definition.
 */
extern void __sk_tag_make(const struct sk_tag_spec *spec);
901 
/*
 * Define the allocation tag variable `var` together with its startup
 * registration.  The expansion produces, in order:
 *   - `var`, declared SECURITY_READ_ONLY_LATE(kern_allocation_name_t);
 *   - a __startup_data struct sk_tag_spec named __sktag_<var> holding
 *     &var and `name`;
 *   - a STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, ...) entry pairing
 *     __sk_tag_make with the address of that spec.
 * The expansion does not end in a semicolon.
 */
#define SKMEM_TAG_DEFINE(var, name) \
	SECURITY_READ_ONLY_LATE(kern_allocation_name_t) var;            \
	__startup_data struct sk_tag_spec __sktag_##var = {             \
	    .skt_var = &var, .skt_name = name,                          \
	};                                                              \
	STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, __sk_tag_make, &__sktag_##var)
908 
/*!
 *  @abstract Masked comparison of two n-byte buffers.  (Scalar version)
 *
 *  @discussion
 *  Each byte of src1 is XORed with the corresponding byte of src2, the
 *  difference is ANDed with the matching byte of byte_mask, and the
 *  per-byte results are ORed together.  The return value is therefore
 *  zero when the buffers agree in every masked-in bit and non-zero
 *  otherwise.  A zero-length comparison always returns zero.
 *
 *  @param src1 first input buffer, n bytes long
 *  @param src2 second input buffer, n bytes long
 *  @param byte_mask per-byte mask, n bytes long, applied before comparison
 *  @param n number of bytes to compare
 */
static inline int
__sk_memcmp_mask_scalar(const uint8_t *__counted_by(n)src1,
    const uint8_t *__counted_by(n)src2,
    const uint8_t *__counted_by(n)byte_mask, size_t n)
{
	uint32_t diff = 0;
	size_t idx = 0;

	while (idx < n) {
		/* Bits that differ at this position and are selected by the mask. */
		const uint32_t delta =
		    (uint32_t)((src1[idx] ^ src2[idx]) & byte_mask[idx]);

		diff |= delta;
		idx++;
	}
	return (int)diff;
}
934 
/*
 * Fixed-length form of __sk_memcmp_mask_scalar() for 16-byte buffers;
 * returns whatever that routine returns for n == 16.
 */
static inline int
__sk_memcmp_mask_16B_scalar(const uint8_t *__counted_by(16)src1,
    const uint8_t *__counted_by(16)src2,
    const uint8_t *__counted_by(16)byte_mask)
{
	const size_t nbytes = 16;

	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, nbytes);
}
942 
/*
 * Fixed-length form of __sk_memcmp_mask_scalar() for 32-byte buffers;
 * returns whatever that routine returns for n == 32.
 */
static inline int
__sk_memcmp_mask_32B_scalar(const uint8_t *__counted_by(32)src1,
    const uint8_t *__counted_by(32)src2,
    const uint8_t *__counted_by(32)byte_mask)
{
	const size_t nbytes = 32;

	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, nbytes);
}
950 
/*
 * Fixed-length form of __sk_memcmp_mask_scalar() for 48-byte buffers;
 * returns whatever that routine returns for n == 48.
 */
static inline int
__sk_memcmp_mask_48B_scalar(const uint8_t *__counted_by(48)src1,
    const uint8_t *__counted_by(48)src2,
    const uint8_t *__counted_by(48)byte_mask)
{
	const size_t nbytes = 48;

	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, nbytes);
}
958 
/*
 * Fixed-length form of __sk_memcmp_mask_scalar() for 64-byte buffers;
 * returns whatever that routine returns for n == 64.
 */
static inline int
__sk_memcmp_mask_64B_scalar(const uint8_t *__counted_by(64)src1,
    const uint8_t *__counted_by(64)src2,
    const uint8_t *__counted_by(64)byte_mask)
{
	const size_t nbytes = 64;

	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, nbytes);
}
966 
/*
 * Fixed-length form of __sk_memcmp_mask_scalar() for 80-byte buffers;
 * returns whatever that routine returns for n == 80.
 */
static inline int
__sk_memcmp_mask_80B_scalar(const uint8_t *__counted_by(80)src1,
    const uint8_t *__counted_by(80)src2,
    const uint8_t *__counted_by(80)byte_mask)
{
	const size_t nbytes = 80;

	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, nbytes);
}
974 
975 #if defined(__arm64__) || defined(__arm__) || defined(__x86_64__)
/*
 * Fixed-length masked compares for 16, 32, 48, 64 and 80 byte buffers,
 * defined outside this header.  Each takes two input buffers and a byte
 * mask of the stated length.
 * NOTE(review): presumably these follow the same contract as the scalar
 * routines (zero iff the buffers match under byte_mask) -- confirm
 * against the implementation.
 */
extern int os_memcmp_mask_16B(const uint8_t *__counted_by(16)src1,
    const uint8_t *__counted_by(16)src2,
    const uint8_t *__counted_by(16)byte_mask);
extern int os_memcmp_mask_32B(const uint8_t *__counted_by(32)src1,
    const uint8_t *__counted_by(32)src2,
    const uint8_t *__counted_by(32)byte_mask);
extern int os_memcmp_mask_48B(const uint8_t *__counted_by(48)src1,
    const uint8_t *__counted_by(48)src2,
    const uint8_t *__counted_by(48)byte_mask);
extern int os_memcmp_mask_64B(const uint8_t *__counted_by(64)src1,
    const uint8_t *__counted_by(64)src2,
    const uint8_t *__counted_by(64)byte_mask);
extern int os_memcmp_mask_80B(const uint8_t *__counted_by(80)src1,
    const uint8_t *__counted_by(80)src2,
    const uint8_t *__counted_by(80)byte_mask);
991 
/*
 * On arm64, arm and x86_64, use the SIMD variants: the fixed-length
 * sk_memcmp_mask_*B names map to the os_memcmp_mask_*B routines, and
 * the variable-length sk_memcmp_mask maps to __sk_memcmp_mask(), which
 * is built on top of them.
 */
#define sk_memcmp_mask                  __sk_memcmp_mask
#define sk_memcmp_mask_16B              os_memcmp_mask_16B
#define sk_memcmp_mask_32B              os_memcmp_mask_32B
#define sk_memcmp_mask_48B              os_memcmp_mask_48B
#define sk_memcmp_mask_64B              os_memcmp_mask_64B
#define sk_memcmp_mask_80B              os_memcmp_mask_80B
1001 
/*!
 *  @abstract Masked comparison of two n-byte buffers.  (SIMD version)
 *
 *  @discussion
 *  Returns zero if the two buffers are identical once the byte mask has
 *  been applied, otherwise non-zero.  A zero-length comparison always
 *  returns zero.
 *
 *  The buffer is consumed in 64-, then 32-, then 16-byte blocks using the
 *  fixed-length sk_memcmp_mask_*B routines.  A leftover of fewer than 16
 *  bytes is handled by re-comparing the final 16 bytes of the buffer
 *  (overlapping bytes that were already checked) when n >= 16, or by a
 *  byte-at-a-time loop when the whole buffer is shorter than 16 bytes.
 *
 *  @param src1 first input buffer, n bytes long
 *  @param src2 second input buffer, n bytes long
 *  @param byte_mask per-byte mask, n bytes long, applied before comparison
 *  @param n number of bytes to compare
 */
static inline int
__sk_memcmp_mask(const uint8_t *__counted_by(n)src1,
    const uint8_t *__counted_by(n)src2,
    const uint8_t *__counted_by(n)byte_mask, size_t n)
{
	uint32_t diff = 0;
	size_t off = 0;

	/* As many whole 64-byte blocks as fit. */
	while (off + 64 <= n) {
		diff |= sk_memcmp_mask_64B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 64;
	}
	/* Fewer than 64 bytes remain, so at most one 32-byte block fits... */
	if (off + 32 <= n) {
		diff |= sk_memcmp_mask_32B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 32;
	}
	/* ...and after that at most one 16-byte block. */
	if (off + 16 <= n) {
		diff |= sk_memcmp_mask_16B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 16;
	}
	if (off >= n) {
		/* Length was a multiple of 16 (or zero): nothing left over. */
		return (int)diff;
	}

	if (n >= 16) {
		/*
		 * Compare the last 16 bytes with vector code; the window
		 * overlaps bytes that were already compared above.
		 */
		const size_t tail = n - 16;

		diff |= sk_memcmp_mask_16B(src1 + tail, src2 + tail,
		    byte_mask + tail);
	} else {
		/* Use scalar code if n < 16; off is still 0 here. */
		for (; off < n; off++) {
			diff |= (src1[off] ^ src2[off]) & byte_mask[off];
		}
	}
	return (int)diff;
}
1049 #else /* !(__arm64__ || __arm__ || __x86_64__) */
/*
 * On every other architecture, map the sk_memcmp_mask* names to the
 * scalar routines defined earlier in this header.
 */
#define sk_memcmp_mask                  __sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B              __sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B              __sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B              __sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B              __sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B              __sk_memcmp_mask_80B_scalar
1059 #endif /* !(__arm64__ || __arm__ || __x86_64__) */
1060 
/*
 * The scalar routines are also reachable under explicit *_scalar names
 * on all platforms, for callers that want the scalar code regardless of
 * which variant the plain sk_memcmp_mask* names select.
 */
#define sk_memcmp_mask_scalar           __sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B_scalar       __sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B_scalar       __sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B_scalar       __sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B_scalar       __sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B_scalar       __sk_memcmp_mask_80B_scalar
1070 
1071 #endif /* KERNEL */
1072 #endif /* PRIVATE || BSD_KERNEL_PRIVATE */
1073 #endif /* !_SKYWALK_COMMON_H_ */
1074