1 /*
2 * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 #ifndef _SKYWALK_COMMON_H_
30 #define _SKYWALK_COMMON_H_
31
32 #if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE)
33 /*
34 * Routines common to kernel and userland. This file is intended to
35 * be included by the Skywalk kernel and libsyscall code.
36 */
37
38 #include <skywalk/os_skywalk_private.h>
39
40 #ifndef KERNEL
41 #if defined(LIBSYSCALL_INTERFACE)
__BEGIN_DECLS
/*
 * printf-style formatter used by the SK_ABORT() macros in this header
 * to emit the abort message (presumably to stderr, per its name).
 */
extern int fprintf_stderr(const char *format, ...);
__END_DECLS
45
46 /* CSTYLED */
47
/*
 * Print "msg" followed by a newline through fprintf_stderr(), then
 * trap. An empty __asm__ statement is issued right before the trap;
 * presumably it keeps distinct trap sites from being merged -- TODO
 * confirm.
 */
#define SK_ABORT(msg) do { \
	(void) fprintf_stderr("%s\n", msg); \
	__asm__(""); __builtin_trap(); \
} while (0)
52
/*
 * Same as SK_ABORT(), but also prints "cause" in hexadecimal. "cause"
 * is formatted with %x, so it must be an unsigned-int compatible value.
 */
#define SK_ABORT_WITH_CAUSE(msg, cause) do { \
	(void) fprintf_stderr("%s: cause 0x%x\n", msg, cause); \
	__asm__(""); __builtin_trap(); \
} while (0)
57
/* In this (non-kernel, libsyscall) configuration this is a plain alias of SK_ABORT(). */
#define SK_ABORT_DYNAMIC(msg) SK_ABORT(msg)
59
60
/*
 * Always-on assertion: when EX evaluates to false, abort through
 * SK_ABORT() with the stringified expression appended to the message.
 * EX is evaluated exactly once.
 */
#define VERIFY(EX) do { \
	if (__improbable(!(EX))) { \
		SK_ABORT("assertion failed: " #EX); \
		/* NOTREACHED */ \
		__builtin_unreachable(); \
	} \
} while (0)
68
/*
 * ASSERT() is VERIFY() on DEBUG/DEVELOPMENT builds; on all other
 * builds it expands to a no-op and EX is not evaluated.
 */
#if (DEBUG || DEVELOPMENT)
#define ASSERT(EX) VERIFY(EX)
#else /* !DEBUG && !DEVELOPMENT */
#define ASSERT(EX) ((void)0)
#endif /* !DEBUG && !DEVELOPMENT */
#endif /* LIBSYSCALL_INTERFACE */
75 #endif /* !KERNEL */
76
/*
 * Given a pointer to "member" embedded in an object of type "type",
 * recover a pointer to the enclosing object by subtracting the byte
 * offset of the member. "ptr" is parenthesized so that a compound
 * argument (e.g. "c ? a : b" or "p + 1") is cast as a whole rather
 * than only its first operand.
 */
#ifndef container_of
#define container_of(ptr, type, member) \
	((type *)(((uintptr_t)(ptr)) - offsetof(type, member)))
#endif
81
/*
 * Prefetch the cache line holding byte offset "n" from address "a".
 * SK_PREFETCH hints a read (rw = 0), SK_PREFETCHW hints a write
 * (rw = 1); both request the highest temporal locality (3).
 */
#define SK_PREFETCH(a, n) \
	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 0, 3)
#define SK_PREFETCHW(a, n) \
	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 1, 3)
89
/*
 * Round "x" up to the next multiple of "align". This is the slower,
 * modulo-based form for when "align" is not a power of 2 (otherwise
 * use P2ROUNDUP). "align" must be non-zero, and both arguments are
 * evaluated more than once, so they must be free of side effects.
 */
#define SK_ROUNDUP(x, align) \
	((((x) % (align)) == 0) ? (x) : ((x) + ((align) - ((x) % (align)))))
95
/* compile-time assert; only defined here if no earlier header provided one */
#ifndef _CASSERT
#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed")
#endif /* !_CASSERT */
100
/*
 * Power-of-2 address alignment check: true when the low bits of "v"
 * selected by (a - 1) are all zero. Only meaningful when "a" is a
 * power of 2.
 */
#ifndef IS_P2ALIGNED
#define IS_P2ALIGNED(v, a) \
	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
#endif /* IS_P2ALIGNED */
106
/* Shorthands for the aligned / packed / unused compiler attributes. */
#define __sk_aligned(a) __attribute__((__aligned__(a)))
#define __sk_packed __attribute__((__packed__))
#define __sk_unused __attribute__((__unused__))
110
111 #ifdef KERNEL
112 #include <sys/sdt.h>
113
/*
 * Scalar copy of one 64-bit word (8 bytes) from src to dst; both
 * pointers are expected to be 64-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_8(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];        /* bytes 0..7 */
}
123
/*
 * Scalar copy of 8 bytes between buffers that are only guaranteed to
 * be 32-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_8(uint32_t *src, uint32_t *dst)
{
#if !defined(__x86_64__)
	dst[0] = src[0];        /* dw[0] */
	dst[1] = src[1];        /* dw[1] */
#else
	/* x86_64: a single (possibly unaligned) 64-bit scalar move */
	__sk_copy64_8((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
#endif
}
139
/*
 * Copy 16-bytes total, 64-bit aligned, scalar.
 *
 * Marked always_inline like every other fixed-size copy helper in this
 * header, so that it is inlined regardless of optimization level.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_16(uint64_t *src, uint64_t *dst)
{
	*dst++ = *src++;        /* [#0*8] */
	*dst = *src;            /* [#1*8] */
}
149
/*
 * Copy 16-bytes total, 64-bit aligned, SIMD (if available).
 *
 * On arm64 this is one 128-bit load into q0 followed by one 128-bit
 * store; every other architecture falls back to the scalar
 * __sk_copy64_16().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_16(uint64_t *src, uint64_t *dst)
{
#if defined(__arm64__)
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]] \n\t"
		"str q0, [%[dst]] \n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		/* v0 is overwritten; "memory" covers the store through dst */
		: "v0", "memory"
	);
	/* END CSTYLED */
#else
	__sk_copy64_16(src, dst);
#endif
}
172
/*
 * Scalar copy of 16 bytes as four 32-bit words; the buffers only need
 * to be 32-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_16(uint32_t *src, uint32_t *dst)
{
	dst[0] = src[0];        /* bytes  0..3  */
	dst[1] = src[1];        /* bytes  4..7  */
	dst[2] = src[2];        /* bytes  8..11 */
	dst[3] = src[3];        /* bytes 12..15 */
}
185
/*
 * Copy 16-bytes total, 32-bit aligned, SIMD (if available).
 *
 * arm64 reuses the 64-bit SIMD routine on the 32-bit aligned pointers
 * (the existing comment states the SIMD move tolerates unaligned
 * addresses); other architectures use the scalar __sk_copy32_16().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy32_16(uint32_t *src, uint32_t *dst)
{
#if defined(__arm64__)
	/* use SIMD unaligned move on arm64 */
	__sk_vcopy64_16((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
#else
	__sk_copy32_16(src, dst);
#endif
}
200
/*
 * Scalar copy of 20 bytes from 64-bit aligned buffers: two full 64-bit
 * words followed by one 32-bit word. The upper 4 bytes of the third
 * destination word are left untouched.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_20(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];                                /* bytes  0..7  */
	dst[1] = src[1];                                /* bytes  8..15 */
	*(uint32_t *)&dst[2] = *(uint32_t *)&src[2];    /* bytes 16..19 */
}
212
/*
 * Copy 20-bytes total, 64-bit aligned, SIMD (if available).
 *
 * Exactly 20 bytes are read from src and written to dst. The earlier
 * form loaded a 32-byte register pair from src, reading 12 bytes past
 * the end of a 20-byte source; the load is now split into 16 + 4 bytes
 * so that it matches the store.
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_20(uint64_t *src, uint64_t *dst)
{
#if defined(__arm64__)
	/*
	 * Load/store 16 bytes, then load/store the trailing 4 bytes;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]] \n\t"
		"str q0, [%[dst]] \n\t"
		"ldr s1, [%[src], #16] \n\t"
		"str s1, [%[dst], #16] \n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "v1", "memory"
	);
	/* END CSTYLED */
#else
	__sk_copy64_20(src, dst);
#endif
}
239
/*
 * Scalar copy of 24 bytes as three 64-bit words; buffers are expected
 * to be 64-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_24(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];        /* bytes  0..7  */
	dst[1] = src[1];        /* bytes  8..15 */
	dst[2] = src[2];        /* bytes 16..23 */
}
251
/*
 * Copy 24-bytes total, 64-bit aligned, SIMD (if available).
 *
 * arm64: bytes 0..15 move through q0, then bytes 16..23 move through
 * d0 (the low 64 bits of the same vector register). Other
 * architectures use the scalar __sk_copy64_24().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_24(uint64_t *src, uint64_t *dst)
{
#if defined(__arm64__)
	/*
	 * Use 16-bytes load/store and 8-bytes load/store on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]] \n\t"
		"str q0, [%[dst]] \n\t"
		"ldr d0, [%[src], #16] \n\t"
		"str d0, [%[dst], #16] \n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "memory"
	);
	/* END CSTYLED */
#else
	__sk_copy64_24(src, dst);
#endif
}
279
/*
 * Scalar copy of 32 bytes as four 64-bit words; buffers are expected
 * to be 64-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_32(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];        /* bytes  0..7  */
	dst[1] = src[1];        /* bytes  8..15 */
	dst[2] = src[2];        /* bytes 16..23 */
	dst[3] = src[3];        /* bytes 24..31 */
}
292
/*
 * Copy 32-bytes total, 64-bit aligned, SIMD (if available).
 *
 * arm64: one load-pair into q0/q1 and one store-pair move all 32
 * bytes. Other architectures use the scalar __sk_copy64_32().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_32(uint64_t *src, uint64_t *dst)
{
#if defined(__arm64__)
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldp q0, q1, [%[src]] \n\t"
		"stp q0, q1, [%[dst]] \n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "v1", "memory"
	);
	/* END CSTYLED */
#else
	__sk_copy64_32(src, dst);
#endif
}
315
/*
 * Scalar copy of 32 bytes as eight 32-bit words; the buffers only need
 * to be 32-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_32(uint32_t *src, uint32_t *dst)
{
	dst[0] = src[0];        /* bytes  0..3  */
	dst[1] = src[1];        /* bytes  4..7  */
	dst[2] = src[2];        /* bytes  8..11 */
	dst[3] = src[3];        /* bytes 12..15 */
	dst[4] = src[4];        /* bytes 16..19 */
	dst[5] = src[5];        /* bytes 20..23 */
	dst[6] = src[6];        /* bytes 24..27 */
	dst[7] = src[7];        /* bytes 28..31 */
}
332
/*
 * Copy 32-bytes total, 32-bit aligned, SIMD (if available).
 *
 * arm64 reuses the 64-bit SIMD routine on the 32-bit aligned pointers;
 * other architectures use the scalar __sk_copy32_32().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy32_32(uint32_t *src, uint32_t *dst)
{
#if defined(__arm64__)
	/* use SIMD unaligned move on arm64 */
	__sk_vcopy64_32((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
#else
	__sk_copy32_32(src, dst);
#endif
}
347
/*
 * Scalar copy of 40 bytes as five 64-bit words; buffers are expected
 * to be 64-bit aligned.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_40(uint64_t *src, uint64_t *dst)
{
	dst[0] = src[0];        /* bytes  0..7  */
	dst[1] = src[1];        /* bytes  8..15 */
	dst[2] = src[2];        /* bytes 16..23 */
	dst[3] = src[3];        /* bytes 24..31 */
	dst[4] = src[4];        /* bytes 32..39 */
}
361
/*
 * Copy 40-bytes total, 64-bit aligned, SIMD (if available).
 *
 * arm64: bytes 0..31 move through the q0/q1 pair, then bytes 32..39
 * move through d0. Other architectures use the scalar
 * __sk_copy64_40().
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_40(uint64_t *src, uint64_t *dst)
{
#if defined(__arm64__)
	/*
	 * Use 32-bytes load/store pair and 8-bytes load/store on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldp q0, q1, [%[src]] \n\t"
		"stp q0, q1, [%[dst]] \n\t"
		"ldr d0, [%[src], #32] \n\t"
		"str d0, [%[dst], #32] \n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "v1", "memory"
	);
	/* END CSTYLED */
#else
	__sk_copy64_40(src, dst);
#endif
}
389
390 #if defined(__arm64__)
391 /*
392 * On arm64, the following inline assembly fixed-length routines have
393 * fewer clock cycles than bzero(). We can directly use vector registers
394 * without saving/restoring them unlike on x86_64/arm32.
395 */
396
/*
 * Zero 16-bytes total at "p". Despite sitting with the SIMD routines,
 * this one stores the general-purpose zero register pair (xzr) and
 * does not touch any vector register.
 */
__attribute__((always_inline))
static inline void
__sk_zero_16(void *p)
{
	/*
	 * Use 16-bytes store pair using 64-bit zero register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"stp xzr, xzr, [%[p]] \n\t"
		:
		: [p] "r" (p)
		: "memory"
	);
	/* END CSTYLED */
}
417
/*
 * Zero 32-bytes total at "p", SIMD: v0 is cleared by XORing it with
 * itself, then stored twice with one store-pair.
 */
__attribute__((always_inline))
static inline void
__sk_zero_32(void *p)
{
	/*
	 * Use 32-bytes store pair using zeroed v0 register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0 \n\t"
		"stp q0, q0, [%[p]] \n\t"
		:
		: [p] "r" (p)
		/*
		 * NOTE(review): none of these instructions appear to set
		 * condition flags, so "cc" looks conservative -- confirm.
		 */
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}
439
/*
 * Zero 48-bytes total at "p", SIMD: v0 is cleared, then one store-pair
 * covers bytes 0..31 and one single store covers bytes 32..47.
 */
__attribute__((always_inline))
static inline void
__sk_zero_48(void *p)
{
	/*
	 * Use 32-bytes store pair and 16-byte store using zeroed v0
	 * register on arm64; no need to save/restore registers on
	 * arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0 \n\t"
		"stp q0, q0, [%[p]] \n\t"
		"str q0, [%[p], #32] \n\t"
		:
		: [p] "r" (p)
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}
463
/*
 * Zero 128-bytes total at "p", SIMD: v0 is cleared, then four
 * store-pairs at offsets 0, 32, 64 and 96 cover the whole range.
 */
__attribute__((always_inline))
static inline void
__sk_zero_128(void *p)
{
	/*
	 * Use 4x 32-bytes store pairs using zeroed v0 register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 *
	 * Note that we could optimize this routine by utilizing "dc zva"
	 * which zeroes the entire cache line. However, that requires
	 * us to guarantee that the address is cache line aligned which
	 * we cannot (at the moment).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0 \n\t"
		"stp q0, q0, [%[p]] \n\t"
		"stp q0, q0, [%[p], #32] \n\t"
		"stp q0, q0, [%[p], #64] \n\t"
		"stp q0, q0, [%[p], #96] \n\t"
		:
		: [p] "r" (p)
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}
493 #else /* !__arm64__ */
/*
 * Non-arm64 fallbacks: each fixed-size zero routine is simply bzero()
 * with the matching length.
 *
 * Just use bzero() for simplicity. On x86_64, "rep stosb" microcoded
 * implementation already uses wider stores and can go much faster than
 * one byte per clock cycle. For arm32, bzero() is also good enough.
 */
#define __sk_zero_16(_p) bzero(_p, 16)
#define __sk_zero_32(_p) bzero(_p, 32)
#define __sk_zero_48(_p) bzero(_p, 48)
#define __sk_zero_128(_p) bzero(_p, 128)
503 #endif /* !__arm64__ */
504
/*
 * The following are optimized routines which rely on the caller
 * rounding up the source and destination buffers to multiples of
 * 4, 8, 32 or 64 bytes (matching the routine used), and on the
 * buffers being 64-bit aligned; they are faster than memcpy() for
 * lengths up to the copy threshold.
 *
 * Note: they do not support overlapping ranges.
 */
512
/*
 * Threshold (in bytes) as to when we use memcpy() rather than unrolled
 * copy: lengths above it go to memcpy(). x86_64 uses 2048; arm64 and
 * every other architecture use 1024.
 */
#if defined(__x86_64__)
#define SK_COPY_THRES 2048
#elif defined(__arm64__)
#define SK_COPY_THRES 1024
#else /* !__x86_64__ && !__arm64__ */
#define SK_COPY_THRES 1024
#endif /* !__x86_64__ && !__arm64__ */
523
/*
 * On DEVELOPMENT/DEBUG builds the copy routines compare against this
 * variable instead of the compile-time SK_COPY_THRES constant.
 */
#if (DEVELOPMENT || DEBUG)
extern size_t sk_copy_thres;
#endif /* (DEVELOPMENT || DEBUG) */
527
528 /*
529 * Scalar version, 4-bytes multiple.
530 */
531 __attribute__((always_inline))
532 static inline void
sk_copy64_4x(uint32_t * src,uint32_t * dst,size_t l)533 sk_copy64_4x(uint32_t *src, uint32_t *dst, size_t l)
534 {
535 #if (DEVELOPMENT || DEBUG)
536 if (__probable(l <= sk_copy_thres)) {
537 #else
538 if (__probable(l <= SK_COPY_THRES)) {
539 #endif /* (!DEVELOPMENT && !DEBUG! */
540 while ((ssize_t)(l -= 4) >= 0) {
541 *dst++ = *src++; /* [#n*4] */
542 }
543 } else {
544 (void) memcpy((void *)dst, (void *)src, l);
545 }
546 }
547
548 /*
549 * Scalar version, 8-bytes multiple.
550 */
551 __attribute__((always_inline))
552 static inline void
553 sk_copy64_8x(uint64_t *src, uint64_t *dst, size_t l)
554 {
555 #if (DEVELOPMENT || DEBUG)
556 if (__probable(l <= sk_copy_thres)) {
557 #else
558 if (__probable(l <= SK_COPY_THRES)) {
559 #endif /* (!DEVELOPMENT && !DEBUG! */
560 while ((ssize_t)(l -= 8) >= 0) {
561 *dst++ = *src++; /* [#n*8] */
562 }
563 } else {
564 (void) memcpy((void *)dst, (void *)src, l);
565 }
566 }
567
568 /*
569 * Scalar version (usually faster than SIMD), 32-bytes multiple.
570 */
571 __attribute__((always_inline))
572 static inline void
573 sk_copy64_32x(uint64_t *src, uint64_t *dst, size_t l)
574 {
575 #if (DEVELOPMENT || DEBUG)
576 if (__probable(l <= sk_copy_thres)) {
577 #else
578 if (__probable(l <= SK_COPY_THRES)) {
579 #endif /* (!DEVELOPMENT && !DEBUG! */
580 while ((ssize_t)(l -= 32) >= 0) {
581 *dst++ = *src++; /* [#0*8] */
582 *dst++ = *src++; /* [#1*8] */
583 *dst++ = *src++; /* [#2*8] */
584 *dst++ = *src++; /* [#3*8] */
585 }
586 } else {
587 (void) memcpy((void *)dst, (void *)src, l);
588 }
589 }
590
591 /*
592 * Scalar version (usually faster than SIMD), 64-bytes multiple.
593 */
594 __attribute__((always_inline))
595 static inline void
596 sk_copy64_64x(uint64_t *src, uint64_t *dst, size_t l)
597 {
598 #if (DEVELOPMENT || DEBUG)
599 if (__probable(l <= sk_copy_thres)) {
600 #else
601 if (__probable(l <= SK_COPY_THRES)) {
602 #endif /* (!DEVELOPMENT && !DEBUG! */
603 while ((ssize_t)(l -= 64) >= 0) {
604 *dst++ = *src++; /* [#0*8] */
605 *dst++ = *src++; /* [#1*8] */
606 *dst++ = *src++; /* [#2*8] */
607 *dst++ = *src++; /* [#3*8] */
608 *dst++ = *src++; /* [#4*8] */
609 *dst++ = *src++; /* [#5*8] */
610 *dst++ = *src++; /* [#6*8] */
611 *dst++ = *src++; /* [#7*8] */
612 }
613 } else {
614 (void) memcpy((void *)dst, (void *)src, l);
615 }
616 }
617
/*
 * Bind the public fixed-size copy and zero names to a scalar or SIMD
 * implementation. Only arm64 selects the SIMD copy variants; x86_64
 * and every other architecture share the same scalar bindings.
 */
#if defined(__arm64__)
#define sk_copy64_8     __sk_copy64_8   /* scalar only */
#define sk_copy32_8     __sk_copy32_8   /* scalar only */
#define sk_copy64_16    __sk_vcopy64_16 /* SIMD */
#define sk_copy32_16    __sk_vcopy32_16 /* SIMD */
#define sk_copy64_20    __sk_vcopy64_20 /* SIMD */
#define sk_copy64_24    __sk_vcopy64_24 /* SIMD */
#define sk_copy64_32    __sk_vcopy64_32 /* SIMD */
#define sk_copy32_32    __sk_vcopy32_32 /* SIMD */
#define sk_copy64_40    __sk_vcopy64_40 /* SIMD */
#define sk_zero_16      __sk_zero_16    /* SIMD */
#define sk_zero_32      __sk_zero_32    /* SIMD */
#define sk_zero_48      __sk_zero_48    /* SIMD */
#define sk_zero_128     __sk_zero_128   /* SIMD */
#else /* !__arm64__ (x86_64 and all other architectures) */
#define sk_copy64_8     __sk_copy64_8   /* scalar only */
#define sk_copy32_8     __sk_copy32_8   /* scalar only */
#define sk_copy64_16    __sk_copy64_16  /* scalar */
#define sk_copy32_16    __sk_copy32_16  /* scalar */
#define sk_copy64_20    __sk_copy64_20  /* scalar */
#define sk_copy64_24    __sk_copy64_24  /* scalar */
#define sk_copy64_32    __sk_copy64_32  /* scalar */
#define sk_copy32_32    __sk_copy32_32  /* scalar */
#define sk_copy64_40    __sk_copy64_40  /* scalar */
#define sk_zero_16      __sk_zero_16    /* scalar */
#define sk_zero_32      __sk_zero_32    /* scalar */
#define sk_zero_48      __sk_zero_48    /* scalar */
#define sk_zero_128     __sk_zero_128   /* scalar */
#endif /* !__arm64__ */
664
/*
 * Do not use these directly.
 * Use the skn_ variants if you need custom probe names.
 *
 * _sk_alloc_type: allocate one object of "type" through
 * kalloc_type_tag() with Z_ZERO ORed into "flags" and the tag taken
 * from (name)->tag, fire DTrace probe "probename" with the stringified
 * type, the flags and the result, and evaluate to the returned pointer.
 */
#define _sk_alloc_type(probename, type, flags, name) \
({ \
	void *ret; \
 \
	/* XXX Modify this to use KT_PRIV_ACCT later */ \
	ret = kalloc_type_tag(type, Z_ZERO | (flags), (name)->tag); \
	DTRACE_SKYWALK3(probename, char *, #type, int, (flags), \
	    void *, ret); \
	ret; \
})
679
/*
 * Allocate an array of "count" elements of "type" through
 * kalloc_type_tag() with Z_ZERO ORed into "flags", fire DTrace probe
 * "probename" with the type name, count, flags and result, and
 * evaluate to the returned pointer.
 */
#define _sk_alloc_type_array(probename, type, count, flags, name) \
({ \
	void *ret; \
 \
	ret = kalloc_type_tag(type, (count), Z_ZERO | (flags), \
	    (name)->tag); \
	DTRACE_SKYWALK4(probename, char *, #type, size_t, (count), \
	    int, (flags), void *, ret); \
	ret; \
})
690
/*
 * Allocate "size" bytes from the variable-size type heap "heap"
 * through kalloc_type_var_impl(), with Z_ZERO ORed into "flags" and
 * mixed with (name)->tag. The probe reports (heap)->kt_name + 5;
 * presumably this skips a fixed 5-character prefix of the heap name --
 * TODO confirm. Evaluates to the returned pointer.
 */
#define _sk_alloc_type_hash(probename, heap, size, flags, name) \
({ \
	void *ret; \
 \
	ret = kalloc_type_var_impl((heap), (size), \
	    __zone_flags_mix_tag((flags) | Z_ZERO, (name)->tag), NULL); \
	DTRACE_SKYWALK4(probename, char *, (heap)->kt_name + 5, \
	    size_t, (size), int, (flags), void *, ret); \
	ret; \
})
701
/*
 * Resize the array "elem" of "type" from "oldcount" to "newcount"
 * elements through krealloc_type_tag() with Z_ZERO ORed into "flags",
 * fire DTrace probe "probename" with the old pointer, both counts, the
 * flags and the result, and evaluate to the returned pointer.
 */
#define _sk_realloc_type_array(probename, type, oldcount, newcount, elem, flags, name) \
({ \
	void *ret; \
 \
	ret = krealloc_type_tag(type, (oldcount), (newcount), (elem), \
	    Z_ZERO | (flags), (name)->tag); \
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldcount), \
	    size_t, (newcount), int, (flags), void *, ret); \
	ret; \
})
712
/*
 * Allocate a header of type "htype" together with "count" elements of
 * "type" through the (htype, type, count) form of kalloc_type_tag(),
 * with Z_ZERO ORed into "flags"; fire DTrace probe "probename" with
 * both type names, the count, the flags and the result, and evaluate
 * to the returned pointer.
 */
#define _sk_alloc_type_header_array(probename, htype, type, count, flags, name) \
({ \
	void *ret; \
 \
	ret = kalloc_type_tag(htype, type, (count), Z_ZERO | (flags), \
	    (name)->tag); \
	DTRACE_SKYWALK5(probename, char *, #htype, char *, #type, \
	    size_t, (count), int, (flags), void *, ret); \
	ret; \
})
723
/*
 * Fire DTrace probe "probename" with the stringified type and the
 * pointer, then release "elem" through kfree_type().
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement; a bare brace block followed by ';' breaks
 * "if (c) sk_free_type(...); else ...".
 */
#define _sk_free_type(probename, type, elem) \
do { \
	DTRACE_SKYWALK2(probename, char *, #type, void *, (elem)); \
	kfree_type(type, (elem)); \
} while (0)
729
/*
 * Fire DTrace probe "probename" with the stringified type, the element
 * count and the pointer, then release the "count"-element array "elem"
 * through kfree_type().
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement and composes with if/else.
 */
#define _sk_free_type_array(probename, type, count, elem) \
do { \
	DTRACE_SKYWALK3(probename, char *, #type, size_t, (count), \
	    void *, (elem)); \
	kfree_type(type, (count), (elem)); \
} while (0)
736
/*
 * Fire DTrace probe "probename" with (heap)->kt_name + 5, the size and
 * the pointer, then release "elem" ("size" bytes) back to the
 * variable-size type heap "heap" through kfree_type_var_impl().
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement and composes with if/else.
 */
#define _sk_free_type_hash(probename, heap, size, elem) \
do { \
	DTRACE_SKYWALK3(probename, char *, (heap)->kt_name + 5, \
	    size_t, (size), void *, (elem)); \
	kfree_type_var_impl((heap), (elem), (size)); \
} while (0)
743
/*
 * Fire DTrace probe "probename" with both stringified types, the
 * element count and the pointer, then release the header-plus-array
 * allocation "elem" through the (htype, type, count) form of
 * kfree_type().
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement and composes with if/else.
 */
#define _sk_free_type_header_array(probename, htype, type, count, elem) \
do { \
	DTRACE_SKYWALK4(probename, char *, #htype, char *, #type, \
	    size_t, (count), void *, (elem)); \
	kfree_type(htype, type, (count), (elem)); \
} while (0)
750
/*
 * Allocate "size" bytes of untyped data through kalloc_data_tag() with
 * Z_ZERO ORed into "flags" and the tag taken from (name)->tag, fire
 * DTrace probe "probename" with the size, flags and result, and
 * evaluate to the returned pointer.
 */
#define _sk_alloc_data(probename, size, flags, name) \
({ \
	void *ret; \
 \
	ret = kalloc_data_tag((size), Z_ZERO | (flags), (name)->tag); \
	DTRACE_SKYWALK3(probename, size_t, (size), int, (flags), \
	    void *, ret); \
	ret; \
})
760
/*
 * Resize the data buffer "elem" from "oldsize" to "newsize" bytes
 * through krealloc_data_tag() with Z_ZERO ORed into "flags", fire
 * DTrace probe "probename" with the old pointer, both sizes, the flags
 * and the result, and evaluate to the returned pointer.
 */
#define _sk_realloc_data(probename, elem, oldsize, newsize, flags, name) \
({ \
	void *ret; \
 \
	ret = krealloc_data_tag((elem), (oldsize), (newsize), \
	    Z_ZERO | (flags), (name)->tag); \
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldsize), \
	    size_t, (newsize), int, (flags), void *, ret); \
	ret; \
})
771
/*
 * Fire DTrace probe "probename" with the pointer and the size, then
 * release the "size"-byte data buffer "elem" through kfree_data().
 *
 * Wrapped in do { } while (0) so that the macro behaves as a single
 * statement and composes with if/else.
 */
#define _sk_free_data(probename, elem, size) \
do { \
	DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size)); \
	kfree_data((elem), (size)); \
} while (0)
777
/*
 * Public allocation/free wrappers. Each one forwards its arguments to
 * the matching _sk_* macro and passes its own name as the DTrace probe
 * name.
 */
#define sk_alloc_type(type, flags, tag) \
	_sk_alloc_type(sk_alloc_type, type, flags, tag)

#define sk_alloc_type_array(type, count, flags, tag) \
	_sk_alloc_type_array(sk_alloc_type_array, type, count, flags, tag)

#define sk_alloc_type_hash(heap, size, flags, tag) \
	_sk_alloc_type_hash(sk_alloc_type_hash, heap, size, flags, tag)

#define sk_alloc_type_header_array(htype, type, count, flags, tag) \
	_sk_alloc_type_header_array(sk_alloc_type_header_array, htype, \
	type, count, flags, tag)

#define sk_realloc_type_array(type, oldsize, newsize, elem, flags, tag) \
	_sk_realloc_type_array(sk_realloc_type_array, type, \
	oldsize, newsize, elem, flags, tag)

#define sk_free_type(type, elem) \
	_sk_free_type(sk_free_type, type, elem)

#define sk_free_type_array(type, count, elem) \
	_sk_free_type_array(sk_free_type_array, type, count, elem)

#define sk_free_type_hash(heap, size, elem) \
	_sk_free_type_hash(sk_free_type_hash, heap, size, elem)

#define sk_free_type_header_array(htype, type, count, elem) \
	_sk_free_type_header_array(sk_free_type_header_array, htype, \
	type, count, elem)

#define sk_alloc_data(size, flags, tag) \
	_sk_alloc_data(sk_alloc_data, size, flags, tag)

#define sk_realloc_data(elem, oldsize, newsize, flags, tag) \
	_sk_realloc_data(sk_realloc_data, elem, oldsize, newsize, \
	flags, tag)

#define sk_free_data(elem, size) \
	_sk_free_data(sk_free_data, elem, size)
817
/*
 * The skn_ variants are meant to be used if you need to use two or more
 * of the same call within the same function and you want the dtrace
 * probename to be different at each callsite.
 *
 * Each variant pastes "name" onto the base probe name and forwards the
 * remaining arguments to the matching _sk_* macro.
 */
/*
 * NOTE(review): skn_realloc expands to _sk_realloc, which is not
 * defined anywhere in the visible part of this header; any use would
 * fail to compile unless it is provided elsewhere -- confirm whether
 * this macro is stale.
 */
#define skn_realloc(name, elem, oldsize, newsize, flags, tag) \
	_sk_realloc(sk_realloc_ ## name, elem, oldsize, newsize, flags, \
	tag)

#define skn_alloc_type(name, type, flags, tag) \
	_sk_alloc_type(sk_alloc_type_ ## name, type, flags, tag)

#define skn_alloc_type_array(name, type, count, flags, tag) \
	_sk_alloc_type_array(sk_alloc_type_array_ ## name, type, count, \
	flags, tag)

#define skn_alloc_type_hash(name, heap, size, flags, tag) \
	_sk_alloc_type_hash(sk_alloc_type_hash_ ## name, heap, size, \
	flags, tag)

#define skn_alloc_type_header_array(name, htype, type, count, flags, tag) \
	_sk_alloc_type_header_array(sk_alloc_type_header_array_ ## name, \
	htype, type, count, flags, tag)

#define skn_free_type(name, type, elem) \
	_sk_free_type(sk_free_type_ ## name, type, elem)

#define skn_free_type_array(name, type, count, elem) \
	_sk_free_type_array(sk_free_type_array_ ## name, type, count, \
	elem)

#define skn_free_type_hash(name, heap, size, elem) \
	_sk_free_type_hash(sk_free_type_hash_ ## name, heap, size, elem)

#define skn_free_type_header_array(name, htype, type, count, elem) \
	_sk_free_type_header_array(sk_free_type_header_array_ ## name, \
	htype, type, count, elem)

#define skn_alloc_data(name, size, flags, tag) \
	_sk_alloc_data(sk_alloc_data_ ## name, size, flags, tag)

#define skn_realloc_data(name, elem, oldsize, newsize, flags, tag) \
	_sk_realloc_data(sk_realloc_data_ ## name, elem, oldsize, newsize,\
	flags, tag)

#define skn_free_data(name, elem, size) \
	_sk_free_data(sk_free_data_ ## name, elem, size)
865
/*
 * Startup-time description of one allocation tag, as built by
 * SKMEM_TAG_DEFINE() and handed to __sk_tag_make().
 */
struct sk_tag_spec {
	kern_allocation_name_t *skt_var;        /* address of the tag variable */
	const char *skt_name;                   /* name given to the tag */
};

/* Startup callback registered by SKMEM_TAG_DEFINE() for each spec. */
extern void __sk_tag_make(const struct sk_tag_spec *spec);
872
/*
 * Define the allocation tag variable "var" plus a startup-data
 * sk_tag_spec record pointing at it and carrying "name", and register
 * __sk_tag_make() to be invoked with that record via
 * STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, ...).
 */
#define SKMEM_TAG_DEFINE(var, name) \
	SECURITY_READ_ONLY_LATE(kern_allocation_name_t) var; \
	__startup_data struct sk_tag_spec __sktag_##var = { \
	    .skt_var = &var, .skt_name = name, \
	}; \
	STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, __sk_tag_make, &__sktag_##var)
879
/*!
 * @abstract Masked comparison of two n-byte buffers, scalar version.
 *
 * @discussion
 * Every byte of src1 is XORed with the matching byte of src2 and the
 * difference is ANDed with the matching mask byte; all results are
 * ORed together. The return value is therefore zero when the buffers
 * agree on every masked bit and non-zero otherwise.
 * Zero-length buffers always compare equal.
 *
 * @param src1 first input buffer, n bytes long
 * @param src2 second input buffer, n bytes long
 * @param byte_mask byte mask, n bytes long, applied before comparison
 * @param n number of bytes
 */
static inline int
__sk_memcmp_mask_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask, size_t n)
{
	uint32_t diff = 0;
	size_t left = n;

	while (left-- > 0) {
		diff |= (*src1 ^ *src2) & *byte_mask;
		src1++;
		src2++;
		byte_mask++;
	}
	return diff;
}
904
/* Fixed-length (16 bytes) form of __sk_memcmp_mask_scalar(). */
static inline int
__sk_memcmp_mask_16B_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 16);
}
911
/* Fixed-length (32 bytes) form of __sk_memcmp_mask_scalar(). */
static inline int
__sk_memcmp_mask_32B_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 32);
}
918
/* Fixed-length (48 bytes) form of __sk_memcmp_mask_scalar(). */
static inline int
__sk_memcmp_mask_48B_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 48);
}
925
/* Fixed-length (64 bytes) form of __sk_memcmp_mask_scalar(). */
static inline int
__sk_memcmp_mask_64B_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 64);
}
932
/* Fixed-length (80 bytes) form of __sk_memcmp_mask_scalar(). */
static inline int
__sk_memcmp_mask_80B_scalar(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 80);
}
939
940 #if defined(__arm64__) || defined(__arm__) || defined(__x86_64__)
/*
 * Fixed-length masked compare routines implemented outside this
 * header (16, 32, 48, 64 and 80 bytes). They take the same
 * (src1, src2, byte_mask) arguments as the scalar helpers in this file.
 */
extern int os_memcmp_mask_16B(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask);
extern int os_memcmp_mask_32B(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask);
extern int os_memcmp_mask_48B(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask);
extern int os_memcmp_mask_64B(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask);
extern int os_memcmp_mask_80B(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask);
951
/*
 * On arm64, arm and x86_64, bind the public names to the external
 * os_memcmp_mask_*() routines (fixed lengths) and to
 * __sk_memcmp_mask() (variable length).
 */
#define sk_memcmp_mask __sk_memcmp_mask
#define sk_memcmp_mask_16B os_memcmp_mask_16B
#define sk_memcmp_mask_32B os_memcmp_mask_32B
#define sk_memcmp_mask_48B os_memcmp_mask_48B
#define sk_memcmp_mask_64B os_memcmp_mask_64B
#define sk_memcmp_mask_80B os_memcmp_mask_80B
961
/*!
 * @abstract Masked comparison of two n-byte buffers, built on the
 * fixed-length sk_memcmp_mask_{64,32,16}B routines.
 *
 * @discussion
 * Returns zero if the two buffers are identical after applying the byte
 * masks, otherwise non-zero.
 * Zero-length buffers are always identical.
 *
 * The buffers are consumed in 64-, then 32-, then 16-byte chunks. A
 * leftover shorter than 16 bytes is handled by re-comparing the final
 * 16 bytes of the buffers (overlapping bytes already checked) when
 * n >= 16, or by a byte loop when n < 16.
 *
 * @param src1 first input buffer, n bytes long
 * @param src2 second input buffer, n bytes long
 * @param byte_mask byte mask, n bytes long, applied before comparison
 * @param n number of bytes
 */
static inline int
__sk_memcmp_mask(const uint8_t *src1, const uint8_t *src2,
    const uint8_t *byte_mask, size_t n)
{
	uint32_t diff = 0;
	size_t off = 0;

	while (off + 64 <= n) {
		diff |= sk_memcmp_mask_64B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 64;
	}
	while (off + 32 <= n) {
		diff |= sk_memcmp_mask_32B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 32;
	}
	while (off + 16 <= n) {
		diff |= sk_memcmp_mask_16B(src1 + off, src2 + off,
		    byte_mask + off);
		off += 16;
	}

	/* Everything was covered by whole chunks. */
	if (!(off < n)) {
		return diff;
	}

	if (n < 16) {
		/* Too short for the vector routine: finish byte by byte. */
		while (off < n) {
			diff |= (src1[off] ^ src2[off]) & byte_mask[off];
			off++;
		}
	} else {
		/* Re-check the last 16 bytes, overlapping the previous chunk. */
		diff |= sk_memcmp_mask_16B(src1 + n - 16,
		    src2 + n - 16, byte_mask + n - 16);
	}
	return diff;
}
1008 #else /* !(__arm64__ || __arm__ || __x86_64__) */
/*
 * Use scalar variants elsewhere: on architectures other than arm64,
 * arm and x86_64 the public names map straight onto the scalar helpers.
 */
#define sk_memcmp_mask __sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B __sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B __sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B __sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B __sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B __sk_memcmp_mask_80B_scalar
1018 #endif /* !(__arm64__ || __arm__ || __x86_64__) */
1019
/*
 * Scalar variants are available on all platforms if needed; these
 * names always map to the scalar helpers, whichever implementation
 * the plain sk_memcmp_mask* names were bound to.
 */
#define sk_memcmp_mask_scalar __sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B_scalar __sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B_scalar __sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B_scalar __sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B_scalar __sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B_scalar __sk_memcmp_mask_80B_scalar
1029
1030 #endif /* KERNEL */
1031 #endif /* PRIVATE || BSD_KERNEL_PRIVATE */
1032 #endif /* !_SKYWALK_COMMON_H_ */
1033