/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */ 36 37#ifdef KERNEL 38#include <arm64/asm.h> 39 40#define CKSUM_ERR _kprintf 41#else 42#ifndef LIBSYSCALL_INTERFACE 43#error "LIBSYSCALL_INTERFACE not defined" 44#endif /* !LIBSYSCALL_INTERFACE */ 45#define CKSUM_ERR _fprintf_stderr 46#endif /* !KERNEL */ 47 48/* 49 * XXX: [email protected]: 50 * 51 * Ugly, but we have little choice, since relying on genassym and <assym.s> 52 * is not possible unless this code lives in osfmk. Note also that this 53 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be 54 * authentic; it only cares about 3 fields. 55 */ 56#if defined(__LP64__) 57#define M_NEXT 0 58#define M_DATA 16 // 8-byte address, would be aligned to 8-byte boundary 59#define M_LEN 24 60#else 61#define M_NEXT 0 62#define M_DATA 8 63#define M_LEN 12 64#endif 65 66 .globl _os_cpu_in_cksum_mbuf 67 .text 68 .align 4 69_os_cpu_in_cksum_mbuf: 70 71 72/* 73 * 64-bit version. 74 * 75 * This function returns the partial 16-bit checksum accumulated in 76 * a 32-bit variable (withouth 1's complement); caller is responsible 77 * for folding the 32-bit sum into 16-bit and performinng the 1's 78 * complement if applicable 79 */ 80 81/* 82 * uint32_t 83 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum) 84 * { 85 * int mlen; 86 * uint64_t sum, partial; 87 * unsigned int final_acc; 88 * uint8_t *data; 89 * boolean_t needs_swap, started_on_odd; 90 * 91 * VERIFY(len >= 0); 92 * VERIFY(off >= 0); 93 * 94 * needs_swap = FALSE; 95 * started_on_odd = FALSE; 96 * sum = initial_sum; 97 */ 98 99 #define m x0 100 #define len x1 101 #define off x2 102 #define sum x3 103 #define needs_swap x4 104 #define started_on_odd x5 105 #define mlen x6 106 #define Wmlen w6 107 #define t x7 108 #define data x8 109#if defined(__LP64__) 110 #define ptr_m x0 111 #define ptr_data x8 112#else 113 #define ptr_m w0 114 #define ptr_data w8 115#endif 116 117 118#ifdef KERNEL 119 ARM64_PROLOG 120#endif /* KERNEL */ 121 mov needs_swap, #0 // needs_swap = 
FALSE; 122 mov started_on_odd, #0 // started_on_odd = FALSE; 123 mov w3, w3 // clear higher half 124 125 126/* 127 * for (;;) { 128 * if (PREDICT_FALSE(m == NULL)) { 129 * CKSUM_ERR("%s: out of data\n", __func__); 130 * return (-1); 131 * } 132 * mlen = m->m_len; 133 * if (mlen > off) { 134 * mlen -= off; 135 * data = mtod(m, uint8_t *) + off; 136 * goto post_initial_offset; 137 * } 138 * off -= mlen; 139 * if (len == 0) 140 * break; 141 * m = m->m_next; 142 * } 143 */ 144 1450: 146 cbz m, Lin_cksum_whoops // if (m == NULL) return -1; 147 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; 148 cmp mlen, off 149 b.le 1f 150 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) 151 sub mlen, mlen, off // mlen -= off; 152 add data, data, off // data = mtod(m, uint8_t *) + off; 153 b L_post_initial_offset 1541: 155 sub off, off, mlen 156 cbnz len, 2f 157 mov x0, x3 158 ret lr 1592: 160 ldr ptr_m, [m, #M_NEXT] 161 b 0b 162 163L_loop: // for (; len > 0; m = m->m_next) { 164/* 165 * if (PREDICT_FALSE(m == NULL)) { 166 * CKSUM_ERR("%s: out of data\n", __func__); 167 * return (-1); 168 * } 169 * mlen = m->m_len; 170 * data = mtod(m, uint8_t *); 171 */ 172 cbz m, Lin_cksum_whoops // if (m == NULL) return -1; 173 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; 174 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) 175 176L_post_initial_offset: 177/* 178 * if (mlen == 0) continue; 179 * if (mlen > len) mlen = len; 180 * len -= mlen; 181 */ 182 183 cbz mlen, L_continue 184 cmp mlen, len 185 csel mlen, mlen, len, le 186 sub len, len, mlen 187 188/* 189 * partial = 0; 190 * if ((uintptr_t)data & 1) { 191 * started_on_odd = !started_on_odd; 192 * partial = *data << 8; 193 * ++data; 194 * --mlen; 195 * } 196 * needs_swap = started_on_odd; 197 */ 198 199 tst data, #1 200 mov x7, #0 201 mov x10, #0 202 b.eq 1f 203 ldrb w7, [data], #1 204 eor started_on_odd, started_on_odd, #1 205 sub mlen, mlen, #1 206 lsl w7, w7, #8 2071: 208 209 210/* 211 * if ((uintptr_t)data & 2) { 212 * if (mlen < 2) 213 * goto 
trailing_bytes; 214 * partial += *(uint16_t *)(void *)data; 215 * data += 2; 216 * mlen -= 2; 217 * } 218 */ 219 tst data, #2 220 mov needs_swap, started_on_odd 221 b.eq 1f 222 cmp mlen, #2 223 b.lt L_trailing_bytes 224 ldrh w9, [data], #2 225 sub mlen, mlen, #2 226 add w7, w7, w9 2271: 228 229/* 230 * if ((uintptr_t)data & 4) { 231 * if (mlen < 4) 232 * goto L2_bytes; 233 * partial += *(uint32_t *)(void *)data; 234 * data += 4; 235 * mlen -= 4; 236 * } 237 */ 238 // align on 8-bytes boundary if applicable 239 tst data, #4 240 b.eq 1f 241 cmp mlen, #4 242 b.lt L2_bytes 243 ldr w9, [data], #4 244 sub mlen, mlen, #4 245 adds w7, w7, w9 246 adc x7, x7, x10 // assumes x10 still is #0 as set above 2471: 248 249/* 250 * while (mlen >= 64) { 251 * __builtin_prefetch(data + 32); 252 * __builtin_prefetch(data + 64); 253 * partial += *(uint32_t *)(void *)data; 254 * partial += *(uint32_t *)(void *)(data + 4); 255 * partial += *(uint32_t *)(void *)(data + 8); 256 * partial += *(uint32_t *)(void *)(data + 12); 257 * partial += *(uint32_t *)(void *)(data + 16); 258 * partial += *(uint32_t *)(void *)(data + 20); 259 * partial += *(uint32_t *)(void *)(data + 24); 260 * partial += *(uint32_t *)(void *)(data + 28); 261 * partial += *(uint32_t *)(void *)(data + 32); 262 * partial += *(uint32_t *)(void *)(data + 36); 263 * partial += *(uint32_t *)(void *)(data + 40); 264 * partial += *(uint32_t *)(void *)(data + 44); 265 * partial += *(uint32_t *)(void *)(data + 48); 266 * partial += *(uint32_t *)(void *)(data + 52); 267 * partial += *(uint32_t *)(void *)(data + 56); 268 * partial += *(uint32_t *)(void *)(data + 60); 269 * data += 64; 270 * mlen -= 64; 271 * // if (PREDICT_FALSE(partial & (3ULL << 62))) { 272 * // if (needs_swap) 273 * // partial = (partial << 8) + 274 * // (partial >> 56); 275 * // sum += (partial >> 32); 276 * // sum += (partial & 0xffffffff); 277 * // partial = 0; 278 * // } 279 * } 280*/ 281 282 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next 
283 subs mlen, mlen, #64 284 b.lt L32_bytes 285 286 // save used vector registers 287 sub sp, sp, #8*16 288 mov x11, sp 289 st1.4s {v0, v1, v2, v3}, [x11], #4*16 290 st1.4s {v4, v5, v6, v7}, [x11], #4*16 291 292 // spread partial into 8 8-byte registers in v0-v3 293 fmov s3, w7 294 eor.16b v0, v0, v0 295 eor.16b v1, v1, v1 296 eor.16b v2, v2, v2 297 298 // load the 1st 64 bytes (16 32-bit words) 299 ld1.4s {v4,v5,v6,v7},[data],#64 300 301 // branch to finish off if mlen<64 302 subs mlen, mlen, #64 303 b.lt L64_finishup 304 305 /* 306 * loop for loading and accumulating 16 32-bit words into 307 * 8 8-byte accumulators per iteration. 308 */ 309L64_loop: 310 subs mlen, mlen, #64 // mlen -= 64 311 312 uadalp.2d v0, v4 313 ld1.4s {v4},[data], #16 314 315 uadalp.2d v1, v5 316 ld1.4s {v5},[data], #16 317 318 uadalp.2d v2, v6 319 ld1.4s {v6},[data], #16 320 321 uadalp.2d v3, v7 322 ld1.4s {v7},[data], #16 323 324 b.ge L64_loop 325 326L64_finishup: 327 uadalp.2d v0, v4 328 uadalp.2d v1, v5 329 uadalp.2d v2, v6 330 uadalp.2d v3, v7 331 332 add.2d v0, v0, v1 333 add.2d v2, v2, v3 334 addp.2d d0, v0 335 addp.2d d2, v2 336 add.2d v0, v0, v2 337 fmov x7, d0 // partial in x7 now 338 339 // restore used vector registers 340 ld1.4s {v0, v1, v2, v3}, [sp], #4*16 341 ld1.4s {v4, v5, v6, v7}, [sp], #4*16 342 343L32_bytes: 344 tst mlen, #32 345 b.eq L16_bytes 346 ldp x9, x10, [data], #16 347 ldp x11, x12, [data], #16 348 adds x7, x7, x9 349 mov x9, #0 350 adcs x7, x7, x10 351 adcs x7, x7, x11 352 adcs x7, x7, x12 353 adc x7, x7, x9 354 355L16_bytes: 356 tst mlen, #16 357 b.eq L8_bytes 358 ldp x9, x10, [data], #16 359 adds x7, x7, x9 360 mov x9, #0 361 adcs x7, x7, x10 362 adc x7, x7, x9 363 364L8_bytes: 365 tst mlen, #8 366 mov x10, #0 367 b.eq L4_bytes 368 ldr x9,[data],#8 369 adds x7, x7, x9 370 adc x7, x7, x10 371 372L4_bytes: 373 tst mlen, #4 374 b.eq L2_bytes 375 ldr w9,[data],#4 376 adds x7, x7, x9 377 adc x7, x7, x10 378 379L2_bytes: 380 tst mlen, #2 381 b.eq L_trailing_bytes 
382 ldrh w9,[data],#2 383 adds x7, x7, x9 384 adc x7, x7, x10 385 386L_trailing_bytes: 387 tst mlen, #1 388 b.eq L0_bytes 389 ldrb w9,[data],#1 390 adds x7, x7, x9 391 adc x7, x7, x10 392 eor started_on_odd, started_on_odd, #1 393 394L0_bytes: 395/* 396 * if (needs_swap) 397 * partial = (partial << 8) + (partial >> 56); 398 */ 399 cbz needs_swap, 1f 400 ror x7, x7, #56 4011: 402/* 403 * sum += (partial >> 32) + (partial & 0xffffffff); 404 * sum = (sum >> 32) + (sum & 0xffffffff); 405 * } 406 */ 407 408 add x3, x3, x7, lsr #32 409 mov w7, w7 410 add x3, x3, x7 411 mov w7, w3 412 add x3, x7, x3, lsr #32 413 414L_continue: 415 cmp len, #0 416 ldr ptr_m, [m, #M_NEXT] // m = m->m_next 417 b.gt L_loop 418 419/* 420 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + 421 * ((sum >> 16) & 0xffff) + (sum & 0xffff); 422 * final_acc = (final_acc >> 16) + (final_acc & 0xffff); 423 * final_acc = (final_acc >> 16) + (final_acc & 0xffff); 424 * return (final_acc & 0xffff); 425 * } 426 */ 427 428 mov w4, #0x00ffff 429 and x0, x4, x3, lsr #48 430 and x1, x4, x3, lsr #32 431 and x2, x4, x3, lsr #16 432 and x3, x4, x3 433 add w0, w0, w1 434 add w2, w2, w3 435 add w0, w0, w2 436 and w1, w4, w0, lsr #16 437 and w0, w4, w0 438 add w0, w0, w1 439 and w1, w4, w0, lsr #16 440 and w0, w4, w0 441 add w0, w0, w1 442 /* 443 * If we were to 1's complement it (XOR with 0xffff): 444 * 445 * eor w0, w0, w4 446 */ 447 and w0, w0, w4 448 449 ret lr 450 451Lin_cksum_whoops: 452 adrp x0, Lin_cksum_whoops_str@page 453 add x0, x0, Lin_cksum_whoops_str@pageoff 454 bl #CKSUM_ERR 455 mov x0, #-1 456 ret lr 457 458Lin_cksum_whoops_str: 459 .asciz "os_cpu_in_cksum_mbuf: out of data\n" 460 .align 5 461