/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

#ifdef KERNEL
#define CKSUM_ERR _kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define CKSUM_ERR _fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: [email protected]:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
 * authentic; it only cares about 3 fields: m_next, m_data and m_len.
 */
#if defined(__LP64__)
#define M_NEXT  0
#define M_DATA  16      // 8-byte address, would be aligned to 8-byte boundary
#define M_LEN   24
#else
#define M_NEXT  0
#define M_DATA  8
#define M_LEN   12
#endif

        .globl  _os_cpu_in_cksum_mbuf
        .text
        .align  4
_os_cpu_in_cksum_mbuf:

/*
 * 64-bit version.
 *
 * This function returns the partial 16-bit checksum accumulated in
 * a 32-bit variable (without 1's complement); caller is responsible
 * for folding the 32-bit sum into 16-bit and performing the 1's
 * complement if applicable.
 *
 * ABI (AAPCS64 / Apple arm64):
 *   In:   x0 = m (struct mbuf *), w1 = len, w2 = off, w3 = initial_sum
 *   Out:  x0 = partial sum (or -1 on premature end of chain)
 *   Clobbers: x1-x12, flags; v0-v7 are saved/restored around the
 *   vector loop.  Leaf except for the error path, which makes a frame.
 */

/*
 * uint32_t
 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
 * {
 *      int mlen;
 *      uint64_t sum, partial;
 *      unsigned int final_acc;
 *      uint8_t *data;
 *      boolean_t needs_swap, started_on_odd;
 *
 *      VERIFY(len >= 0);
 *      VERIFY(off >= 0);
 *
 *      needs_swap = FALSE;
 *      started_on_odd = FALSE;
 *      sum = initial_sum;
 */

        #define m               x0
        #define len             x1
        #define off             x2
        #define sum             x3
        #define needs_swap      x4
        #define started_on_odd  x5
        #define mlen            x6
        #define Wmlen           w6
        #define t               x7
        #define data            x8
#if defined(__LP64__)
        #define ptr_m           x0
        #define ptr_data        x8
#else
        #define ptr_m           w0
        #define ptr_data        w8
#endif

        mov     needs_swap, #0          // needs_swap = FALSE;
        mov     started_on_odd, #0      // started_on_odd = FALSE;
        /*
         * len and off arrive as 32-bit ints but are used below as full
         * 64-bit registers; per the arm64 ABI the caller leaves their
         * upper 32 bits unspecified, so zero-extend them explicitly
         * (both are VERIFY'd non-negative, so zero-extension is exact).
         */
        mov     w1, w1                  // zero-extend len into x1
        mov     w2, w2                  // zero-extend off into x2
        mov     w3, w3                  // clear higher half of initial_sum

/*
 * for (;;) {
 *      if (PREDICT_FALSE(m == NULL)) {
 *              CKSUM_ERR("%s: out of data\n", __func__);
 *              return (-1);
 *      }
 *      mlen = m->m_len;
 *      if (mlen > off) {
 *              mlen -= off;
 *              data = mtod(m, uint8_t *) + off;
 *              goto post_initial_offset;
 *      }
 *      off -= mlen;
 *      if (len == 0)
 *              break;
 *      m = m->m_next;
 * }
 */

0:
        cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len; (zero-extends)
        cmp     mlen, off
        b.le    1f                      // this mbuf entirely skipped by off
        ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)
        sub     mlen, mlen, off         // mlen -= off;
        add     data, data, off         // data = mtod(m, uint8_t *) + off;
        b       L_post_initial_offset
1:
        sub     off, off, mlen          // consume this mbuf's bytes from off
        cbnz    len, 2f
        /*
         * NOTE(review): when len == 0 here the raw 32-bit running sum is
         * returned without the final 16-bit fold done on the normal exit
         * path below; the C reference folds in both cases.  Callers fold
         * again anyway, so this is preserved as-is — confirm intent.
         */
        mov     x0, x3
        ret     lr
2:
        ldr     ptr_m, [m, #M_NEXT]     // m = m->m_next
        b       0b

L_loop: // for (; len > 0; m = m->m_next) {
/*
 *      if (PREDICT_FALSE(m == NULL)) {
 *              CKSUM_ERR("%s: out of data\n", __func__);
 *              return (-1);
 *      }
 *      mlen = m->m_len;
 *      data = mtod(m, uint8_t *);
 */
        cbz     m, Lin_cksum_whoops     // if (m == NULL) return -1;
        ldr     Wmlen, [m, #M_LEN]      // mlen = m->m_len;
        ldr     ptr_data, [m, #M_DATA]  // mtod(m, uint8_t *)

L_post_initial_offset:
/*
 *      if (mlen == 0) continue;
 *      if (mlen > len) mlen = len;
 *      len -= mlen;
 */
        cbz     mlen, L_continue
        cmp     mlen, len
        csel    mlen, mlen, len, le     // mlen = min(mlen, len)
        sub     len, len, mlen

/*
 *      partial = 0;
 *      if ((uintptr_t)data & 1) {
 *              started_on_odd = !started_on_odd;
 *              partial = *data << 8;
 *              ++data;
 *              --mlen;
 *      }
 *      needs_swap = started_on_odd;
 */
        tst     data, #1
        mov     x7, #0                  // partial = 0
        mov     x10, #0                 // constant 0 for later adc's
        b.eq    1f
        ldrb    w7, [data], #1          // odd address: take one byte
        eor     started_on_odd, started_on_odd, #1
        sub     mlen, mlen, #1
        lsl     w7, w7, #8              // partial = *data << 8 (big-endian sum)
1:

/*
 *      if ((uintptr_t)data & 2) {
 *              if (mlen < 2)
 *                      goto trailing_bytes;
 *              partial += *(uint16_t *)(void *)data;
 *              data += 2;
 *              mlen -= 2;
 *      }
 */
        tst     data, #2
        mov     needs_swap, started_on_odd
        b.eq    1f
        cmp     mlen, #2
        b.lt    L_trailing_bytes
        ldrh    w9, [data], #2          // align data to 4 bytes
        sub     mlen, mlen, #2
        add     w7, w7, w9              // cannot overflow 32 bits here
1:

/*
 *      if ((uintptr_t)data & 4) {
 *              if (mlen < 4)
 *                      goto L2_bytes;
 *              partial += *(uint32_t *)(void *)data;
 *              data += 4;
 *              mlen -= 4;
 *      }
 */
        // align on 8-bytes boundary if applicable
        tst     data, #4
        b.eq    1f
        cmp     mlen, #4
        b.lt    L2_bytes
        ldr     w9, [data], #4
        sub     mlen, mlen, #4
        adds    w7, w7, w9
        adc     x7, x7, x10             // assumes x10 still is #0 as set above
1:

/*
 *      while (mlen >= 64) {
 *              __builtin_prefetch(data + 32);
 *              __builtin_prefetch(data + 64);
 *              partial += *(uint32_t *)(void *)data;
 *              partial += *(uint32_t *)(void *)(data + 4);
 *              partial += *(uint32_t *)(void *)(data + 8);
 *              partial += *(uint32_t *)(void *)(data + 12);
 *              partial += *(uint32_t *)(void *)(data + 16);
 *              partial += *(uint32_t *)(void *)(data + 20);
 *              partial += *(uint32_t *)(void *)(data + 24);
 *              partial += *(uint32_t *)(void *)(data + 28);
 *              partial += *(uint32_t *)(void *)(data + 32);
 *              partial += *(uint32_t *)(void *)(data + 36);
 *              partial += *(uint32_t *)(void *)(data + 40);
 *              partial += *(uint32_t *)(void *)(data + 44);
 *              partial += *(uint32_t *)(void *)(data + 48);
 *              partial += *(uint32_t *)(void *)(data + 52);
 *              partial += *(uint32_t *)(void *)(data + 56);
 *              partial += *(uint32_t *)(void *)(data + 60);
 *              data += 64;
 *              mlen -= 64;
 *              // if (PREDICT_FALSE(partial & (3ULL << 62))) {
 *              //      if (needs_swap)
 *              //              partial = (partial << 8) +
 *              //                  (partial >> 56);
 *              //      sum += (partial >> 32);
 *              //      sum += (partial & 0xffffffff);
 *              //      partial = 0;
 *              // }
 *      }
 */

        // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
        subs    mlen, mlen, #64
        b.lt    L32_bytes

        // save used vector registers (kernel context: preserve FP/SIMD state)
        sub     sp, sp, #8*16
        mov     x11, sp
        st1.4s  {v0, v1, v2, v3}, [x11], #4*16
        st1.4s  {v4, v5, v6, v7}, [x11], #4*16

        // spread partial into 8 8-byte accumulators in v0-v3
        fmov    s3, w7                  // v3.lane0 = partial, rest zeroed
        eor.16b v0, v0, v0
        eor.16b v1, v1, v1
        eor.16b v2, v2, v2

        // load the 1st 64 bytes (16 32-bit words)
        ld1.4s  {v4,v5,v6,v7},[data],#64

        // branch to finish off if mlen<64
        subs    mlen, mlen, #64
        b.lt    L64_finishup

        /*
         * loop for loading and accumulating 16 32-bit words into
         * 8 8-byte accumulators per iteration.
         */
L64_loop:
        subs    mlen, mlen, #64         // mlen -= 64

        uadalp.2d       v0, v4          // accumulate pairs of u32 into u64
        ld1.4s  {v4},[data], #16

        uadalp.2d       v1, v5
        ld1.4s  {v5},[data], #16

        uadalp.2d       v2, v6
        ld1.4s  {v6},[data], #16

        uadalp.2d       v3, v7
        ld1.4s  {v7},[data], #16

        b.ge    L64_loop

L64_finishup:
        // fold in the final 64 bytes already loaded in v4-v7
        uadalp.2d       v0, v4
        uadalp.2d       v1, v5
        uadalp.2d       v2, v6
        uadalp.2d       v3, v7

        // reduce 8 64-bit lanes down to a single 64-bit partial
        add.2d  v0, v0, v1
        add.2d  v2, v2, v3
        addp.2d d0, v0                  // d0 = v0[0] + v0[1], upper lane zeroed
        addp.2d d2, v2
        add.2d  v0, v0, v2
        fmov    x7, d0                  // partial in x7 now

        // restore used vector registers
        ld1.4s  {v0, v1, v2, v3}, [sp], #4*16
        ld1.4s  {v4, v5, v6, v7}, [sp], #4*16

L32_bytes:
        // mlen was pre-decremented by 64; remaining byte count is in
        // bits 0-5, tested individually below (32/16/8/4/2/1).
        tst     mlen, #32
        b.eq    L16_bytes
        ldp     x9, x10, [data], #16
        ldp     x11, x12, [data], #16
        adds    x7, x7, x9
        mov     x9, #0
        adcs    x7, x7, x10
        adcs    x7, x7, x11
        adcs    x7, x7, x12
        adc     x7, x7, x9              // fold final carry back in

L16_bytes:
        tst     mlen, #16
        b.eq    L8_bytes
        ldp     x9, x10, [data], #16
        adds    x7, x7, x9
        mov     x9, #0
        adcs    x7, x7, x10
        adc     x7, x7, x9

L8_bytes:
        tst     mlen, #8
        mov     x10, #0                 // x10 = 0 for the adc's below
        b.eq    L4_bytes
        ldr     x9,[data],#8
        adds    x7, x7, x9
        adc     x7, x7, x10

L4_bytes:
        tst     mlen, #4
        b.eq    L2_bytes
        ldr     w9,[data],#4
        adds    x7, x7, x9
        adc     x7, x7, x10

L2_bytes:
        tst     mlen, #2
        b.eq    L_trailing_bytes
        ldrh    w9,[data],#2
        adds    x7, x7, x9
        adc     x7, x7, x10

L_trailing_bytes:
        tst     mlen, #1
        b.eq    L0_bytes
        ldrb    w9,[data],#1
        adds    x7, x7, x9
        adc     x7, x7, x10
        eor     started_on_odd, started_on_odd, #1  // ended mid-16-bit word

L0_bytes:
/*
 *      if (needs_swap)
 *              partial = (partial << 8) + (partial >> 56);
 */
        cbz     needs_swap, 1f
        ror     x7, x7, #56             // byte-rotate == (p << 8) + (p >> 56)
1:
/*
 *      sum += (partial >> 32) + (partial & 0xffffffff);
 *      sum = (sum >> 32) + (sum & 0xffffffff);
 * }
 */
        add     x3, x3, x7, lsr #32     // sum += partial >> 32
        mov     w7, w7                  // partial &= 0xffffffff
        add     x3, x3, x7              // sum += low half of partial
        mov     w7, w3
        add     x3, x7, x3, lsr #32     // fold sum back into 33 bits max

L_continue:
        cmp     len, #0
        ldr     ptr_m, [m, #M_NEXT]     // m = m->m_next
        b.gt    L_loop

/*
 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
 *     ((sum >> 16) & 0xffff) + (sum & 0xffff);
 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 * return (final_acc & 0xffff);
 * }
 */
        mov     w4, #0x00ffff
        and     x0, x4, x3, lsr #48
        and     x1, x4, x3, lsr #32
        and     x2, x4, x3, lsr #16
        and     x3, x4, x3
        add     w0, w0, w1
        add     w2, w2, w3
        add     w0, w0, w2
        and     w1, w4, w0, lsr #16
        and     w0, w4, w0
        add     w0, w0, w1
        and     w1, w4, w0, lsr #16
        and     w0, w4, w0
        add     w0, w0, w1
        /*
         * If we were to 1's complement it (XOR with 0xffff):
         *
         *      eor     w0, w0, w4
         */
        and     w0, w0, w4

        ret     lr

Lin_cksum_whoops:
        adrp    x0, Lin_cksum_whoops_str@page
        add     x0, x0, Lin_cksum_whoops_str@pageoff
        /*
         * BUG FIX: the bl below clobbers lr (x30); the original fell into
         * an infinite loop at the following mov by returning via the
         * clobbered lr.  Save and restore fp/lr around the call so we
         * actually return -1 to the caller (16-byte-aligned frame).
         */
        stp     x29, x30, [sp, #-16]!
        mov     x29, sp
        bl      #CKSUM_ERR
        ldp     x29, x30, [sp], #16
        mov     x0, #-1
        ret     lr

Lin_cksum_whoops_str:
        .asciz  "os_cpu_in_cksum_mbuf: out of data\n"
        .align  5