/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * ABI: System V AMD64
 * input :
 *   src  : source starting address            (%rdi)
 *   dst  : destination starting address       (%rsi)
 *   len  : byte stream length                 (%rdx)
 *   sum0 : initial 32-bit sum                 (%rcx)
 *
 * output :
 *   the source byte stream is copied into the destination buffer
 *   the function returns (in %eax) the partial 16-bit checksum
 *   accumulated in a 32-bit variable (without 1's complement); caller is
 *   responsible for folding the 32-bit sum into 16-bit and
 *   performing the 1's complement if applicable
 *
 * Strategy: bulk data is summed as 32-bit words gathered into 64-bit
 * SIMD lanes (so carries cannot be lost until the final folds), while
 * the copy is interleaved with the arithmetic.  When KERNEL is defined,
 * %xmm0-%xmm15 are spilled to the stack and restored, since the kernel
 * does not preserve SIMD state for us.
 */

#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN

	.const
	.align	4

/*
 * a vector v0 = w3 : w2 : w1 : w0 will be using the following mask to
 * extract 0 : w2 : 0 : w0
 * then shift right quadword 32-bit to get 0 : w3 : 0 : w1
 * these two vectors are then accumulated to 4 quadword lanes in 2 vectors;
 * each 64-bit lane can absorb 2^32 word additions before it could overflow,
 * so no intermediate folding is needed inside the loops
 */
L_mask:
	.quad	0x00000000ffffffff
	.quad	0x00000000ffffffff

#define Lmask L_mask(%rip)		/* RIP-relative: PIC-safe access */

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

/* register roles (aliases used throughout the body) */
#define src		%rdi		/* cursor into source buffer */
#define dst		%rsi		/* cursor into destination buffer */
#define len		%rdx		/* bytes remaining */
#define sum		%rcx		/* caller's initial 32-bit sum */
#define need_swap	%r8		/* 1 if src started odd-addressed */
#define t		%r9		/* scalar scratch (64-bit view) */
#define td		%r9d		/* ... 32-bit view */
#define tw		%r9w		/* ... 16-bit view */
#define tb		%r9b		/* ...  8-bit view */
#define partial		%r10		/* running checksum accumulator */
#define partiald	%r10d
#define partialw	%r10w
#define partialb	%r10b

/*
 * renaming vector registers
 * v0-v3  : 64-bit lane accumulators
 * v4-v11 : data being copied (up to 128 bytes in flight)
 * v12-v15: scratch copies used for the mask/shift lane split
 */
#define v0	%xmm0
#define v1	%xmm1
#define v2	%xmm2
#define v3	%xmm3
#define v4	%xmm4
#define v5	%xmm5
#define v6	%xmm6
#define v7	%xmm7
#define v8	%xmm8
#define v9	%xmm9
#define v10	%xmm10
#define v11	%xmm11
#define v12	%xmm12
#define v13	%xmm13
#define v14	%xmm14
#define v15	%xmm15

	/* push callee-saved registers and set up base pointer */
	push	%rbp
	movq	%rsp, %rbp		// entry rsp%16==8; push realigns to 16

	mov	$0, partial		// partial = 0;
	mov	$0, need_swap		// needs_swap = 0;

	cmp	$0, len
	je	L_len_0			// nothing to copy or sum

/*
 * Deal with odd-addressed byte, use w7 to store temporary sum, deposit this
 * byte to high byte of 16-bit in w7
 *
 * t = 0;
 * if ((uintptr_t)src & 1) {
 *     t = *src << 8;
 *     *dst++ = *src++;
 *     --len;
 * }
 *
 * After consuming the leading byte the remaining stream is summed with
 * even alignment relative to the checksum, and need_swap records that the
 * final 16-bit sum must be byte-rotated to compensate.
 */
	test	$1, src
	je	1f			// src already even-addressed

	movzb	(src), partial		// partial = *src (zero-extended)
	add	$1, src
	movb	partialb, (dst)		// *dst = byte just read
	add	$1, dst
#if BYTE_ORDER == LITTLE_ENDIAN
	shl	$8, partial		// byte belongs in the high half of 16b
#endif
	mov	$1, need_swap		// remember to rotate the folded sum
	sub	$1, len
	jz	L_len_0			// single-byte input: go fold
1:

#ifdef KERNEL
	/* allocate stack space and save xmm0-xmm15 (kernel owns no SIMD state) */
	sub	$16*16, %rsp		// rsp stays 16-aligned for movdqa
	movdqa	v0, 0*16(%rsp)
	movdqa	v1, 1*16(%rsp)
	movdqa	v2, 2*16(%rsp)
	movdqa	v3, 3*16(%rsp)
	movdqa	v4, 4*16(%rsp)
	movdqa	v5, 5*16(%rsp)
	movdqa	v6, 6*16(%rsp)
	movdqa	v7, 7*16(%rsp)
	movdqa	v8, 8*16(%rsp)
	movdqa	v9, 9*16(%rsp)
	movdqa	v10, 10*16(%rsp)
	movdqa	v11, 11*16(%rsp)
	movdqa	v12, 12*16(%rsp)
	movdqa	v13, 13*16(%rsp)
	movdqa	v14, 14*16(%rsp)
	movdqa	v15, 15*16(%rsp)
#endif

	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes,
	 * try 4*16 bytes next
	 * v0,v1 will store temp result after we exit the L128 loop
	 */
	pxor	v0, v0
	pxor	v1, v1
	cmp	$(8*16), len
	movq	partial, v0		// move partial to 1st 64b lane in v0
	jl	L64_bytes		// fewer than 128 bytes: skip big loop

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load 1st 4 vectors, and clear v0-v3
	 */
	pxor	v2, v2
	pxor	v3, v3
	movups	0*16(src), v4		// prime the pipeline: first 128 bytes
	movups	1*16(src), v5
	movups	2*16(src), v6
	movups	3*16(src), v7
	movups	4*16(src), v8
	movups	5*16(src), v9
	movups	6*16(src), v10
	movups	7*16(src), v11
	add	$8*16, src

	/*
	 * branch to finish off if len<128
	 * (pre-decrement by 256: 128 already loaded above + 128 the loop
	 * body will load; the loop exits with the last 128 bytes still
	 * held in v4-v11, which L128_finishup stores and accumulates)
	 */
	sub	$2*8*16, len
	jl	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into
	 * 8 8-byte accumulators per iteration
	 * Loads, stores, copies, shifts, masks and adds are deliberately
	 * interleaved to overlap memory traffic with SIMD arithmetic.
	 */
L128_loop:
	/*
	 * store v4-v7 to dst[0:3]
	 * copy v4-v7 to v12-v15
	 * extract w3:w1 in v4-v7 (odd words, shifted down into 64b lanes)
	 */
	movups	v4, 0*16(dst)
	movdqa	v4, v12
	psrlq	$32, v4

	movups	v5, 1*16(dst)
	movdqa	v5, v13
	psrlq	$32, v5

	movups	v6, 2*16(dst)
	movdqa	v6, v14
	psrlq	$32, v6

	movups	v7, 3*16(dst)
	movdqa	v7, v15
	psrlq	$32, v7

	/*
	 * store v8-v11 to dst[4:7]
	 * extract w2:w0 in v12-v15 (even words, masked in place)
	 * accumulate w3:w1 in v4-v7 to v0-v3
	 */
	movups	v8, 4*16(dst)
	pand	Lmask, v12
	paddq	v4, v0

	movups	v9, 5*16(dst)
	pand	Lmask, v13
	paddq	v5, v1

	movups	v10, 6*16(dst)
	pand	Lmask, v14
	paddq	v6, v2

	movups	v11, 7*16(dst)
	pand	Lmask, v15
	paddq	v7, v3

	add	$8*16, dst		// advance dst for next iteration

	/*
	 * accumulate w2:w0 in v12-v15 to v0-v3
	 * copy v8-v11 to v12-v15
	 * extract w3:w1 in v8-v11
	 */
	paddq	v12, v0
	movdqa	v8, v12
	psrlq	$32, v8

	paddq	v13, v1
	movdqa	v9, v13
	psrlq	$32, v9

	paddq	v14, v2
	movdqa	v10, v14
	psrlq	$32, v10

	paddq	v15, v3
	movdqa	v11, v15
	psrlq	$32, v11

	/*
	 * load src[0:3] to v4-v7 (next iteration's first half)
	 * accumulate w3:w1 in v8-v11 to v0-v3
	 * extract w2:w0 in v12-v15
	 */
	movups	0*16(src), v4
	paddq	v8, v0
	pand	Lmask, v12

	movups	1*16(src), v5
	paddq	v9, v1
	pand	Lmask, v13

	movups	2*16(src), v6
	paddq	v10, v2
	pand	Lmask, v14

	movups	3*16(src), v7
	paddq	v11, v3
	pand	Lmask, v15

	/*
	 * load src[4:7] to v8-v11
	 * accumulate w2:w0 in v12-v15 to v0-v3
	 */
	movups	4*16(src), v8
	paddq	v12, v0

	movups	5*16(src), v9
	paddq	v13, v1

	movups	6*16(src), v10
	paddq	v14, v2

	movups	7*16(src), v11
	paddq	v15, v3

	add	$8*16, src		// advance src for next iteration

	sub	$8*16, len
	jge	L128_loop

	/*
	 * drain the pipeline: store and accumulate the final 128 bytes
	 * still held in v4-v11 (same mask/shift lane split as the loop,
	 * minus any further loads)
	 */
L128_finishup:
	movups	v4, 0*16(dst)
	movdqa	v4, v12
	psrlq	$32, v4

	movups	v5, 1*16(dst)
	movdqa	v5, v13
	psrlq	$32, v5

	movups	v6, 2*16(dst)
	movdqa	v6, v14
	psrlq	$32, v6

	movups	v7, 3*16(dst)
	movdqa	v7, v15
	psrlq	$32, v7

	pand	Lmask, v12
	paddq	v4, v0
	movups	v8, 4*16(dst)

	pand	Lmask, v13
	paddq	v5, v1
	movups	v9, 5*16(dst)

	pand	Lmask, v14
	paddq	v6, v2
	movups	v10, 6*16(dst)

	pand	Lmask, v15
	paddq	v7, v3
	movups	v11, 7*16(dst)

	add	$8*16, dst

	paddq	v12, v0
	movdqa	v8, v12
	psrlq	$32, v8

	paddq	v13, v1
	movdqa	v9, v13
	psrlq	$32, v9

	paddq	v14, v2
	movdqa	v10, v14
	psrlq	$32, v10

	paddq	v15, v3
	movdqa	v11, v15
	psrlq	$32, v11

	paddq	v8, v0
	pand	Lmask, v12

	paddq	v9, v1
	pand	Lmask, v13

	paddq	v10, v2
	pand	Lmask, v14

	paddq	v11, v3
	pand	Lmask, v15

	paddq	v12, v0
	paddq	v13, v1
	paddq	v14, v2
	paddq	v15, v3

	add	$8*16, len		// undo half the pre-decrement: the
					// 128 bytes just drained are done

	/* absorb v2-v3 into v0-v1 */
	paddq	v2, v0
	paddq	v3, v1

L64_bytes:				// handle a remaining 64-byte chunk
	cmp	$4*16, len
	jl	L32_bytes

	movups	0*16(src), v4
	movups	1*16(src), v5
	movups	2*16(src), v6
	movups	3*16(src), v7
	add	$4*16, src

	movups	v4, 0*16(dst)
	movups	v5, 1*16(dst)
	movups	v6, 2*16(dst)
	movups	v7, 3*16(dst)
	add	$4*16, dst

	movdqa	v4, v12
	psrlq	$32, v4
	movdqa	v5, v13
	psrlq	$32, v5
	movdqa	v6, v14
	psrlq	$32, v6
	movdqa	v7, v15
	psrlq	$32, v7

	pand	Lmask, v12
	paddq	v4, v0
	pand	Lmask, v13
	paddq	v5, v1
	pand	Lmask, v14
	paddq	v6, v0
	pand	Lmask, v15
	paddq	v7, v1

	paddq	v12, v0
	paddq	v13, v1
	paddq	v14, v0
	paddq	v15, v1

	sub	$4*16, len

L32_bytes:				// handle a remaining 32-byte chunk
	cmp	$2*16, len
	jl	L16_bytes
	movups	0*16(src), v4
	movups	1*16(src), v5
	add	$2*16, src

	movups	v4, 0*16(dst)
	movups	v5, 1*16(dst)
	add	$2*16, dst

	movdqa	v4, v12
	movdqa	v5, v13
	psrlq	$32, v4
	psrlq	$32, v5
	pand	Lmask, v12
	pand	Lmask, v13
	paddq	v4, v0
	paddq	v5, v1
	paddq	v12, v0
	paddq	v13, v1

	sub	$2*16, len

L16_bytes:
	paddq	v1, v0			// collapse to a single accumulator

	cmp	$16, len
	jl	L8_bytes

	movups	0*16(src), v4
	add	$1*16, src

	movups	v4, 0*16(dst)
	add	$1*16, dst

	movdqa	v4, v12
	psrlq	$32, v4
	pand	Lmask, v12
	paddq	v4, v0
	paddq	v12, v0

	sub	$16, len

L8_bytes:
	/* reduce the two 64-bit lanes of v0 into the scalar accumulator */
	movq	v0, partial		// partial  = low lane
	psrldq	$8, v0			// bring high lane down (byte shift)
	movq	v0, t
	add	t, partial		// partial += high lane

#ifdef KERNEL
	// restore xmm0-xmm15 and deallocate stack space
	movdqa	0*16(%rsp), v0
	movdqa	1*16(%rsp), v1
	movdqa	2*16(%rsp), v2
	movdqa	3*16(%rsp), v3
	movdqa	4*16(%rsp), v4
	movdqa	5*16(%rsp), v5
	movdqa	6*16(%rsp), v6
	movdqa	7*16(%rsp), v7
	movdqa	8*16(%rsp), v8
	movdqa	9*16(%rsp), v9
	movdqa	10*16(%rsp), v10
	movdqa	11*16(%rsp), v11
	movdqa	12*16(%rsp), v12
	movdqa	13*16(%rsp), v13
	movdqa	14*16(%rsp), v14
	movdqa	15*16(%rsp), v15
	add	$16*16, %rsp
#endif

	/* scalar tail: copy/sum remaining 4-byte words */
	sub	$4, len
	jl	L2_bytes
0:
	movl	(src), td		// movl zero-extends into full t
	add	t, partial
	mov	td, (dst)
	add	$4, src
	add	$4, dst
	sub	$4, len
	jge	0b


L2_bytes:				// optional 2-byte remnant
	test	$2, len
	je	L_trailing_bytes

	movzwl	(src), td
	add	t, partial
	mov	tw, (dst)
	add	$2, src
	add	$2, dst

L_trailing_bytes:			// optional final odd byte
	test	$1, len
	je	L0_bytes
	movzbl	(src), td
	mov	tb, (dst)
#if BYTE_ORDER != LITTLE_ENDIAN
	shl	$8, t			// partial <<= 8;
#endif
	add	t, partial

L0_bytes:
	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	mov	partiald, %eax		// 32-bit mov zero-extends
	shr	$32, partial
	add	%rax, partial

	/* partial = (partial >> 16) + (partial & 0xffff); */
	movzwl	partialw, %eax
	shr	$16, partial
	add	%rax, partial

L_len_0:
	/*
	 * if (needs_swap)
	 *     partial = (partial << 8) + (partial >> 24);
	 * (rotate within 16 bits: undoes the odd-address byte offset;
	 * any overflow above bit 15 is caught by the folds below)
	 */
	cmp	$0, need_swap
	je	1f
	mov	partial, %rax
	shl	$8, %rax
	shr	$24, partial
	add	%rax, partial
1:

	/* final_acc = (initial_sum >> 16) + (initial_sum & 0xffff); */
	movzwl	%cx, %eax		// %cx/%ecx = caller's sum0
	shr	$16, %ecx
	add	%ecx, %eax

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	movzwl	partialw, %ecx
	shr	$16, partial
	add	%ecx, %eax
	add	partiald, %eax

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	movzwl	%ax, %ecx
	shr	$16, %eax
	add	%ecx, %eax

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	movzwl	%ax, %ecx
	shr	$16, %eax
	add	%ecx, %eax

	/*
	 * return (~final_acc & 0xffff);
	 *
	 * intentionally NOT complemented here (see header comment):
	 * not	%eax
	 * movzwl	%ax, %eax
	 */

	/* restore callee-saved registers */
	pop	%rbp
	ret