1/* 2 * Copyright (c) 2016-2021 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *	uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 *
 * strategy: bulk of the work is done 128 bytes per iteration with NEON
 * vpadal.u16 (pairwise add u16 lanes and accumulate into u32 lanes),
 * spreading the accumulation across q0-q3 so carries cannot overflow;
 * copy and checksum are interleaved so each 16-byte chunk is loaded
 * once, stored once, and accumulated once.
 */

/*
 * the following definitions default the implementation to little-endian
 * architectures
 */
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN

/*
 * renaming registers to ease code porting from arm64
 * (v0..v15 below are the 128-bit q registers, NOT arm64 v registers)
 */
#define v0 q0
#define v1 q1
#define v2 q2
#define v3 q3
#define v8 q8
#define v9 q9
#define v10 q10
#define v11 q11
#define v12 q12
#define v13 q13
#define v14 q14
#define v15 q15

	.syntax unified
	.align 2
	.code 16
	.thumb_func _os_cpu_copy_in_cksum
	.text

	.globl _os_cpu_copy_in_cksum
_os_cpu_copy_in_cksum:

/*
 * register roles for the life of the function
 */
#define src r0
#define dst r1
#define len r2
#define sum r3
#define need_swap r4
#define partial r5
#define t r12

	push {r4,r5,r7,lr}
	add r7, sp, #8 /* set up base pointer for debug tracing */

	/*
	 * NOTE(review): the flags set by this cmp appear unused — cbnz
	 * below tests the register directly and later branches use the
	 * flags from subs.
	 */
	cmp len, #0
	mov partial, #0 /* partial = 0; running 32-bit checksum */
	mov need_swap, #0 /* needs_swap = 0; set if src starts odd-aligned */

	/* zero-length input: skip straight to the final fold of sum0 */
	cbnz len, 0f
	b L_len_0
0:

/*
 * Deal with an odd-addressed leading byte; use `partial` to hold the
 * temporary sum, depositing this byte into the high byte of a 16-bit
 * quantity (the byte-order of the accumulated sum is corrected at
 * L_len_0 via need_swap).
 *
 * t = 0;
 * if ((uintptr_t)src & 1) {
 *	t = *src << 8;
 *	*dst++ = *src++;
 *	--len;
 * }
 */
	tst src, #1
	beq 1f
	ldrb partial, [src]
	add src, src, #1
	strb partial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl partial, partial, #8
#endif
	subs len, len, #1
	mov need_swap, #1
	beq L_len_0 /* that odd byte was the whole buffer */
1:

#ifdef KERNEL
	/*
	 * NOTE(review): kernel build saves every NEON register this
	 * routine touches (q8-q15 then q0-q3) — presumably because NEON
	 * state is not otherwise preserved in kernel context; confirm
	 * against xnu conventions. Restored in reverse at L4_bytes.
	 */
	vpush {v8-v15}
	vpush {v0-v3}
#endif

	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes, try
	 * 4*16 bytes next.
	 * v0,v1 will store the temp result after we exit the L128 loop
	 */
	veor v0, v0, v0
	veor v1, v1, v1
	cmp len, #8*16
	vmov s0, partial /* seed low 32-bit lane of v0 with partial */
	blt L64_bytes

	/*
	 * accumulate 8 x 2 x 16-bit pairs into 16 lanes in v0-v3
	 * branch to finish off if len<128
	 */
	vld1.8 {q8,q9}, [src]!
	veor v2, v2, v2
	vld1.8 {q10,q11}, [src]!
	veor v3, v3, v3
	vld1.8 {q12,q13}, [src]!
	subs len, len, #2*8*16 /* account for the 128B just loaded + next 128B */
	vld1.8 {q14,q15}, [src]!
	blt L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into 8 8-byte
	 * accumulators per iteration: each pass stores the 128 bytes
	 * loaded on the previous pass, accumulates them, and loads the
	 * next 128 bytes.
	 */
L128_loop:
	vpadal.u16 v0, v8
	vst1.8 {q8,q9}, [dst]!
	vpadal.u16 v1, v9
	vld1.8 {q8,q9}, [src]!

	vpadal.u16 v2, v10
	vst1.8 {q10,q11}, [dst]!
	vpadal.u16 v3, v11
	vld1.8 {q10,q11}, [src]!

	vpadal.u16 v0, v12
	vst1.8 {q12,q13}, [dst]!
	vpadal.u16 v1, v13
	vld1.8 {q12,q13}, [src]!

	vpadal.u16 v2, v14
	vst1.8 {q14,q15}, [dst]!
	vpadal.u16 v3, v15
	vld1.8 {q14,q15}, [src]!

	subs len, len, #8*16
	bge L128_loop

	/* drain: store + accumulate the final 128 bytes already loaded */
L128_finishup:
	vpadal.u16 v0, v8
	vst1.8 {q8,q9}, [dst]!
	vpadal.u16 v1, v9

	vpadal.u16 v2, v10
	vst1.8 {q10,q11}, [dst]!
	vpadal.u16 v3, v11

	vpadal.u16 v0, v12
	vst1.8 {q12,q13}, [dst]!
	vpadal.u16 v1, v13

	vpadal.u16 v2, v14
	vst1.8 {q14,q15}, [dst]!
	vpadal.u16 v3, v15

	add len, len, #8*16 /* undo the extra pre-decrement */

	/* collapse four accumulators into two */
	vadd.i32 v0, v0, v2
	vadd.i32 v1, v1, v3

	/* copy + accumulate one 64-byte chunk if present */
L64_bytes:
	cmp len, #4*16
	blt L32_bytes

	vld1.8 {q8,q9}, [src]!
	vld1.8 {q10,q11}, [src]!

	vpadal.u16 v0, v8
	vst1.8 {q8,q9}, [dst]!
	vpadal.u16 v1, v9

	vpadal.u16 v0, v10
	vst1.8 {q10,q11}, [dst]!
	vpadal.u16 v1, v11

	sub len, len, #4*16

	/* copy + accumulate one 32-byte chunk if present */
L32_bytes:
	cmp len, #2*16
	blt L16_bytes

	vld1.8 {q8,q9}, [src]!

	vpadal.u16 v0, v8
	vst1.8 {q8,q9}, [dst]!
	vpadal.u16 v1, v9

	sub len, len, #2*16

L16_bytes:
	/* collapse the two accumulators into v0 before the tail */
	vadd.i32 v0, v0, v1

	cmp len, #16
	blt L8_bytes
	vld1.8 {q8}, [src]!
	vpadal.u16 v0, v8
	vst1.8 {q8}, [dst]!

	sub len, len, #16

L8_bytes:
	veor v1, v1, v1 /* zero q1 so only d2 (loaded below) contributes */
	tst len, #8
	beq L4_bytes
	vld1.8 {d2}, [src]!
	vst1.8 {d2}, [dst]!
	vpadal.u16 v0, v1

L4_bytes:
	ands len, len, #7 /* remaining tail bytes; also sets Z for beq below */
	/* horizontal reduction: two pairwise adds leave the total in d0[0] */
	vpadd.i32 d0, d0, d1
	vpadd.i32 d0, d0, d1
	vmov partial, s0 /* extract before the KERNEL vpop clobbers q0 */

#ifdef KERNEL
	vpop {q0-q1}
	vpop {q2-q3}
	vpop {q8-q9}
	vpop {q10-q11}
	vpop {q12-q13}
	vpop {q14-q15}
#endif

	beq L_len_0 /* no tail bytes left */

	subs len, len, #2
	blt L_trailing_bytes

	/* scalar loop for the remaining aligned 16-bit words (up to 3) */
L2_bytes:
	ldrh t, [src], #2
	strh t, [dst], #2
	add partial, partial, t
	subs len, len, #2
	bge L2_bytes

L_trailing_bytes:
	tst len, #1
	beq L_len_0
	ldrb t,[src],#1
	strb t,[dst],#1
#if BYTE_ORDER != LITTLE_ENDIAN
	lsl t, t, #8
#endif
	add partial, partial, t

L_len_0:
	/*
	 * if the source started on an odd address every 16-bit word was
	 * accumulated byte-swapped; rotating the 32-bit sum left by 8
	 * restores the correct byte order:
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz need_swap, 1f
	lsl t, partial, #8
	add partial, t, partial, lsr #24
1:
	movw lr, #0xffff /* 16-bit mask for the folds below */

	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and r0, sum, lr
	add r0, r0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add r0, r0, partial, lsr #16
	and partial, partial, lr
	add r0, r0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and t, r0, lr
	add r0, t, r0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and t, r0, lr
	add r0, t, r0, lsr #16

	/*
	 * the 1's complement is deliberately left to the caller (see the
	 * function header comment):
	 * return (~final_acc & 0xffff);
	 *
	 * mvn r0, r0
	 * and r0, r0, lr
	 */

	pop {r4,r5,r7,pc}