1/* 2 * Copyright (c) 2009-2018 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* $NetBSD: cpu_in_cksum.S,v 1.2 2008/01/27 16:58:05 chris Exp $ */ 30 31/* 32 * Copyright 2003 Wasabi Systems, Inc. 33 * All rights reserved. 34 * 35 * Written by Steve C. Woodford for Wasabi Systems, Inc. 36 * 37 * Redistribution and use in source and binary forms, with or without 38 * modification, are permitted provided that the following conditions 39 * are met: 40 * 1. Redistributions of source code must retain the above copyright 41 * notice, this list of conditions and the following disclaimer. 42 * 2. 
Redistributions in binary form must reproduce the above copyright 43 * notice, this list of conditions and the following disclaimer in the 44 * documentation and/or other materials provided with the distribution. 45 * 3. All advertising materials mentioning features or use of this software 46 * must display the following acknowledgement: 47 * This product includes software developed for the NetBSD Project by 48 * Wasabi Systems, Inc. 49 * 4. The name of Wasabi Systems, Inc. may not be used to endorse 50 * or promote products derived from this software without specific prior 51 * written permission. 52 * 53 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND 54 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED 55 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR 56 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL WASABI SYSTEMS, INC 57 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR 58 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF 59 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 60 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN 61 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) 62 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE 63 * POSSIBILITY OF SUCH DAMAGE. 64 */ 65 66#ifdef KERNEL 67#include "../../../osfmk/arm/arch.h" 68#include "../../../osfmk/arm/proc_reg.h" 69 70#if __ARM_VFP__ < 3 71#error "Unsupported: __ARM_VFP__ < 3" 72#endif /* __ARM_VFP__ < 3 */ 73#define CKSUM_ERR _kprintf 74#else /* !KERNEL */ 75#ifndef LIBSYSCALL_INTERFACE 76#error "LIBSYSCALL_INTERFACE not defined" 77#endif /* !LIBSYSCALL_INTERFACE */ 78#define CKSUM_ERR _fprintf_stderr 79#define __ARM_VFP__ 3 80#endif /* !KERNEL */ 81 82/* 83 * The following default the implementation to little-endian architectures. 
*/
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN

.syntax unified

/*
 * XXX: [email protected]:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects an "mbuf-like" argument, and it does not expect the mbuf
 * to be authentic; it only cares about the 3 fields below.
 *
 * NOTE(review): the byte offsets are hard-coded, so they presumably have to
 * be kept in step with the real struct mbuf layout by hand -- confirm
 * against the mbuf definition.
 */
#define M_NEXT 0	/* offset of the pointer to the next mbuf in the chain */
#define M_DATA 8	/* offset of the pointer to this mbuf's data */
#define M_LEN 12	/* offset of this mbuf's data length */

/*
 * APPLE MODIFICATION
 *
 * The use of R7 in this code as data register prevents
 * the use of debugging or instrumentation tools, which is an acceptable
 * tradeoff considering the potential gain in performance.
 */

/*
 * Hand-optimised implementations for ARM/Xscale
 */

/*
 * EnableVFP
 *
 * KERNEL builds: call _enable_kernel_vfp_context, preserving r0, r1, r2
 * and r12 across the call.  r3 is not saved and the bl overwrites lr, so
 * neither may be live at the point of use.
 * Non-KERNEL builds: expands to nothing.
 */
	.macro EnableVFP
#ifdef KERNEL
	push	{r0, r1, r2, r12}
	bl	_enable_kernel_vfp_context
	pop	{r0, r1, r2, r12}
#endif /* KERNEL */
	.endm


/*
 * uint32_t os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off,
 *     uint32_t initial_sum);
 *
 * Skip the first `off' bytes of the mbuf chain `m', sum the following
 * `len' bytes with end-around carry, and add the result to `initial_sum'.
 *
 * Entry:
 *	r0	m
 *	r1	len
 *	r2	off
 *	r3	initial_sum
 *
 * Function wide register usage
 *	r0	pointer to the data currently being summed
 *	r1	number of bytes to sum in the current mbuf
 *	r2	`off' while skipping; afterwards the sum of the current mbuf
 *	r3-r7	scratch (data words, byte counts)
 *	r8	accumulated sum
 *	r9	remaining length to parse
 *	r10	number of bytes summed in the mbufs before the current one
 *	r11	r10 XOR data pointer; bit 0 selects the byte rotation
 *		applied to r2 before it is added to r8
 *	ip	pointer to next mbuf
 *
 * r4-r11 and lr are saved on the stack at entry and restored on every
 * exit path.
 *
 * Return value (r0):
 *	- len == 0: initial_sum, unchanged.
 *	- chain ends before `off' bytes have been skipped: a message is
 *	  printed through CKSUM_ERR and -1 is returned.
 *	- otherwise: the partial checksum (without 1's complement).  The
 *	  exit code below already folds the 32-bit accumulator into 16 bits
 *	  and masks it with 0xffff.
 *
 * Because the first two cases return a value that has not been folded,
 * the caller is still responsible for folding the 32-bit result into
 * 16-bit and performing the 1's complement if applicable.
 *
 * NOTE(review): once summing has started, the loop stops only when the
 * chain ends (ip == 0).  A chain shorter than off + len therefore returns
 * the sum of what was available without reporting an error, and a chain
 * longer than off + len is still walked to its end (with zero-length
 * segments).  Confirm that callers rely on this.
 */
	.globl	_os_cpu_in_cksum_mbuf
	.text
	.align	4
_os_cpu_in_cksum_mbuf:
	stmfd	sp!, {r4-r11,lr}

	mov	r8, r3			/* Accumulate sum in r8 */
	mov	r9, r1			/* save len in r9 */
	mov	ip, r0			/* set ip to the current mbuf */

	cmp	r9, #0			/* length is 0? */
	bne	.Lin_cksum_skip_loop	/* if not, proceed further */
	mov	r0, r8			/* otherwise, return initial sum */

	ldmfd	sp!, {r4-r11, pc}

	/*
	 * Skip phase: walk the chain until the mbuf containing byte `off'
	 * is found.  r2 holds the offset still to be skipped.
	 */
.Lin_cksum_skip_loop:
	ldr	r1, [ip, #(M_LEN)]	/* r1 = length of this mbuf */
	ldr	r0, [ip, #(M_DATA)]	/* r0 = data pointer of this mbuf */
	ldr	ip, [ip, #(M_NEXT)]	/* ip = following mbuf (may be NULL) */
.Lin_cksum_skip_entry:
	subs	r2, r2, r1		/* offset = offset - mbuf length */
	blt	.Lin_cksum_skip_done	/* if offset has gone negative start with this mbuf */
	cmp	ip, #0x00
	bne	.Lin_cksum_skip_loop
	b	.Lin_cksum_whoops	/* chain ended before the offset was reached */

.Lin_cksum_skip_done:
	add	r0, r2, r0		/* data += offset (offset is < 0) */
	add	r0, r0, r1		/* data += length of mbuf */
					/* data == start of data to cksum */
	rsb	r1, r2, #0x00		/* length = remainder of mbuf to read */
	mov	r10, #0x00		/* no bytes summed yet */
	b	.Lin_cksum_entry

	/*
	 * Sum phase: one pass per mbuf.
	 */
.Lin_cksum_loop:
	ldr	r1, [ip, #(M_LEN)]
	ldr	r0, [ip, #(M_DATA)]
	ldr	ip, [ip, #(M_NEXT)]
.Lin_cksum_entry:
	cmp	r9, r1
	movlt	r1, r9			/* r1 = min(mbuf bytes, remaining length) */
	sub	r9, r9, r1		/* remaining length -= bytes taken here */
	eor	r11, r10, r0		/* bit 0: parity of bytes summed so far XOR parity of data address */
	add	r10, r10, r1		/* bytes summed so far += bytes taken here */
	adds	r2, r1, #0x00		/* r2 = r1; sets Z when nothing to sum */

	beq	.Lin_cksum_next		/* nothing to sum here (r2 == 0 is added to r8) */

/*
 * APPLE MODIFICATION
 *
 * Replace the 'blne _ASM_LABEL(L_cksumdata)' by bringing the called function
 * inline. This results in slightly faster code, and also permits the whole
 * function to be included in kernel profiling data.
 */

/*
 * The main in*_cksum() workhorse...
 *
 * Entry parameters:
 *	r0	Pointer to buffer
 *	r1	Buffer length (non-zero)
 *
 * Result:
 *	r2	Accumulated 32-bit sum
 *
 * Clobbers:
 *	r0-r7 (and q0-q7 are saved/restored around the NEON path)
 *
 * Since the code is inline there is no call or return: every path ends by
 * branching to, or falling into, .Lin_cksum_next.
 */
	mov	r2, #0

	/* We first have to word-align the buffer. */
	ands	r7, r0, #0x03
	beq	.Lcksumdata_wordaligned
	rsb	r7, r7, #0x04		/* r7 = bytes (1..3) up to the next word boundary */
	cmp	r1, r7			/* Enough bytes left to make it? */
	blt	.Lcksumdata_endgame
	/*
	 * The flags from this compare drive every conditional instruction
	 * up to and including the orr's below (nothing in between sets
	 * flags): ge = at least 2 bytes, gt = 3 bytes, eq = exactly 2 bytes,
	 * i.e. the buffer starts on an even address.
	 */
	cmp	r7, #0x02
	ldrb	r4, [r0], #0x01		/* Fetch 1st byte */
	ldrbge	r5, [r0], #0x01		/* Fetch 2nd byte */
	movlt	r5, #0x00
	ldrbgt	r6, [r0], #0x01		/* Fetch 3rd byte */
	movle	r6, #0x00
	/* Combine the three bytes depending on endianness and alignment */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r2, r5, r4, lsl #8
	orreq	r2, r2, r6, lsl #24
	orrne	r2, r4, r5, lsl #8
	orrne	r2, r2, r6, lsl #16
#else
	orreq	r2, r4, r5, lsl #8
	orreq	r2, r2, r6, lsl #16
	orrne	r2, r5, r4, lsl #8
	orrne	r2, r2, r6, lsl #24
#endif
	subs	r1, r1, r7		/* Update length */
	beq	.Lin_cksum_next		/* All done? */

	/* Buffer is now word aligned */
.Lcksumdata_wordaligned:

#if __ARM_VFP__ >= 3

	cmp	r1, #512		// do this if r1 is at least 512
	blt	9f

	EnableVFP			// r3 and lr are not live here

	and	r3, r1, #~0x3f		// r3 = length rounded down to a multiple of 64

	vpush	{q0-q7}			// scratch vector registers, restored by the vpop below

	// move r2 to s16 (q4) for neon computation; the rest of q4 is zero.
	// The first 64 bytes are loaded into q0-q3 at the same time.
	veor	q4, q4, q4
	vld1.32	{q0-q1}, [r0]!
	vmov	s16, r2
	vld1.32	{q2-q3}, [r0]!

	// pre-decrement size by 128: the 64 bytes loaded above plus the
	// 64 bytes loaded by the priming step that follows
	subs	r3, r3, #0x80

	// Priming step: consume the first 64 bytes while loading the next 64.
	// q4 accumulates (vpadal); q5-q7 have not been cleared, so they are
	// initialised with a plain pairwise add (vpaddl).
	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpaddl.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpaddl.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpaddl.u32	q7, q3
	vld1.32	{q3}, [r0]!

	// Main loop: accumulate the 64 bytes loaded on the previous pass and
	// load the next 64.  The bgt at the bottom tests the flags set by
	// the subs at the top.
0:
	subs	r3, r3, #0x40		// decrement size by 64

	vpadal.u32	q4, q0
	vld1.32	{q0}, [r0]!
	vpadal.u32	q5, q1
	vld1.32	{q1}, [r0]!
	vpadal.u32	q6, q2
	vld1.32	{q2}, [r0]!
	vpadal.u32	q7, q3
	vld1.32	{q3}, [r0]!

	bgt	0b

	// accumulate the last 64 bytes that were loaded
	vpadal.u32	q4, q0
	vpadal.u32	q5, q1
	vpadal.u32	q6, q2
	vpadal.u32	q7, q3

	// merge the four accumulators into q4, then its two halves into d8
	vpadal.u32	q4, q5
	vpadal.u32	q6, q7
	vpadal.u32	q4, q6
	vadd.i64	d8, d9

	// fold the upper 32 bits of d8 into the lower 32 bits, repeatedly,
	// so that the total ends up in s16
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8
	vpaddl.u32	d8, d8

	vmov	r2, s16			// back to the integer accumulator

	vpop	{q0-q7}

	ands	r1, r1, #0x3f		// residual bytes
	beq	.Lin_cksum_next

9:

#endif /* __ARM_VFP__ >= 3 */

	/*
	 * 64 bytes per pass.  r1 is biased by -64 so that the bge at the
	 * bottom keeps looping while a full 64 bytes remain; the bias is
	 * removed at .Lcksumdata_bigloop_end.
	 */
	subs	r1, r1, #0x40
	blt	.Lcksumdata_bigloop_end

.Lcksumdata_bigloop:
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3		/* start of the carry chain */
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r6}
	adcs	r2, r2, r7
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00		/* add the final carry back in */
	subs	r1, r1, #0x40
	bge	.Lcksumdata_bigloop
.Lcksumdata_bigloop_end:

	adds	r1, r1, #0x40		/* undo the bias: r1 = bytes left (0..63) */
	beq	.Lin_cksum_next

	cmp	r1, #0x20

	blt	.Lcksumdata_less_than_32
	/* One block of 32 bytes */
	ldmia	r0!, {r3, r4, r5, r6}
	adds	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	ldmia	r0!, {r3, r4, r5, r7}
	adcs	r2, r2, r6
	adcs	r2, r2, r3
	adcs	r2, r2, r4
	adcs	r2, r2, r5
	adcs	r2, r2, r7
	adc	r2, r2, #0x00
	subs	r1, r1, #0x20
	beq	.Lin_cksum_next

.Lcksumdata_less_than_32:
	/*
	 * There are less than 32 bytes left.
	 *
	 * r3 = bytes held in whole 8-byte groups (0, 8, 16 or 24) and r1 is
	 * reduced to what remains after them (0..7).  r4 = (24 - r3) * 1.5,
	 * i.e. 36, 24, 12 or 0, is the distance to jump forward so that only
	 * the groups actually needed are executed: each group below is three
	 * instructions (12 bytes) per 8 bytes of data.
	 *
	 * With r4 == 0 the addne is not executed and all three groups run.
	 * Otherwise the target is (address of the addne + 8 + r4), which is
	 * why the first group carries an extra nop: it has to be four
	 * instructions long for the second group to sit at +20, the third
	 * at +32 and the final adc at +44.
	 *
	 * NOTE(review): this assumes 4-byte instruction encodings and pc
	 * reading as the current instruction + 8 (ARM rather than Thumb
	 * state) -- confirm the mode this file is assembled in.  Do not add,
	 * remove or reorder instructions between here and the
	 * "Less than 8 bytes remaining" adc.
	 */
	and	r3, r1, #0x18
	rsb	r4, r3, #0x18
	sub	r1, r1, r3
	adds	r4, r4, r4, lsr #1	/* Side effect: Clear carry flag */
	addne	pc, pc, r4

/*
 * Note: We use ldm here, even on Xscale, since the combined issue/result
 * latencies for ldm and ldrd are the same. Using ldm avoids needless #ifdefs.
 */
	/* At least 24 bytes remaining... */
	ldmia	r0!, {r4, r5}
	nop				/* padding required by the computed jump above */
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 16 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* At least 8 bytes remaining... */
	ldmia	r0!, {r4, r5}
	adcs	r2, r2, r4
	adcs	r2, r2, r5

	/* Less than 8 bytes remaining... */
	adc	r2, r2, #0x00		/* add the final carry back in */
	subs	r1, r1, #0x04		/* is there a whole word left? */
	blt	.Lcksumdata_lessthan4

	ldr	r4, [r0], #0x04
	sub	r1, r1, #0x04		/* keep the -4 bias for the adds below */
	adds	r2, r2, r4
	adc	r2, r2, #0x00

	/* Deal with < 4 bytes remaining */
.Lcksumdata_lessthan4:
	adds	r1, r1, #0x04		/* undo the bias: r1 = bytes left (0..3) */
	beq	.Lin_cksum_next

	/* Deal with 1 to 3 remaining bytes, possibly misaligned */
.Lcksumdata_endgame:
	ldrb	r3, [r0]		/* Fetch first byte */
	cmp	r1, #0x02
	ldrbge	r4, [r0, #0x01]		/* Fetch 2nd and 3rd as necessary */
	movlt	r4, #0x00
	ldrbgt	r5, [r0, #0x02]
	movle	r5, #0x00
	/* Combine the three bytes depending on endianness and alignment */
	tst	r0, #0x01		/* eq = even address, ne = odd address */
#if BYTE_ORDER != LITTLE_ENDIAN
	orreq	r3, r4, r3, lsl #8
	orreq	r3, r3, r5, lsl #24
	orrne	r3, r3, r4, lsl #8
	orrne	r3, r3, r5, lsl #16
#else
	orreq	r3, r3, r4, lsl #8
	orreq	r3, r3, r5, lsl #16
	orrne	r3, r4, r3, lsl #8
	orrne	r3, r3, r5, lsl #24
#endif
	adds	r2, r2, r3
	adc	r2, r2, #0x00

	/*
	 * r2 = sum of this mbuf's bytes.  Rotate it by one byte when bit 0 of
	 * r11 is set, then add it to the running total with end-around carry.
	 */
.Lin_cksum_next:
	tst	r11, #0x01
	movne	r2, r2, ror #8
	adds	r8, r8, r2
	adc	r8, r8, #0x00
	cmp	ip, #00			/* another mbuf in the chain? */
	bne	.Lin_cksum_loop

	/* Fold the 32-bit accumulator into 16 bits */
	mov	r1, #0xff
	orr	r1, r1, #0xff00		/* r1 = 0xffff */
	and	r0, r8, r1		/* low half ... */
	add	r0, r0, r8, lsr #16	/* ... plus high half */
	add	r0, r0, r0, lsr #16	/* add back the carry out of bit 15 */
	and	r0, r0, r1		/* keep the low 16 bits */
	/*
	 * If we were to 1's complement it (XOR with 0xffff):
	 *
	 * eor	r0, r0, r1
	 */

	ldmfd	sp!, {r4-r11, pc}

	/*
	 * The chain ran out while skipping `off' bytes: report it through
	 * CKSUM_ERR and return -1.
	 */
.Lin_cksum_whoops:
	adr	r0, .Lin_cksum_whoops_str
	bl	#CKSUM_ERR
	mov	r0, #-1

	ldmfd	sp!, {r4-r11, pc}

.Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5