/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */

/*
 * the following definitions default the implementation to little-endian
 * architectures
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 * renaming registers to ease code porting from arm64
 */
#define v0	q0
#define v1	q1
#define v2	q2
#define v3	q3
#define v8	q8
#define v9	q9
#define v10	q10
#define v11	q11
#define v12	q12
#define v13	q13
#define v14	q14
#define v15	q15

	.syntax unified
	.align	2
	.code	16
	.thumb_func _os_cpu_copy_in_cksum

	.text
	.globl	_os_cpu_copy_in_cksum
_os_cpu_copy_in_cksum:

#define src		r0
#define dst		r1
#define len		r2
#define sum		r3
#define need_swap	r4
#define partial		r5
#define t		r12

	push	{r4, r5, r7, lr}
	add	r7, sp, #8		/* set up base pointer for debug tracing */

	cmp	len, #0
	mov	partial, #0		/* partial = 0; */
	mov	need_swap, #0		/* needs_swap = 0; */
	cbnz	len, 0f
	b	L_len_0
0:
	/*
	 * Deal with an odd-addressed source byte: accumulate it in partial,
	 * deposited into the high byte of the low 16 bits
	 *
	 *	t = 0;
	 *	if ((uintptr_t)src & 1) {
	 *		t = *src << 8;
	 *		*dst++ = *src++;
	 *		--len;
	 *	}
	 */
	tst	src, #1
	beq	1f
	ldrb	partial, [src]
	add	src, src, #1
	strb	partial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	subs	len, len, #1
	mov	need_swap, #1
	beq	L_len_0
1:
#ifdef KERNEL
	vpush	{v8-v15}
	vpush	{v0-v3}
#endif

	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes, try
	 * 4*16 bytes next.
	 * v0,v1 will store temp result after we exit the L128 loop
	 */
	veor	v0, v0, v0
	veor	v1, v1, v1
	cmp	len, #8*16
	vmov	s0, partial		/* move partial to 1st 64b lane in v0 */
	blt	L64_bytes

	/*
	 * accumulate 8 x 2 x 16-bit pairs into 16 lanes in v0-v3
	 * branch to finish off if len<128
	 */
	vld1.8	{q8, q9}, [src]!
	veor	v2, v2, v2
	vld1.8	{q10, q11}, [src]!
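	/*
	 * the vld1.8 instructions above and below preload the first 8*16
	 * bytes into q8-q15; each pass of L128_loop then stores and
	 * checksums the previously loaded 128 bytes (vpadal.u16 pairwise
	 * adds 16-bit lanes into the 32-bit accumulators in v0-v3) while
	 * loading the next 128 bytes
	 */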
	veor	v3, v3, v3
	vld1.8	{q12, q13}, [src]!
	subs	len, len, #2*8*16
	vld1.8	{q14, q15}, [src]!
	blt	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into 8 8-byte
	 * accumulators per iteration
	 */
L128_loop:
	vpadal.u16	v0, v8
	vst1.8		{q8, q9}, [dst]!
	vpadal.u16	v1, v9
	vld1.8		{q8, q9}, [src]!

	vpadal.u16	v2, v10
	vst1.8		{q10, q11}, [dst]!
	vpadal.u16	v3, v11
	vld1.8		{q10, q11}, [src]!

	vpadal.u16	v0, v12
	vst1.8		{q12, q13}, [dst]!
	vpadal.u16	v1, v13
	vld1.8		{q12, q13}, [src]!

	vpadal.u16	v2, v14
	vst1.8		{q14, q15}, [dst]!
	vpadal.u16	v3, v15
	vld1.8		{q14, q15}, [src]!

	subs		len, len, #8*16
	bge		L128_loop

L128_finishup:
	vpadal.u16	v0, v8
	vst1.8		{q8, q9}, [dst]!
	vpadal.u16	v1, v9

	vpadal.u16	v2, v10
	vst1.8		{q10, q11}, [dst]!
	vpadal.u16	v3, v11

	vpadal.u16	v0, v12
	vst1.8		{q12, q13}, [dst]!
	vpadal.u16	v1, v13

	vpadal.u16	v2, v14
	vst1.8		{q14, q15}, [dst]!
	vpadal.u16	v3, v15

	add		len, len, #8*16

	vadd.i32	v0, v0, v2
	vadd.i32	v1, v1, v3

L64_bytes:
	cmp		len, #4*16
	blt		L32_bytes
	vld1.8		{q8, q9}, [src]!
	vld1.8		{q10, q11}, [src]!

	vpadal.u16	v0, v8
	vst1.8		{q8, q9}, [dst]!
	vpadal.u16	v1, v9

	vpadal.u16	v0, v10
	vst1.8		{q10, q11}, [dst]!
	vpadal.u16	v1, v11
	sub		len, len, #4*16

L32_bytes:
	cmp		len, #2*16
	blt		L16_bytes
	vld1.8		{q8, q9}, [src]!
	vpadal.u16	v0, v8
	vst1.8		{q8, q9}, [dst]!
	vpadal.u16	v1, v9
	sub		len, len, #2*16

L16_bytes:
	vadd.i32	v0, v0, v1
	cmp		len, #16
	blt		L8_bytes
	vld1.8		{q8}, [src]!
	vpadal.u16	v0, v8
	vst1.8		{q8}, [dst]!
	sub		len, len, #16

L8_bytes:
	veor		v1, v1, v1
	tst		len, #8
	beq		L4_bytes
	vld1.8		{d2}, [src]!
	vst1.8		{d2}, [dst]!
	vpadal.u16	v0, v1

L4_bytes:
	ands		len, len, #7
	vpadd.i32	d0, d0, d1
	vpadd.i32	d0, d0, d1
	vmov		partial, s0
#ifdef KERNEL
	vpop		{q0-q1}
	vpop		{q2-q3}
	vpop		{q8-q9}
	vpop		{q10-q11}
	vpop		{q12-q13}
	vpop		{q14-q15}
#endif
	beq		L_len_0

	subs		len, len, #2
	blt		L_trailing_bytes

L2_bytes:
	ldrh		t, [src], #2
	strh		t, [dst], #2
	add		partial, partial, t
	subs		len, len, #2
	bge		L2_bytes

L_trailing_bytes:
	tst		len, #1
	beq		L_len_0
	ldrb		t, [src], #1
	strb		t, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	lsl		t, t, #8
#endif
	add		partial, partial, t

L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz		need_swap, 1f
	lsl		t, partial, #8
	add		partial, t, partial, lsr #24
1:
	movw		lr, #0xffff

	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and		r0, sum, lr
	add		r0, r0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add		r0, r0, partial, lsr #16
	and		partial, partial, lr
	add		r0, r0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and		t, r0, lr
	add		r0, t, r0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and		t, r0, lr
	add		r0, t, r0, lsr #16

	/*
	 * return (~final_acc & 0xffff);
	 *
	 *	mvn	r0, r0
	 *	and	r0, r0, lr
	 */
	pop		{r4, r5, r7, pc}
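
/*
 * Illustrative caller-side usage: as the header comment notes, the 32-bit
 * value returned here still has to be folded to 16 bits and 1's-complemented
 * by the caller when applicable.  A minimal C sketch of that step
 * (fold_and_complement is a hypothetical name, not an existing API):
 *
 *	static inline uint16_t
 *	fold_and_complement(uint32_t partial)	// hypothetical helper
 *	{
 *		// two folds suffice: the first leaves at most a 1-bit carry
 *		partial = (partial >> 16) + (partial & 0xffff);
 *		partial = (partial >> 16) + (partial & 0xffff);
 *		// 1's complement, truncated to 16 bits
 *		return (uint16_t)(~partial & 0xffff);
 *	}
 *
 *	// e.g.  uint16_t cksum =
 *	//	fold_and_complement(os_cpu_copy_in_cksum(src, dst, len, sum0));
 */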