/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define	SAVE_REGISTERS	0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

#define src		x0
#define dst		x1
#define len		x2
#define sum		x3
#define need_swap	x5
#define t		x6
#define partial		x7
#define wpartial	w7

	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// need_swap = 0;

	cbz	len, L_len_0

	/*
	 * Deal with odd-addressed byte, use w7 to store temporary sum,
	 * deposit this byte to high byte of 16-bit in w7
	 *
	 *	t = 0;
	 *	if ((uintptr_t)src & 1) {
	 *		t = *src << 8;
	 *		*dst++ = *src++;
	 *		--len;
	 *	}
	 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we will always use v0-v3, and v4-v7/v16-v19 if len>=128
	 * so allocate 12*16 bytes in the stack, and store v0-v3 now,
	 * keep x11 as the pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif
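
	/*
	 * The vector path below is built on uadalp.2d, which adds adjacent
	 * pairs of 32-bit lanes into the two 64-bit lanes of an accumulator.
	 * A minimal C sketch of one such step (illustrative only; q[] is a
	 * hypothetical view of one 16-byte vector as four 32-bit words):
	 *
	 *	acc_lo += (uint64_t)q[0] + (uint64_t)q[1];
	 *	acc_hi += (uint64_t)q[2] + (uint64_t)q[3];
	 *
	 * Carries cannot overflow the 64-bit lanes for any length expressible
	 * in the 32-bit len argument, so all folding is deferred to the end.
	 */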
	/*
	 * pre-decrement len by 8*16, and if less than 8*16 bytes, try
	 * 4*16 bytes next.
	 * v0,v1 will store temp result after we exit the L128 loop
	 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to 1st 64b lane in v0
	b.lt	L64_bytes

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load 1st 4 vectors, and clear v0-v3
	 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to finish off if len<128 */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into 8 8-byte
	 * accumulators per iteration
	 */
L128_loop:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop

L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16

	add.2d	v0, v0, v2
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes
	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

L16_bytes:
	add.2d	v0, v0, v1
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1, [src], #8
	str	d1, [dst], #8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2, [src], #4
	str	s2, [dst], #4

L2_bytes:
	uadalp.2d	v0, v1
	eor.16b	v1, v1, v1
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3, [src], #2
	str	h3, [dst], #2

L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1, [src], #1
	str	b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3
	uadalp.2d	v0, v1
	uadalp.2d	v0, v2

	addp.2d	d0, v0
	fmov	partial, d0

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16
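
	/*
	 * Worked example of the two folds above (an illustration, not part
	 * of the original source): a 64-bit accumulator holding
	 * 0x1_2345_6789 first folds to 0x1 + 0x23456789 = 0x2345678a, then
	 * to 0x2345 + 0x678a = 0x8acf. After the second fold the value is
	 * at most 17 bits wide, so the remaining folds below can finish it.
	 */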
L_len_0:
	/*
	 * if (need_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:

	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * return (~final_acc & 0xffff);
	 *
	 *	mvn	w0, w0
	 *	and	w0, w0, #0xffff
	 */
	ret	lr
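
/*
 * For reference, a typical caller finishes the checksum along these lines
 * (a minimal C sketch under the contract documented at the top of this
 * file; the variable names are illustrative only):
 *
 *	extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *	    uint32_t len, uint32_t sum0);
 *
 *	uint32_t sum = os_cpu_copy_in_cksum(src, dst, len, 0);
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold 32-bit sum into 16
 *	sum = (sum >> 16) + (sum & 0xffff);	// absorb a possible carry
 *	uint16_t cksum = (uint16_t)~sum;	// 1's complement, if applicable
 */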