/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 *  extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *      uint32_t len, uint32_t sum0);
 *
 *  input :
 *      src : source starting address
 *      dst : destination starting address
 *      len : byte stream length
 *      sum0 : initial 32-bit sum
 *
 *  output :
 *      the source byte stream is copied into the destination buffer
 *      the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */

/*
 * Byte order consulted by the "#if BYTE_ORDER ..." tests in this file.
 * It is fixed to little-endian here.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

	.const
	.p2align	4

/*
 * Lane mask used to split a 128-bit vector of four 32-bit words
 * w3 : w2 : w1 : w0 into two vectors of zero-extended 64-bit lanes:
 *   - ANDing with this mask keeps the even words:      0 : w2 : 0 : w0
 *   - shifting each quadword right by 32 keeps the odd: 0 : w3 : 0 : w1
 * Both halves are then added into 64-bit accumulator lanes.
 */
L_mask:
	.quad	0x00000000ffffffff, 0x00000000ffffffff

#define Lmask	L_mask(%rip)

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4

/*
 * uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * Copy len bytes from src to dst and, in the same pass, add the copied
 * data to a running sum.  The sum is folded, rotated by one byte when the
 * source started on an odd address, combined with sum0 and folded again.
 * The result is NOT complemented.
 *
 * Syntax: AT&T, passed through the C preprocessor.
 * In:     %rdi = src, %rsi = dst, %edx = len, %ecx = sum0
 * Out:    %eax = folded sum
 * Clobb:  %rax, %rcx, %rdx, %rsi, %rdi, %r8, %r9, %r10, flags.
 *         %xmm0-%xmm15 are used as scratch; when KERNEL is defined they
 *         are saved on the stack and restored before returning.
 */
_os_cpu_copy_in_cksum:

/*
 * register roles
 */
#define	src		%rdi	/* read cursor */
#define	dst		%rsi	/* write cursor */
#define	len		%rdx	/* bytes still to process (64-bit view) */
#define	lend		%edx	/* 32-bit view of len, as passed by the caller */
#define	sum		%rcx	/* sum0; only %cx / %ecx are read */
#define need_swap	%r8	/* 1 if src started on an odd address */
#define t		%r9	/* scratch */
#define td		%r9d
#define tw		%r9w
#define tb		%r9b
#define partial		%r10	/* scalar running sum */
#define partiald	%r10d
#define partialw	%r10w
#define partialb	%r10b

/*
 * renaming vector registers
 *   v0-v3   : 64-bit lane accumulators
 *   v4-v11  : data in flight between load and store
 *   v12-v15 : temporaries holding the masked even words
 */
#define v0		%xmm0
#define v1		%xmm1
#define v2		%xmm2
#define v3		%xmm3
#define v4		%xmm4
#define v5		%xmm5
#define v6		%xmm6
#define v7		%xmm7
#define v8		%xmm8
#define v9		%xmm9
#define v10		%xmm10
#define v11		%xmm11
#define v12		%xmm12
#define v13		%xmm13
#define v14		%xmm14
#define v15		%xmm15

	/* push callee-saved registers and set up base pointer */
	push	%rbp
	movq	%rsp, %rbp

	/*
	 * len is a uint32_t argument: bits 63:32 of %rdx are not guaranteed
	 * to be zero on entry, yet every test below uses the 64-bit register.
	 * A 32-bit register write clears the upper half.
	 */
	mov	lend, lend	// len = (uint32_t)len;

	mov	$0, partial	// partial = 0;
	mov	$0, need_swap	// needs_swap = 0;

	cmp	$0, len
	je	L_len_0

/*
 * Deal with an odd-addressed leading byte: copy it, start the running sum
 * with that byte in bits 15:8 (little-endian build), and remember that the
 * folded sum has to be rotated by 8 bits at the end.
 *
 *	if ((uintptr_t)src & 1) {
 *		partial = *src << 8;
 *		*dst++ = *src++;
 *		needs_swap = 1;
 *		--len;
 *	}
 */
	test	$1, src
	je	1f

	movzb	(src), partial
	add	$1, src
	movb	partialb, (dst)
	add	$1, dst
#if BYTE_ORDER == LITTLE_ENDIAN
	shl	$8, partial
#endif
	mov	$1, need_swap
	sub	$1, len
	jz	L_len_0		// that byte was the whole buffer
1:

#ifdef KERNEL
	/*
	 * allocate stack space and save xmm0-xmm15
	 * (%rsp is 16-byte aligned here: return address + pushed %rbp,
	 * so the aligned movdqa stores are valid)
	 */
	sub	$16*16, %rsp
	movdqa	v0, 0*16(%rsp)
	movdqa	v1, 1*16(%rsp)
	movdqa	v2, 2*16(%rsp)
	movdqa	v3, 3*16(%rsp)
	movdqa	v4, 4*16(%rsp)
	movdqa	v5, 5*16(%rsp)
	movdqa	v6, 6*16(%rsp)
	movdqa	v7, 7*16(%rsp)
	movdqa	v8, 8*16(%rsp)
	movdqa	v9, 9*16(%rsp)
	movdqa	v10, 10*16(%rsp)
	movdqa	v11, 11*16(%rsp)
	movdqa	v12, 12*16(%rsp)
	movdqa	v13, 13*16(%rsp)
	movdqa	v14, 14*16(%rsp)
	movdqa	v15, 15*16(%rsp)
#endif

	/*
	 * If fewer than 8*16 bytes remain, skip the 128-byte path and
	 * try 4*16 bytes next.
	 * v0,v1 hold the vector accumulators once we leave the L128 path;
	 * v0 is seeded with the scalar partial sum.
	 */
	pxor	v0, v0
	pxor	v1, v1
	cmp	$(8*16), len
	movq	partial, v0	// move partial to 1st 64b lane in v0 (flags untouched)
	jl	L64_bytes

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * clear v2-v3 (v0-v1 were set up above) and load the first
	 * 8 vectors (128 bytes) into v4-v11
	 */
	pxor	v2, v2
	pxor	v3, v3
	movups	0*16(src), v4
	movups	1*16(src), v5
	movups	2*16(src), v6
	movups	3*16(src), v7
	movups	4*16(src), v8
	movups	5*16(src), v9
	movups	6*16(src), v10
	movups	7*16(src), v11
	add	$8*16, src

	/*
	 * len -= 256.  A negative result means there is no second full
	 * 128-byte block to preload, so only the block already in v4-v11
	 * is left for the 128-byte path.
	 */
	sub	$2*8*16, len
	jl	L128_finishup

	/*
	 * loop for loading and accumulating 16 32-bit words into
	 * 8 8-byte accumulators per iteration
	 *
	 * Invariant at the top of the loop: v4-v11 hold 128 source bytes
	 * that are neither stored nor summed yet, src points just past
	 * them, dst points at their destination, and len >= 0 means at
	 * least another 128 bytes can be loaded.
	 */
L128_loop:
	/*
	 * store v4-v7 to dst[0:3]
	 * copy v4-v7 to v12-v15
	 * extract w3:w1 in v4-v7
	 */
	movups	v4, 0*16(dst)
	movdqa	v4, v12
	psrlq	$32, v4

	movups	v5, 1*16(dst)
	movdqa	v5, v13
	psrlq	$32, v5

	movups	v6, 2*16(dst)
	movdqa	v6, v14
	psrlq	$32, v6

	movups	v7, 3*16(dst)
	movdqa	v7, v15
	psrlq	$32, v7

	/*
	 * store v8-v11 to dst[4:7]
	 * extract w2:w0 in v12-v15
	 * accumulate w3:w1 in v4-v7 to v0-v3
	 */
	movups	v8, 4*16(dst)
	pand	Lmask, v12
	paddq	v4, v0

	movups	v9, 5*16(dst)
	pand	Lmask, v13
	paddq	v5, v1

	movups	v10, 6*16(dst)
	pand	Lmask, v14
	paddq	v6, v2

	movups	v11, 7*16(dst)
	pand	Lmask, v15
	paddq	v7, v3

	add	$8*16, dst	// advance dst for next iteration

	/*
	 * accumulate w2:w0 in v12-v15 to v0-v3
	 * copy v8-v11 to v12-v15
	 * extract w3:w1 in v8-v11
	 */
	paddq	v12, v0
	movdqa	v8, v12
	psrlq	$32, v8

	paddq	v13, v1
	movdqa	v9, v13
	psrlq	$32, v9

	paddq	v14, v2
	movdqa	v10, v14
	psrlq	$32, v10

	paddq	v15, v3
	movdqa	v11, v15
	psrlq	$32, v11

	/*
	 * load src[0:3] to v4-v7
	 * accumulate w3:w1 in v8-v11 to v0-v3
	 * extract w2:w0 in v12-v15
	 */
	movups	0*16(src), v4
	paddq	v8, v0
	pand	Lmask, v12

	movups	1*16(src), v5
	paddq	v9, v1
	pand	Lmask, v13

	movups	2*16(src), v6
	paddq	v10, v2
	pand	Lmask, v14

	movups	3*16(src), v7
	paddq	v11, v3
	pand	Lmask, v15

	/*
	 * load src[4:7] to v8-v11
	 * accumulate w2:w0 in v12-v15 to v0-v3
	 */
	movups	4*16(src), v8
	paddq	v12, v0

	movups	5*16(src), v9
	paddq	v13, v1

	movups	6*16(src), v10
	paddq	v14, v2

	movups	7*16(src), v11
	paddq	v15, v3

	add	$8*16, src	// advance src for next iteration

	sub	$8*16, len
	jge	L128_loop

	/*
	 * Drain: store and sum the 128 bytes still held in v4-v11.
	 * Same sequence as the loop body, without the loads.
	 */
L128_finishup:
	movups	v4, 0*16(dst)
	movdqa	v4, v12
	psrlq	$32, v4

	movups	v5, 1*16(dst)
	movdqa	v5, v13
	psrlq	$32, v5

	movups	v6, 2*16(dst)
	movdqa	v6, v14
	psrlq	$32, v6

	movups	v7, 3*16(dst)
	movdqa	v7, v15
	psrlq	$32, v7

	pand	Lmask, v12
	paddq	v4, v0
	movups	v8, 4*16(dst)

	pand	Lmask, v13
	paddq	v5, v1
	movups	v9, 5*16(dst)

	pand	Lmask, v14
	paddq	v6, v2
	movups	v10, 6*16(dst)

	pand	Lmask, v15
	paddq	v7, v3
	movups	v11, 7*16(dst)

	add	$8*16, dst

	paddq	v12, v0
	movdqa	v8, v12
	psrlq	$32, v8

	paddq	v13, v1
	movdqa	v9, v13
	psrlq	$32, v9

	paddq	v14, v2
	movdqa	v10, v14
	psrlq	$32, v10

	paddq	v15, v3
	movdqa	v11, v15
	psrlq	$32, v11

	paddq	v8, v0
	pand	Lmask, v12

	paddq	v9, v1
	pand	Lmask, v13

	paddq	v10, v2
	pand	Lmask, v14

	paddq	v11, v3
	pand	Lmask, v15

	paddq	v12, v0
	paddq	v13, v1
	paddq	v14, v2
	paddq	v15, v3

	/* undo the extra pre-decrement: len = bytes left (less than 128) */
	add	$8*16, len

	/* absorb v2-v3 into v0-v1 */
	paddq	v2, v0
	paddq	v3, v1

	/* copy and sum one 64-byte block if at least 64 bytes remain */
L64_bytes:
	cmp	$4*16, len
	jl	L32_bytes

	movups	0*16(src), v4
	movups	1*16(src), v5
	movups	2*16(src), v6
	movups	3*16(src), v7
	add	$4*16, src

	movups	v4, 0*16(dst)
	movups	v5, 1*16(dst)
	movups	v6, 2*16(dst)
	movups	v7, 3*16(dst)
	add	$4*16, dst

	movdqa	v4, v12
	psrlq	$32, v4
	movdqa	v5, v13
	psrlq	$32, v5
	movdqa	v6, v14
	psrlq	$32, v6
	movdqa	v7, v15
	psrlq	$32, v7

	pand	Lmask, v12
	paddq	v4, v0
	pand	Lmask, v13
	paddq	v5, v1
	pand	Lmask, v14
	paddq	v6, v0
	pand	Lmask, v15
	paddq	v7, v1

	paddq	v12, v0
	paddq	v13, v1
	paddq	v14, v0
	paddq	v15, v1

	sub	$4*16, len

	/* copy and sum one 32-byte block if at least 32 bytes remain */
L32_bytes:
	cmp	$2*16, len
	jl	L16_bytes
	movups	0*16(src), v4
	movups	1*16(src), v5
	add	$2*16, src

	movups	v4, 0*16(dst)
	movups	v5, 1*16(dst)
	add	$2*16, dst

	movdqa	v4, v12
	movdqa	v5, v13
	psrlq	$32, v4
	psrlq	$32, v5
	pand	Lmask, v12
	pand	Lmask, v13
	paddq	v4, v0
	paddq	v5, v1
	paddq	v12, v0
	paddq	v13, v1

	sub	$2*16, len

	/* merge v1 into v0, then handle one 16-byte block if present */
L16_bytes:
	paddq	v1, v0

	cmp	$16, len
	jl	L8_bytes

	movups	0*16(src), v4
	add	$1*16, src

	movups	v4, 0*16(dst)
	add	$1*16, dst

	movdqa	v4, v12
	psrlq	$32, v4
	pand	Lmask, v12
	paddq	v4, v0
	paddq	v12, v0

	sub	$16, len

	/* vector work is done: partial = low lane of v0 + high lane of v0 */
L8_bytes:
	movq	v0, partial
	psrldq	$8, v0
	movq	v0, t
	add	t, partial

#ifdef KERNEL
	// restore xmm0-xmm15 and deallocate stack space
	movdqa	0*16(%rsp), v0
	movdqa	1*16(%rsp), v1
	movdqa	2*16(%rsp), v2
	movdqa	3*16(%rsp), v3
	movdqa	4*16(%rsp), v4
	movdqa	5*16(%rsp), v5
	movdqa	6*16(%rsp), v6
	movdqa	7*16(%rsp), v7
	movdqa	8*16(%rsp), v8
	movdqa	9*16(%rsp), v9
	movdqa	10*16(%rsp), v10
	movdqa	11*16(%rsp), v11
	movdqa	12*16(%rsp), v12
	movdqa	13*16(%rsp), v13
	movdqa	14*16(%rsp), v14
	movdqa	15*16(%rsp), v15
	add	$16*16, %rsp
#endif

	/*
	 * scalar tail, 4 bytes at a time.  len is pre-decremented by 4 so
	 * the loop can branch on the sign; on exit len is in -4..-1 and its
	 * low two bits still equal the low two bits of the bytes left.
	 */
	sub	$4, len
	jl	L2_bytes
0:
	movl	(src), td	// 32-bit load, upper half of t is cleared
	add	t, partial
	mov	td, (dst)
	add	$4, src
	add	$4, dst
	sub	$4, len
	jge	0b


	/* 2 trailing bytes, if bit 1 of the remaining count is set */
L2_bytes:
	test	$2, len
	je	L_trailing_bytes

	movzwl	(src), td
	add	t, partial
	mov	tw, (dst)
	add	$2, src
	add	$2, dst

	/* 1 trailing byte, if bit 0 of the remaining count is set */
L_trailing_bytes:
	test	$1, len
	je	L0_bytes
	movzbl	(src), td
	mov	tb, (dst)
#if BYTE_ORDER != LITTLE_ENDIAN
	shl	$8, t	// t <<= 8;
#endif
	add	t, partial

L0_bytes:
	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	mov	partiald, %eax
	shr	$32, partial
	add	%rax, partial

	/* partial = (partial >> 16) + (partial & 0xffff); */
	movzwl	partialw, %eax
	shr	$16, partial
	add	%rax, partial

	/*
	 * Also reached directly when len is 0 on entry (partial == 0) or
	 * when the odd leading byte was the only byte.
	 */
L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cmp	$0, need_swap
	je	1f
	mov	partial, %rax
	shl	$8, %rax
	shr	$24, partial
	add	%rax, partial
1:

	/* final_acc = (initial_sum >> 16) + (initial_sum & 0xffff); */
	movzwl	%cx, %eax
	shr	$16, %ecx
	add	%ecx, %eax

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	movzwl	partialw, %ecx
	shr	$16, partial
	add	%ecx, %eax
	add	partiald, %eax

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	movzwl	%ax, %ecx
	shr	$16, %eax
	add	%ecx, %eax

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	movzwl	%ax, %ecx
	shr	$16, %eax
	add	%ecx, %eax

	/*
	 * The complement step is deliberately not executed here; the value
	 * in %eax is returned as is.  For reference, it would be:
	 *
	 * return (~final_acc & 0xffff);
	 *
	 * not      %eax
	 * movzwl   %ax, %eax
	 */

	/* restore callee-saved registers */
	pop	%rbp
	ret