/*
 * xref: /xnu-11215.41.3/bsd/dev/i386/cpu_copy_in_cksum.s
 * (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
 */
/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 *  extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *      uint32_t len, uint32_t sum0);
 *
 *  input :
 *      src : source starting address
 *      dst : destination starting address
 *      len : byte stream length
 *      sum0 : initial 32-bit sum
 *
 *  output :
 *      the source byte stream is copied into the destination buffer
 *      the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */

#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

	.const
	.align	4

/*
 * a vector v0 = w3 : w2 : w1 : w0 will be using the following mask to
 * extract 0 : w2 : 0 : w0
 * then shift right quadword 32-bit to get 0 : w3 : 0 : w1
 * these two vectors are then accumulated to 4 quadword lanes in 2 vectors
 */
L_mask:
	.quad	0x00000000ffffffff
	.quad	0x00000000ffffffff

/* RIP-relative reference to the lane mask, usable as a memory operand */
#define Lmask	L_mask(%rip)

65	.globl	_os_cpu_copy_in_cksum
66	.text
67	.align	4
68_os_cpu_copy_in_cksum:
69
70#define	src		%rdi
71#define	dst		%rsi
72#define	len		%rdx
73#define	sum		%rcx
74#define need_swap	%r8
75#define t		%r9
76#define td		%r9d
77#define tw		%r9w
78#define tb		%r9b
79#define partial		%r10
80#define partiald	%r10d
81#define partialw	%r10w
82#define partialb	%r10b
83
84/*
85 * renaming vector registers
86 */
87#define v0		%xmm0
88#define v1		%xmm1
89#define v2		%xmm2
90#define v3		%xmm3
91#define v4		%xmm4
92#define v5		%xmm5
93#define v6		%xmm6
94#define v7		%xmm7
95#define v8		%xmm8
96#define v9		%xmm9
97#define v10		%xmm10
98#define v11		%xmm11
99#define v12		%xmm12
100#define v13		%xmm13
101#define v14		%xmm14
102#define v15		%xmm15
103
104	/* push callee-saved registers and set up base pointer */
105	push	%rbp
106	movq	%rsp, %rbp
107
108	mov	$0, partial	// partial = 0;
109	mov	$0, need_swap	// needs_swap = 0;
110
111	cmp	$0, len
112	je	L_len_0
113
114/*
115 * Deal with odd-addressed byte, use w7 to store temporary sum, deposit this
116 * byte to high byte of 16-bit in w7
117 *
118 *	t = 0;
119 *	if ((uintptr_t)src & 1) {
120 *		t = *src << 8;
121 *		*dst++ = *src++;
122 *		--len;
123 *	}
124 */
125	test	$1, src
126	je	1f
127
128	movzb	(src), partial
129	add	$1, src
130	movb	partialb, (dst)
131	add	$1, dst
132#if BYTE_ORDER == LITTLE_ENDIAN
133	shl	$8, partial
134#endif
135	mov	$1, need_swap
136	sub	$1, len
137	jz	L_len_0
1381:
139
140#ifdef KERNEL
141	/* allocate stack space and save xmm0-xmm15 */
142	sub	$16*16, %rsp
143	movdqa	v0, 0*16(%rsp)
144	movdqa	v1, 1*16(%rsp)
145	movdqa	v2, 2*16(%rsp)
146	movdqa	v3, 3*16(%rsp)
147	movdqa	v4, 4*16(%rsp)
148	movdqa	v5, 5*16(%rsp)
149	movdqa	v6, 6*16(%rsp)
150	movdqa	v7, 7*16(%rsp)
151	movdqa	v8, 8*16(%rsp)
152	movdqa	v9, 9*16(%rsp)
153	movdqa	v10, 10*16(%rsp)
154	movdqa	v11, 11*16(%rsp)
155	movdqa	v12, 12*16(%rsp)
156	movdqa	v13, 13*16(%rsp)
157	movdqa	v14, 14*16(%rsp)
158	movdqa	v15, 15*16(%rsp)
159#endif
160
161	/*
162	 * pre-decrement len by 8*16, and if less tha 8*16 bytes,
163	 * try 4*16 bytes next
164	 * v0,v1 will store temp result after we exit the L128 loop
165	 */
166	pxor	v0, v0
167	pxor	v1, v1
168	cmp	$(8*16), len
169	movq	partial, v0	// move partial to 1st 64b lane in v0
170	jl	L64_bytes
171
172	/*
173	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
174	 * load 1st 4 vectors, and clear v0-v3
175	 */
176	pxor	v2, v2
177	pxor	v3, v3
178	movups	0*16(src), v4
179	movups	1*16(src), v5
180	movups	2*16(src), v6
181	movups	3*16(src), v7
182	movups	4*16(src), v8
183	movups	5*16(src), v9
184	movups	6*16(src), v10
185	movups	7*16(src), v11
186	add	$8*16, src
187
188	/* branch to finish off if len<128 */
189	sub	$2*8*16, len
190	jl	L128_finishup
191
192	/*
193	 * loop for loading and accumulating 16 32-bit words into
194	 * 8 8-byte accumulators per iteration
195	 */
196L128_loop:
197	/*
198	 * store v4-v7 to dst[0:3]
199	 * copy v4-v7 to v12-v15
200	 * extract w3:w1 in v4-v7
201	 */
202	movups	v4, 0*16(dst)
203	movdqa	v4, v12
204	psrlq	$32, v4
205
206	movups	v5, 1*16(dst)
207	movdqa	v5, v13
208	psrlq	$32, v5
209
210	movups	v6, 2*16(dst)
211	movdqa	v6, v14
212	psrlq	$32, v6
213
214	movups	v7, 3*16(dst)
215	movdqa	v7, v15
216	psrlq	$32, v7
217
218	/*
219	 * store v8-v11 to dst[4:7]
220	 * extract w2:w0 in v12-v15
221	 * accumulate w3:w1 in v4-v7 to v0-v3
222	 */
223	movups	v8, 4*16(dst)
224	pand	Lmask, v12
225	paddq	v4, v0
226
227	movups	v9, 5*16(dst)
228	pand	Lmask, v13
229	paddq	v5, v1
230
231	movups	v10, 6*16(dst)
232	pand	Lmask, v14
233	paddq	v6, v2
234
235	movups	v11, 7*16(dst)
236	pand	Lmask, v15
237	paddq	v7, v3
238
239	add	$8*16, dst	// advance dst for next iteration
240
241	/*
242	 * accumulate w2:w0 in v12-v15 to v0-v3
243	 * copy v8-v11 to v12-v15
244	 * extract w3:w1 in v8-v11
245	 */
246	paddq	v12, v0
247	movdqa	v8, v12
248	psrlq	$32, v8
249
250	paddq	v13, v1
251	movdqa	v9, v13
252	psrlq	$32, v9
253
254	paddq	v14, v2
255	movdqa	v10, v14
256	psrlq	$32, v10
257
258	paddq	v15, v3
259	movdqa	v11, v15
260	psrlq	$32, v11
261
262	/*
263	 * load src[0:3] to v4-v7
264	 * accumulate w3:w1 in v8-v11 to v0-v3
265	 * extract w2:w0 in v12-v15
266	 */
267	movups	0*16(src), v4
268	paddq	v8, v0
269	pand	Lmask, v12
270
271	movups	1*16(src), v5
272	paddq	v9, v1
273	pand	Lmask, v13
274
275	movups	2*16(src), v6
276	paddq	v10, v2
277	pand	Lmask, v14
278
279	movups	3*16(src), v7
280	paddq	v11, v3
281	pand	Lmask, v15
282
283	/*
284	 * load src[4:7] to v8-v11
285	 * accumulate w2:w0 in v12-v15 to v0-v3
286	 */
287	movups	4*16(src), v8
288	paddq	v12, v0
289
290	movups	5*16(src), v9
291	paddq	v13, v1
292
293	movups	6*16(src), v10
294	paddq	v14, v2
295
296	movups	7*16(src), v11
297	paddq	v15, v3
298
299	add	$8*16, src	// advance src for next iteration
300
301	sub	$8*16, len
302	jge	L128_loop
303
304L128_finishup:
305	movups	v4, 0*16(dst)
306	movdqa	v4, v12
307	psrlq	$32, v4
308
309	movups	v5, 1*16(dst)
310	movdqa	v5, v13
311	psrlq	$32, v5
312
313	movups	v6, 2*16(dst)
314	movdqa	v6, v14
315	psrlq	$32, v6
316
317	movups	v7, 3*16(dst)
318	movdqa	v7, v15
319	psrlq	$32, v7
320
321	pand	Lmask, v12
322	paddq	v4, v0
323	movups	v8, 4*16(dst)
324
325	pand	Lmask, v13
326	paddq	v5, v1
327	movups	v9, 5*16(dst)
328
329	pand	Lmask, v14
330	paddq	v6, v2
331	movups	v10, 6*16(dst)
332
333	pand	Lmask, v15
334	paddq	v7, v3
335	movups	v11, 7*16(dst)
336
337	add	$8*16, dst
338
339	paddq	v12, v0
340	movdqa	v8, v12
341	psrlq	$32, v8
342
343	paddq	v13, v1
344	movdqa	v9, v13
345	psrlq	$32, v9
346
347	paddq	v14, v2
348	movdqa	v10, v14
349	psrlq	$32, v10
350
351	paddq	v15, v3
352	movdqa	v11, v15
353	psrlq	$32, v11
354
355	paddq	v8, v0
356	pand	Lmask, v12
357
358	paddq	v9, v1
359	pand	Lmask, v13
360
361	paddq	v10, v2
362	pand	Lmask, v14
363
364	paddq	v11, v3
365	pand	Lmask, v15
366
367	paddq	v12, v0
368	paddq	v13, v1
369	paddq	v14, v2
370	paddq	v15, v3
371
372	add	$8*16, len
373
374	/* absorb v2-v3 into v0-v1 */
375	paddq	v2, v0
376	paddq	v3, v1
377
378L64_bytes:
379	cmp	$4*16, len
380	jl	L32_bytes
381
382	movups	0*16(src), v4
383	movups	1*16(src), v5
384	movups	2*16(src), v6
385	movups	3*16(src), v7
386	add	$4*16, src
387
388	movups	v4, 0*16(dst)
389	movups	v5, 1*16(dst)
390	movups	v6, 2*16(dst)
391	movups	v7, 3*16(dst)
392	add	$4*16, dst
393
394	movdqa	v4, v12
395	psrlq	$32, v4
396	movdqa	v5, v13
397	psrlq	$32, v5
398	movdqa	v6, v14
399	psrlq	$32, v6
400	movdqa	v7, v15
401	psrlq	$32, v7
402
403	pand	Lmask, v12
404	paddq	v4, v0
405	pand	Lmask, v13
406	paddq	v5, v1
407	pand	Lmask, v14
408	paddq	v6, v0
409	pand	Lmask, v15
410	paddq	v7, v1
411
412	paddq	v12, v0
413	paddq	v13, v1
414	paddq	v14, v0
415	paddq	v15, v1
416
417	sub	$4*16, len
418
419L32_bytes:
420	cmp	$2*16, len
421	jl	L16_bytes
422	movups	0*16(src), v4
423	movups	1*16(src), v5
424	add	$2*16, src
425
426	movups	v4, 0*16(dst)
427	movups	v5, 1*16(dst)
428	add	$2*16, dst
429
430	movdqa	v4, v12
431	movdqa	v5, v13
432	psrlq	$32, v4
433	psrlq	$32, v5
434	pand	Lmask, v12
435	pand	Lmask, v13
436	paddq	v4, v0
437	paddq	v5, v1
438	paddq	v12, v0
439	paddq	v13, v1
440
441	sub	$2*16, len
442
443L16_bytes:
444	paddq	v1, v0
445
446	cmp	$16, len
447	jl	L8_bytes
448
449	movups	0*16(src), v4
450	add	$1*16, src
451
452	movups	v4, 0*16(dst)
453	add	$1*16, dst
454
455	movdqa	v4, v12
456	psrlq	$32, v4
457	pand	Lmask, v12
458	paddq	v4, v0
459	paddq	v12, v0
460
461	sub	$16, len
462
463L8_bytes:
464	movq	v0, partial
465	psrldq	$8, v0
466	movq	v0, t
467	add	t, partial
468
469#ifdef KERNEL
470	// restore xmm0-xmm15 and deallocate stack space
471	movdqa	0*16(%rsp), v0
472	movdqa	1*16(%rsp), v1
473	movdqa	2*16(%rsp), v2
474	movdqa	3*16(%rsp), v3
475	movdqa	4*16(%rsp), v4
476	movdqa	5*16(%rsp), v5
477	movdqa	6*16(%rsp), v6
478	movdqa	7*16(%rsp), v7
479	movdqa	8*16(%rsp), v8
480	movdqa	9*16(%rsp), v9
481	movdqa	10*16(%rsp), v10
482	movdqa	11*16(%rsp), v11
483	movdqa	12*16(%rsp), v12
484	movdqa	13*16(%rsp), v13
485	movdqa	14*16(%rsp), v14
486	movdqa	15*16(%rsp), v15
487	add	$16*16, %rsp
488#endif
489
490	sub	$4, len
491	jl	L2_bytes
4920:
493	movl	(src), td
494	add	t, partial
495	mov	td, (dst)
496	add	$4, src
497	add	$4, dst
498	sub	$4, len
499	jge	0b
500
501
502L2_bytes:
503	test	$2, len
504	je	L_trailing_bytes
505
506	movzwl	(src), td
507	add	t, partial
508	mov	tw, (dst)
509	add	$2, src
510	add	$2, dst
511
512L_trailing_bytes:
513	test	$1, len
514	je	L0_bytes
515	movzbl	(src), td
516	mov	tb, (dst)
517#if BYTE_ORDER != LITTLE_ENDIAN
518	shl	$8, t	// partial <<= 8;
519#endif
520	add	t, partial
521
522L0_bytes:
523	/* partial = (partial >> 32) + (partial & 0xffffffff); */
524	mov	partiald, %eax
525	shr	$32, partial
526	add	%rax, partial
527
528	/* partial = (partial >> 16) + (partial & 0xffff); */
529	movzwl	partialw, %eax
530	shr	$16, partial
531	add	%rax, partial
532
533L_len_0:
534	/*
535	 * if (needs_swap)
536	 *	partial = (partial << 8) + (partial >> 24);
537	 */
538	cmp	$0, need_swap
539	je	1f
540	mov	partial, %rax
541	shl	$8, %rax
542	shr	$24, partial
543	add	%rax, partial
5441:
545
546	/* final_acc = (initial_sum >> 16) + (initial_sum & 0xffff); */
547	movzwl	%cx, %eax
548	shr	$16, %ecx
549	add	%ecx, %eax
550
551	/* final_acc += (partial >> 16) + (partial & 0xffff); */
552	movzwl	partialw, %ecx
553	shr	$16, partial
554	add	%ecx, %eax
555	add	partiald, %eax
556
557	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
558	movzwl	%ax, %ecx
559	shr	$16, %eax
560	add	%ecx, %eax
561
562	/*
563	 * One final fold in case of carry from the previous one.
564	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
565	 */
566	movzwl	%ax, %ecx
567	shr	$16, %eax
568	add	%ecx, %eax
569
570	/*
571	 * return (~final_acc & 0xffff);
572	 *
573	 * not      %eax
574	 * movzwl   %ax, %eax
575	 */
576
577	/* restore callee-saved registers */
578	pop	%rbp
579	ret
580