xref: /xnu-8792.41.9/osfmk/arm64/strncmp.s (revision 5c2921b07a2480ab43ec66f5b9e41cb872bc554f)
1/*
2 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 *
28 * This file implements the following function for the arm64 architecture:
29 *
30 *  int strncmp(const char *s1, const char *s2, size_t n);
31 *
32 * Returns 0 if the two strings are equal up to the first n bytes or to the
33 * end of the string, whichever comes first.  Otherwise, returns the difference
34 * of the first mismatched characters interpreted as uint8_t.
35 */
36
37#include <arm64/asm.h>
38
39.globl _strncmp
40
41/*****************************************************************************
42 *  Macros                                                                   *
43 *****************************************************************************/
44
//	EstablishFrame: open a frame record on entry to a routine.
//	Runs ARM64_STACK_PROLOG (supplied by <arm64/asm.h>; presumably the
//	platform return-address protection hook -- confirm in that header), then
//	pushes the x29/x30 (fp/lr) pair and points the frame pointer at it.
.macro EstablishFrame
	ARM64_STACK_PROLOG
	stp       x29, x30, [sp, #-16]!   // sp -= 16, then store fp and lr there
	mov       x29,     sp             // fp = address of the saved pair
.endm
50
//	ClearFrameAndReturn: undo EstablishFrame and leave the routine.
//	Reloads the saved x29/x30 (fp/lr) pair and releases its 16 bytes of stack,
//	then hands off to ARM64_STACK_EPILOG (supplied by <arm64/asm.h>).  No
//	explicit return instruction follows in this macro, so the epilogue hook is
//	presumably what performs the return -- confirm in that header.
.macro ClearFrameAndReturn
	ldp       x29, x30, [sp], #16     // reload fp and lr, then sp += 16
	ARM64_STACK_EPILOG
.endm
55
56#include "../mach/arm/vm_param.h"
/* Bytes consumed by each q-register load in the vector loops. */
#define kVectorSize 16

/*****************************************************************************
 *  Constants                                                                *
 *****************************************************************************/

.text
.p2align 5
//	Lane-index table: byte i holds the value i.  It is loaded as a single
//	q-register (ldr q1, L_mask) when locating the first mismatch or EOS lane.
L_mask:
.byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
.byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
67
68/*****************************************************************************
69 *  Entrypoints                                                              *
70 *****************************************************************************/
71
/*
 *  int strncmp(const char *s1, const char *s2, size_t n)
 *
 *  In:   x0 = s1, x1 = s2, x2 = n
 *  Out:  x0 = 0 if no mismatch is found within n bytes or before the end of
 *        the string; otherwise the difference (s1 byte - s2 byte) of the
 *        first mismatching pair, each byte zero-extended.
 *
 *  Register roles inside the routine:
 *    x0, x1   s1 / s2 cursors; every load post-increments them
 *    x2       bytes that may still be compared (left untouched while a
 *             careful chunk is running; x6 carries the value to restore)
 *    x3       scalar paths: difference of the last byte pair loaded
 *             vector paths: lane reduction, then index of the stopping lane
 *    x4, x5   last byte loaded from s1 / s2
 *    x6       value of n once the current careful chunk has been consumed
 *    x7       bytes left in the current careful chunk
 *    v0, v1   vector scratch
 */
_strncmp:
	EstablishFrame
	mov       x3,      #0      // result if n == 0
	cbz       x2,      L_scalarDone
//	Compare one byte at a time until s1 has vector alignment.
L_alignS1:
	tst       x0,      #(kVectorSize-1)
	b.eq      L_s1aligned
	ldrb      w4,     [x0],#1  // load byte from s1
	ldrb      w5,     [x1],#1  // load byte from s2
	subs      x3,      x4, x5  // if they are not equal (flags: ne)
	ccmp      w4,  #0, #4, eq  //    force Z; if equal, Z = (byte is EOS)
	b.eq      L_scalarDone     // mismatch or EOS: return the difference
	subs      x2,      x2, #1  // decrement length
	b.ne      L_alignS1        // continue loop if non-zero

//	Scalar exit.  x3 holds the difference between the last bytes that we
//	loaded; it is zero if n was zero on entry, if n ran out while every byte
//	matched, or if both strings ended together.
L_scalarDone:
	mov       x0,      x3
	ClearFrameAndReturn

L_s1aligned:
//	If s2 is similarly aligned to s1, then we can use a naive vector comparison
//	from this point on without worrying about spurious page faults; none of our
//	loads will ever cross a page boundary, because they are all aligned.
	tst       x1,      #(kVectorSize-1)
	b.eq      L_naiveVector

/*****************************************************************************
 *  Careful chunk comparison                                                 *
 *****************************************************************************/

//	Otherwise, we need to be careful; although vector loads from s1 cannot
//	cross a page boundary because they are aligned, s2 is not aligned.  We
//	compute the multiple of vector size that we can safely load before reaching
//	a page boundary, and compare only that far before switching over to scalar
//	comparisons to step across the page boundary.  If this number happens to
//	be zero, we jump directly to the scalar comparison.
	neg       x7,      x1
	ands      x7,      x7, #(PAGE_MIN_SIZE-kVectorSize)
	b.eq      L_crossPage

.align 4
//	If n is less than or equal to the number of bytes before a page-crossing
//	load, jump into the naive vector path instead, since we will not even reach
//	a page crossing.  Otherwise, compute n minus that number before we monkey
//	with it, and set the decremented value aside in x6.
L_carefulChunk:
	cmp       x2,      x7
	b.ls      L_naiveVector
	sub       x6,      x2, x7
//	Use vector comparisons until a mismatch or EOS is encountered, or the next
//	vector load from s2 would be page-crossing.  x2 is not updated in this
//	loop; it stays above x7, and x7 is a non-zero multiple of kVectorSize here
//	(assuming PAGE_MIN_SIZE is a power of two), so a lane index found in this
//	loop is always below x2 when L_vectorDone checks it.
L_carefulVector:
	ldr       q0,     [x0],#(kVectorSize)
	ldr       q1,     [x1],#(kVectorSize)
	cmeq.16b  v1,      v0, v1   // 0xff in lanes where s1 == s2
	and.16b   v0,      v0, v1   // contains zero byte iff mismatch or EOS
	uminv.16b b1,      v0       // smallest lane value
	fmov      w3,      s1       // zero iff comparison is finished
	cbz       w3,      L_vectorDone
	subs      x7,      x7, #(kVectorSize)
	b.ne      L_carefulVector
//	Restore the updated n to x2
	mov       x2,      x6
//	The next vector load will cross a page boundary.  Instead, compare one byte
//	at a time until s1 again has vector alignment, at which point we will have
//	compared exactly 16 bytes.
L_crossPage:
	ldrb      w4,     [x0],#1  // load byte from s1
	ldrb      w5,     [x1],#1  // load byte from s2
	subs      x3,      x4, x5  // if they are not equal (flags: ne)
	ccmp      w4,  #0, #4, eq  //    force Z; if equal, Z = (byte is EOS)
	b.eq      L_scalarDone     // mismatch or EOS: return the difference
	subs      x2,      x2, #1  // decrement length
	b.eq      L_scalarDone     // n exhausted; x3 is zero here
	tst       x0,      #(kVectorSize-1)
	b.ne      L_crossPage
//	Having compared one vector's worth of bytes using a scalar comparison, we
//	know that we are safely across the page boundary.  Initialize x7 and jump
//	back into the vector comparison part of the loop.
	mov       x7,      #(PAGE_MIN_SIZE-kVectorSize)
	b         L_carefulChunk

/*****************************************************************************
 *  Naive vector comparison                                                  *
 *****************************************************************************/

.align 4
L_naiveVector:
	ldr       q0,     [x0],#(kVectorSize)
	ldr       q1,     [x1],#(kVectorSize)
	cmeq.16b  v1,      v0, v1   // 0xff in lanes where s1 == s2
	and.16b   v0,      v0, v1   // contains zero byte iff mismatch or EOS
	uminv.16b b1,      v0       // smallest lane value
	fmov      w3,      s1       // zero iff comparison is finished
	cbz       w3,      L_vectorDone
	subs      x2,      x2, #(kVectorSize)
	b.hi      L_naiveVector     // loop while more than one vector remained

//	All n bytes were compared without finding a mismatch or EOS.
L_readNBytes:
	mov       x0,      #0
	ClearFrameAndReturn

L_vectorDone:
//	Load the bytes corresponding to the first mismatch or EOS and return
//	their difference.
	cmtst.16b v0,      v0, v0   // force non-zero lanes to 0xff, keep zero lanes
	ldr       q1,      L_mask
	orr.16b   v0,      v0, v1   // lane index in lanes containing mismatch or EOS
	uminv.16b b1,      v0       // lowest such lane index
	fmov      w3,      s1
//	If the index of the mismatch or EOS is greater than or equal to n, it
//	occurs after the first n bytes of the string, and doesn't count.
	cmp       x3,      x2
	b.cs      L_readNBytes
//	x0 and x1 already point one vector past the block just compared, so bias
//	the lane index by -kVectorSize to address the stopping byte pair.
	sub       x3,      x3, #(kVectorSize)
	ldrb      w4,     [x0, x3]
	ldrb      w5,     [x1, x3]
	sub       x0,      x4, x5
	ClearFrameAndReturn
191