xref: /xnu-10002.81.5/osfmk/arm64/lz4_decode_arm64.s (revision 5e3eaea39dcf651e66cb99ba7d70e32cc4a99587) !
1/*
2 * Copyright (c) 2017 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28#include <vm/lz4_assembly_select.h>
29#include <arm64/asm.h>
30#if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64
31
32/*
33
34  int64_t lz4_decode_asm(
35    uint8_t ** dst_ptr,                     *dst_ptr points to next output byte to write
36    uint8_t * dst_begin,                    points to first valid output byte we can access, dst_begin <= dst
37    uint8_t * dst_end,                      "relaxed" end of output buffer (see below)
38    const uint8_t ** src_ptr,               *src_ptr points to next input byte to read
39    const uint8_t * src_end)                "relaxed" end of input buffer (see below)
40
41  We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant.
42  We never read before dst_begin.
43
44  Return 0 on success, -1 on failure
45  On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction.
46
47*/
48
49.globl _lz4_decode_asm
50
51#define dst                x0   // arg0
52#define dst_begin          x1   // arg1
53#define dst_end            x2   // arg2
54#define src                x3   // arg3
55#define src_end            x4   // arg4
56
57#define w_n_matches        w5   // lower 32 bits of n_matches
58#define n_matches          x5
59#define n_literals         x6
60#define copy_src           x7   // match/literal copy source
61#define copy_dst           x8   // match/literal copy destination
62
63#define w_aux1             w9   // lower 32 bits of aux1
64#define aux1               x9
65#define aux2              x10
66
67#define w_match_distance  w11   // lower 32 bits of match_distance
68#define match_distance    x11
69
70#define match_permtable   x12
71#define match_disttable   x13
72
73#define dst_good          x19
74#define src_good          x20
75
76.macro establish_frame
77    ARM64_STACK_PROLOG
78    stp     fp, lr,    [sp, #-16]!
79    mov     fp, sp
80.endm
81
82.macro clear_frame_and_return
83    ldp     fp, lr,    [sp], #16
84    ARM64_STACK_EPILOG
85.endm
86
87// copy_1x16 SOURCE_ADDR DESTINATION_ADDR
88// Copy 16 bytes, clobber: q0
89.macro copy_1x16
90    ldr     q0,[$0]
91    str     q0,[$1]
92.endm
93
94// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR
95// Copy 16 bytes, and increment both addresses by 16, clobber: q0
96.macro copy_1x16_and_increment
97    ldr     q0,[$0],#16
98    str     q0,[$1],#16
99.endm
100
101// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR
102// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0
103.macro copy_2x16_and_increment
104    ldr     q0,[$0],#16
105    str     q0,[$1],#16
106    ldr     q0,[$0],#16
107    str     q0,[$1],#16
108.endm
109
110// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR
111// Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1
112.macro copy_1x32_and_increment
113    ldp     q0,q1,[$0],#32
114    stp     q0,q1,[$1],#32
115.endm
116
117// If we don't branch, src < src_end after this
118.macro check_src_end
119    cmp     src,src_end
120    b.hs    L_done                            // extremely unlikely, DONE when src >= src_end
121.endm
122
123// If we don't branch, dst < dst_end after this
124.macro check_dst_end
125    cmp     dst,dst_end
126    b.hs    L_done                            // extremely unlikely, DONE when dst >= dst_end
127.endm
128
129.text
130.p2align 4
131_lz4_decode_asm:
132    establish_frame
133    stp     x19,x20,[sp,#-16]!                // need to preserve these
134    stp     src,dst,[sp,#-16]!                // save src_ptr,dst_ptr on stack
135    ldr     src,[src]                         // src = *src_ptr
136    ldr     dst,[dst]                         // dst = *dst_ptr
137    adr     match_permtable,L_match_permtable
138    adr     match_disttable,L_match_disttable
139
140L_decode_command:
141    // Keep last known good positions in both streams
142    mov     dst_good,dst
143    mov     src_good,src
144
145    // Check limits
146    check_src_end
147    check_dst_end
148
149    // Decode 1-byte command
150    ldrb    w_aux1,[src],#1                   // read command byte LLLLMMMM
151    lsr     n_literals,aux1,#4                // 0000LLLL. n_literals is now 0..15
152    and     n_matches,aux1,#0xf               // 0000MMMM. n_matches is now 0..15
153    add     n_matches,n_matches,#4            // n_matches is now 4..19
154
155    // Test number of literals (do not test if n_literals==0, because branch prediction fails on it)
156    cmp     n_literals,#14
157    b.ls    L_copy_short_literal              // 96% likely: n_literals in 0..14
158    // continue to decode_long_literal
159
160    // the number of literals is encoded on more bytes, we need to decode them
161L_decode_long_literal:
162    check_src_end                             // required here, since we may loop an arbitrarily high number of times
163    ldrb    w_aux1,[src],#1
164    add     n_literals,n_literals,aux1
165    cmp     aux1,#255
166    b.eq    L_decode_long_literal             // extremely unlikely
167    // continue to copy_long_literal
168
169    // Copy literals, n_literals >= 15
170L_copy_long_literal:
171    mov     copy_src,src                      // literal copy origin
172    mov     copy_dst,dst                      // literal copy destination
173    add     src,src,n_literals
174    add     dst,dst,n_literals
175    check_src_end                             // required here, since n_literals can be arbitrarily high
176    check_dst_end
177
178    // fixed + loop
179    copy_1x32_and_increment copy_src,copy_dst
180L_copy_long_literal_loop:
181    copy_1x32_and_increment copy_src,copy_dst
182    cmp     dst,copy_dst
183    b.hi    L_copy_long_literal_loop          // first test occurs after 64 bytes have been copied, and is unlikely to loop back
184    b       L_expand_match
185
186    // Copy literals, n_literals <= 14: copy 16 bytes
187L_copy_short_literal:
188    copy_1x16 src,dst
189    add     src,src,n_literals
190    add     dst,dst,n_literals
191    // continue to expand match
192
193L_expand_match:
194
195    // Decode match distance
196    ldrh    w_match_distance,[src],#2         // 16-bit distance
197    cbz     match_distance,L_fail             // distance == 0 is invalid
198    sub     copy_src,dst,match_distance       // copy_src is the match copy source
199    cmp     copy_src,dst_begin
200    b.lo    L_fail                            // copy_src < dst_begin: FAIL
201    mov     copy_dst,dst                      // copy_dst is the match copy destination
202    add     dst,dst,n_matches                 // dst is updated to be the byte after the match; n_matches <= 19 here
203
204    // Do we need to decode a long match?
205    cmp     n_matches,#19
206    b.eq    L_decode_long_match               // unlikely, n_matches >= 19 encoded on more bytes
207    cmp     n_matches,#16
208    b.hi    L_long_match                      // unlikely, n_matches == 17 or 18
209    // continue to short match (most probable case)
210
211    // Copy match, n_matches <= 16
212L_short_match:
213    cmp     match_distance,#15
214    b.ls    L_copy_short_match_small_distance
215
216    // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes
217    copy_1x16 copy_src,copy_dst
218    b       L_decode_command
219
220    // Copy match, n_matches <= 16, match_distance < 16:
221    // load shuffle table, and permute to replicate the pattern on 16 bytes
222L_copy_short_match_small_distance:
223    ldr     q0,[copy_src]
224    add     aux1,match_permtable,match_distance,lsl #5   // index in table
225    ldr     q1,[aux1]                         // load only permutation for the low 16 bytes
226    tbl     v0.16b,{v0.16b},v1.16b            // low 16 bytes of pattern
227    str     q0,[copy_dst]
228    b       L_decode_command
229
230    // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them
231L_decode_long_match:
232    check_src_end                             // required here, since we may loop an arbitrarily high number of times
233    ldrb    w_aux1,[src],#1
234    add     dst,dst,aux1
235    cmp     aux1,#255
236    b.eq    L_decode_long_match               // very unlikely
237    check_dst_end                             // required here, since dst was incremented by a arbitrarily high value
238    // continue to long_match
239
240    // n_matches > 16
241L_long_match:
242    cmp     match_distance,#31
243    b.hi    L_copy_long_match_32
244    cmp     match_distance,#15
245    b.hi    L_copy_long_match_16
246
247    // Copy match, n_matches >= 16, match_distance < 16:
248    // load shuffle table, and permute to replicate the pattern on 32 bytes
249L_copy_long_match_small_distance:
250    ldr     q1,[copy_src]                     // 16 pattern bytes
251    add     aux1,match_permtable,match_distance,lsl #5   // index in table
252    ldp     q2,q3,[aux1]                      // load 32-byte permutation
253    tbl     v0.16b,{v1.16b},v2.16b            // low 16 bytes of pattern in q0
254    tbl     v1.16b,{v1.16b},v3.16b            // high 16 bytes of pattern in q1
255    ldrb    w_aux1,[match_disttable,match_distance]  // valid pattern length in aux1
256    // fixed
257    stp     q0,q1,[copy_dst]
258    add     copy_dst,copy_dst,aux1
259L_copy_long_match_small_distance_loop:
260    // loop
261    stp     q0,q1,[copy_dst]
262    add     copy_dst,copy_dst,aux1
263    stp     q0,q1,[copy_dst]
264    add     copy_dst,copy_dst,aux1
265    cmp     dst,copy_dst
266    b.hi    L_copy_long_match_small_distance_loop
267    b       L_decode_command
268
269    // Copy match, n_matches >= 16, match_distance >= 32
270L_copy_long_match_32:
271    // fixed + loop
272    copy_1x16_and_increment copy_src,copy_dst
273L_copy_long_match_32_loop:
274    copy_1x32_and_increment copy_src,copy_dst
275    cmp     dst,copy_dst
276    b.hi    L_copy_long_match_32_loop
277    b       L_decode_command
278
279    // Copy match, n_matches >= 16, match_distance >= 16
280L_copy_long_match_16:
281    // fixed + loop
282    copy_1x16_and_increment copy_src,copy_dst
283L_copy_long_match_16_loop:
284    copy_2x16_and_increment copy_src,copy_dst
285    cmp     dst,copy_dst
286    b.hi    L_copy_long_match_16_loop
287    b       L_decode_command
288
289L_fail:
290    mov     aux1,#-1                          // FAIL
291    b       L_exit
292
293L_done:
294    mov     aux1,#0                           // OK
295    // continue to L_exit
296
297L_exit:
298    ldp     src,dst,[sp],#16                  // get back src_ptr,dst_ptr from stack
299    str     src_good,[src]                    // *src_ptr = src_good
300    str     dst_good,[dst]                    // *dst_ptr = dst_good
301    mov     x0,aux1                           // x0 = return value
302    ldp     x19,x20,[sp],#16                  // restore
303    clear_frame_and_return
304
305// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15
306// value(d)[i] = i%d for i = 0..31
307.p2align 6
308L_match_permtable:
309.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 0
310.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  // 1
311.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,    0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1  // 2
312.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,    1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1  // 3
313.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,    0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3  // 4
314.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,    1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1  // 5
315.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,    4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1  // 6
316.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,    2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3  // 7
317.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,    0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7  // 8
318.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,    7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4  // 9
319.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,    6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1  // 10
320.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4,    5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9  // 11
321.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3,    4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7  // 12
322.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2,    3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5  // 13
323.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1,    2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3  // 14
324.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0,    1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1  // 15
325
326// valid repeating pattern size, for each match_distance = 0 to 15
327// value(d) = 32 - (32%d), is the largest a multiple of d <= 32
328.p2align 6
329L_match_disttable:
330.byte 32,32,32,30  //  0 ..  3
331.byte 16,30,30,28  //  4 ..  7
332.byte 16,27,30,22  //  8 .. 11
333.byte 24,26,28,30  // 12 .. 15
334
335#endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARM64
336