xref: /xnu-8020.121.3/osfmk/arm/lz4_encode_armv7.s (revision fdd8201d7b966f0c3ea610489d29bd841d358941)
1/*
2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29#include <vm/lz4_assembly_select.h>
30#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7
31
32/* void lz4_encode_2gb(uint8_t ** dst_ptr,
33                       size_t dst_size,
34                       const uint8_t ** src_ptr,
35                       const uint8_t * src_begin,
36                       size_t src_size,
37                       lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
38                       int skip_final_literals)                               */
39
40.globl _lz4_encode_2gb
41.syntax unified
42
43#define dst_ptr r0
44#define dst_end r1
45#define src_ptr r2
46#define src_beg r3
47#define src_end r4
48#define table   r5
49#define mch_ptr r6
50#define mch_dis r8
51#define mch_len r9
52#define mch_ref r10
53
54#define margin  128
55
56.macro establish_frame
57  push   {r4-r7, lr}
58  add     r7,       sp, #12
59  push   {r8-r11}
60  ldrd    r4, r5,  [sp, #36]
61  push   {r0, r2}
62  ldr     dst_ptr, [r0]
63  ldr     src_ptr, [r2]
64  subs    r1,       r1,  margin // subtract safety margin from dst_size
65  bls     L_done
66  add     dst_end,  dst_ptr, r1 // dst end - margin
67  sub     r4,       r4,  margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop).
68  add     src_end,  src_ptr, r4 // src end - margin.
69  vmov.i8 q1,       #255        // vector of all 1s used to emit
70.endm
71
72.macro clear_frame_and_return
73  pop    {r1, r3}
74  str     dst_ptr, [r1]
75  str     src_ptr, [r3]
76  pop    {r8-r11}
77  pop    {r4-r7, pc}
78.endm
79
80.p2align 4
81_lz4_encode_2gb:
82  establish_frame
83L_next_match:
84  //  Start searching for the next match, starting at the end of the last one.
85  //  [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr
86  //  within the search loop itself].  Load the hash magic number in lr, and
87  //  zero out mch_len (when we find a match, its length will initially be
88  //  four, but we actually work with the match length minus four at all times).
89  ldr     lr,       L_hash
90  sub     mch_ptr,  src_ptr, #1
91
92L_match_candidate_loop:
93  //  If mch_ptr >= src_end, we are near the end of the source buffer (remember
94  //  that we subtracted margin from src_end, so we are *not* actually past the
95  //  end just yet).
96  cmp     mch_ptr,  src_end
97  bhs     L_trailing_literal
98
99  //  Load the four-byte word starting at mch_ptr, and get the address of the
100  //  corresponding row of the hash table.
101  ldr     r9,      [mch_ptr, #1]!
102  sub     r8,       mch_ptr, src_beg
103  mul     r12,      r9, lr
104  mvn     r10,      #0x7
105  and     r12,      r10, r12, lsr #17 // byte offset of table entry.
106
107  //  Load offset and word from hash table row, then update with the new offset
108  //  and word that we just computed.
109  ldrd    r10,r11, [table, r12]
110  strd    r8, r9,  [table, r12]
111
112  //  At this point, we only know that the hashes of the words match; check to
113  //  see if the words themselves match.  If not, move on to the next candidate.
114  cmp     r9,       r11
115  bne     L_match_candidate_loop
116
117  //  It's not enough for the words to match; the match distance must also be
118  //  in the representable range (i.e. less than 0x10000).
119  sub     mch_dis,  r8, r10
120  add     mch_ref,  src_beg, r10
121  cmp     mch_dis,  #0x10000
122  bhs     L_match_candidate_loop
123
124  //  We have found a match; registers at this point are as follows:
125  //
126  //   register   symbolic name   meaning
127  //      r0         dst_ptr      pointer into destination buffer where the
128  //                              match information will be stored.
129  //      r1         dst_end      pointer to the end of the destination buffer,
130  //                              less margin.
131  //      r2         src_ptr      pointer to the byte after the last match that
132  //                              we found, or to the point from which we
133  //                              started searching if this is the first match.
134  //      r3         src_beg      pointer to the actual start of the buffer.
135  //      r4         src_end      pointer to the end of the source buffer, less
136  //                              margin.
137  //      r5         table        address of hash table.
138  //      r6         mch_ptr      pointer to match.
139  //      r8         mch_dis      match distance ("D")
140  //      r9         mch_len      length of match less four ("M")
141  //      r10        mch_ref      pointer to match reference.
142  //      r11        -
143  //      r12        -
144  //      lr         -
145  //
146  //  Attempt to grow the match backwards (typically we only grow backwards by
147  //  a byte or two, if at all, so we use a byte-by-byte scan).
148  eor     mch_len,  mch_len
1490:cmp     mch_ref,  src_beg
150  cmpne   mch_ptr,  src_ptr
151  beq     1f
152  ldrb    r11,     [mch_ref, #-1]
153  ldrb    r12,     [mch_ptr, #-1]
154  cmp     r11,      r12
155  bne     1f
156  sub     mch_ref,  #1
157  sub     mch_ptr,  #1
158  add     mch_len,  #1
159  b       0b
160
161  //  Now that we have the start of the match, we can compute the literal
162  //  length.  Then advance the mch and ref pointers to the end of the match
163  //  and its reference.  Because mch_len is the real match length minus four,
164  //  we actually advance to four before the end of the match, but our loop
165  //  to grow the matches uses pre-incremented loads with writeback, so this
166  //  works out correctly.
167#define lit_len lr
1681:sub     lit_len,  mch_ptr, src_ptr
169  add     mch_ptr,  mch_len
170  add     mch_ref,  mch_len
171
172  //  Now attempt to grow the match forwards.  This is much more common, and
173  //  there is a safety margin at the end of the buffer, so we grow forwards
174  //  in four-byte chunks.
1750:ldr     r11,     [mch_ptr, #4]!
176  ldr     r12,     [mch_ref, #4]!
177  eors    r11,      r12
178  bne     1f
179  add     mch_len,  #4
180  cmp     mch_ptr,  src_end
181  blo     0b
182  b       L_emit_match
183  //  At least one of the bytes in the last comparison did not match.  Identify
184  //  which byte had the mismatch and compute the final length (less four).
1851:rev     r11,      r11
186  clz     r11,      r11
187  add     mch_len,  r11, lsr #3
188
189L_emit_match:
190  //  Time to emit what we've found!
191  //
192  //   register   symbolic name   meaning
193  //      r0         dst_ptr      pointer into destination buffer where the
194  //                              match information will be stored.
195  //      r1         dst_end      pointer to the end of the destination buffer,
196  //                              less margin.
197  //      r2         src_ptr      pointer to the byte after the last match that
198  //                              we found, or to the point from which we
199  //                              started searching if this is the first match.
200  //      r3         src_beg      pointer to the actual start of the buffer.
201  //      r4         src_end      pointer to the end of the source buffer, less
202  //                              margin.
203  //      r5         table        address of hash table.
204  //      r6         -
205  //      r8         mch_dis      match distance ("D")
206  //      r9         mch_len      length of match ("M")
207  //      r10        -
208  //      r11        -
209  //      r12        -
210  //      lr         lit_len      literal length ("L")
211  //      q1                      vector of all ones
212  //
213  //  Synthesize control byte under the assumption that L and M are both less
214  //  than 15, jumping out of the fast path if one of them is not.
215  cmp     lit_len,  #15
216  orr     r10,      mch_len, lit_len, lsl #4
217  cmplo   mch_len,  #15
218  bhs     L_emit_careful
219  //  L and M are both less than 15, which means (a) we use the most compact
220  //  encoding for the match and (b) we do not need to do a bounds check on
221  //  the destination buffer before writing, only before continuing our search.
222  //  Store the command byte.
223  strb    r10,     [dst_ptr], #1
224  //  Copy literal.
225  vld1.8  q0,      [src_ptr]
226  add     src_ptr,  lit_len
227  vst1.8  q0,      [dst_ptr]
228  add     dst_ptr,  lit_len
229  //  Restore "true" match length before updating src_ptr.
230  add     mch_len,  #4
231  //  Store match distance (D) and update the source pointer.
232  strh    r8,      [dst_ptr], #2
233  add     src_ptr,  mch_len
234  //  If we're not into the safety margin of the destination buffer, look for
235  //  another match.
236  cmp     dst_ptr,  dst_end
237  blo     L_next_match
238  //  If we *are* into the safety margin of the destination buffer, we're done
239  //  encoding this block; update the source and destination pointers and
240  //  return.
241L_done:
242  clear_frame_and_return
243
244//  Constant island
245L_hash: .long 2654435761
246L_magic: .long 0x80808081
247
248L_emit_careful:
249  //  Either L or M is >= 15, which means that we don't get to use the compact
250  //  encoding, and that we need to do extra bounds checks while writing.
251  //
252  //   register   symbolic name   meaning
253  //      r0         dst_ptr      pointer into destination buffer where the
254  //                              match information will be stored.
255  //      r1         dst_end      pointer to the end of the destination buffer,
256  //                              less margin.
257  //      r2         src_ptr      pointer to the byte after the last match that
258  //                              we found, or to the point from which we
259  //                              started searching if this is the first match.
260  //      r3         src_beg      pointer to the actual start of the buffer.
261  //      r4         src_end      pointer to the end of the source buffer, less
262  //                              margin.
263  //      r5         table        address of hash table.
264  //      r6         -
265  //      r8         mch_dis      match distance ("D")
266  //      r9         mch_len      length of match ("M") less four
267  //      r10        -
268  //      r11        -
269  //      r12        -
270  //      lr         lit_len      literal length ("L")
271  //      q1                      vector of all ones
272  //
273  //  Start by creating the low 4 bits of the control word; M if M < 15, 0xf
274  //  otherwise.  We also load 0x80808081, which is the magic number for
275  //  division by 255; this will be required later on.
276  ldr     r12,      L_magic
277  cmp     mch_len,  #15
278  mov     r10,      mch_len
279  movhs   r10,      #0x0f
280  subs    r6,       lit_len, #15
281  bhs     L_large_L
282  //  M is large, but L is < 15.  This means we can use the simple approach
283  //  for copying the literal with no bounds checks.
284  orr     r10,      lit_len, lsl #4
285  strb    r10,     [dst_ptr], #1
286  //  Copy literal.
287  vld1.8  q0,      [src_ptr]
288  add     src_ptr,  lit_len
289  vst1.8  q0,      [dst_ptr]
290  add     dst_ptr,  lit_len
291  //  Store match distance (D).
292  strh    r8,      [dst_ptr], #2
293  sub     r6,       mch_len, #15
294  b       L_large_M
295
296L_large_L:
297  //  L is large, M may or may not be.  We need to encode extra literal length
298  //  bytes and we need to do bounds checks while store both those byte and the
299  //  literal itself.
300  orr     r10,      #0xf0
301  strb    r10,     [dst_ptr], #1
302  //  How many extra literal bytes do we need to store?  We need to store
303  //  (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is
304  //  (L - 15)%255.  Get these quantities via magic number multiplication:
305  //  (L - 15)*0x80808081 >> (32 + 7)
306  umull   r10, r11, r6, r12
307  mov     r12,      #255
308  lsr     r10,      r11, #7       // (L - 15) / 255
309  mls     r11,      r10, r12, r6  // (L - 15) % 255
310  ldr     r12,      L_magic       // may need magic number again for M.
311  //  Compute address dst_ptr will have after storing all 0xff bytes, and
312  //  check that we won't exceed dst_end in doing so.
313  add     r10,      dst_ptr, r10
314  cmp     r10,      dst_end
315  bhs     L_done
316  //  There's enough space for all the 0xff bytes, so go ahead and store them.
3170:vst1.8  q1,      [dst_ptr]!
318  cmp     dst_ptr,  r10
319  blo     0b
320  //  Store the (L - 15) % 255 byte.
321  strb    r11,     [r10], #1
322  //  Compute the address we'll have reached after storing all literal bytes.
323  //  If that passes dst_end, we're done.
324  add     dst_ptr,  r10, lit_len
325  cmp     dst_ptr,  dst_end
326  bhs     L_done
327  //  Copy the literal.
3280:vld1.8  q0,      [src_ptr]!
329  vst1.8  q0,      [r10]!
330  subs    r6,       r10, dst_ptr
331  blo     0b
332  //  Fixup src_ptr, store match distance (D), and check whether or not M is
333  //  bigger than 14.  If not, go find the next match.
334  strh    r8,      [dst_ptr], #2
335  sub     src_ptr,  r6
336  subs    r6,       mch_len, #15
337  bhs     L_large_M
338  //  M is small, so we're all done; we just need to update the source pointer
339  //  and we can go look for the next match.
340  add     mch_len,  #4
341  add     src_ptr,  mch_len
342  b       L_next_match
343
344L_large_M:
345  //  Just like with large L, we split (M - 15) into (M - 15) / 255 and
346  //  (M - 15) % 255 via magic number multiply.
347  umull   r10, r11, r6, r12
348  mov     r12,      #255
349  lsr     r10,      r11, #7       // (M - 15) / 255
350  mls     r11,      r10, r12, r6  // (M - 15) % 255
351  //  Compute address dst_ptr will have after storing all 0xff bytes, and
352  //  check that we won't exceed dst_end in doing so.
353  add     r10,      dst_ptr, r10
354  cmp     r10,      dst_end
355  bhs     L_done
356  //  There's enough space for all the 0xff bytes, so go ahead and store them.
3570:vst1.8  q1,      [dst_ptr]!
358  cmp     dst_ptr,  r10
359  blo     0b
360  //  Store final M % 255 byte, update dst_ptr and src_ptr, and look for next
361  //  match.
362  strb    r11,     [r10]
363  add     mch_len,  #4
364  add     dst_ptr,  r10, #1
365  add     src_ptr,  mch_len
366  b       L_next_match
367
368L_trailing_literal:
369  //  Check if skip_final_literals is set.
370  ldr     r5,      [sp, #52]
371  tst     r5,       r5
372  bne     L_done
373  //  Emit a trailing literal that covers the remainder of the source buffer,
374  //  if we can do so without exceeding the bounds of the destination buffer.
375  add     src_end,  margin
376  sub     lit_len,  src_end, src_ptr
377  subs    r6,       lit_len, #15
378  bhs     L_trailing_literal_long
379  lsl     r10,      lit_len, #4
380  strb    r10,     [dst_ptr], #1
381  vld1.8  q0,      [src_ptr]
382  mov     src_ptr,  src_end
383  vst1.8  q0,      [dst_ptr]
384  add     dst_ptr,  lit_len
385  b       L_done
386
387L_trailing_literal_long:
388  ldr     r12,      L_magic
389  mov     r10,      #0xf0
390  add     dst_end,  margin
391  strb    r10,     [dst_ptr], #1
392  umull   r10, r11, r6, r12
393  mov     r12,      #255
394  lsr     r10,      r11, #7       // (L - 15) / 255
395  mls     r11,      r10, r12, r6  // (L - 15) % 255
396  //  We want to write out lit_len + (L - 15)/255 + 1 bytes.  Check if we have
397  //  space for all of them.
398  add     r10,      dst_ptr
399  add     r12,      r10, lit_len
400  cmp     r12,      dst_end
401  bhs     L_done
402  //  We have enough space, so go ahead and write them all out.  Because we
403  //  know that we have enough space, and that the literal is at least 15 bytes,
404  //  we can write the block of 0xffs using vector stores, even without a
405  //  safety margin.
4060:vst1.8  q1,      [dst_ptr]!
407  cmp     dst_ptr,  r10
408  blo     0b
409  //  Store the (L - 15) % 255 byte.
410  strb    r11,     [r10], #1
411  mov     dst_ptr,  r10
412  //  Now store the literal itself; here we need to actually be somewhat
413  //  careful to ensure that we don't write past the end of the destination
414  //  buffer or read past the end of the source buffer.
415  subs    lit_len,  #16
416  blo     1f
4170:vld1.8  q0,      [src_ptr]!
418  subs    lit_len,  #16
419  vst1.8  q0,      [dst_ptr]!
420  bhs     0b
4211:adds    lit_len,  #16
422  beq     L_done
4232:ldrb    r6,      [src_ptr], #1
424  subs    lit_len,  #1
425  strb    r6,      [dst_ptr], #1
426  bne     2b
427  b       L_done
428
429#endif
430