1/* 2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <vm/lz4_assembly_select.h> 30#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARMV7 31 32/* void lz4_encode_2gb(uint8_t ** dst_ptr, 33 size_t dst_size, 34 const uint8_t ** src_ptr, 35 const uint8_t * src_begin, 36 size_t src_size, 37 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES], 38 int skip_final_literals) */ 39 40.globl _lz4_encode_2gb 41.syntax unified 42 43#define dst_ptr r0 44#define dst_end r1 45#define src_ptr r2 46#define src_beg r3 47#define src_end r4 48#define table r5 49#define mch_ptr r6 50#define mch_dis r8 51#define mch_len r9 52#define mch_ref r10 53 54#define margin 128 55 56.macro establish_frame 57 push {r4-r7, lr} 58 add r7, sp, #12 59 push {r8-r11} 60 ldrd r4, r5, [sp, #36] 61 push {r0, r2} 62 ldr dst_ptr, [r0] 63 ldr src_ptr, [r2] 64 subs r1, r1, margin // subtract safety margin from dst_size 65 bls L_done 66 add dst_end, dst_ptr, r1 // dst end - margin 67 sub r4, r4, margin // subtract safety margin from src_size (src_size < margin is detected by check on mch_ptr in match_candidate_loop). 68 add src_end, src_ptr, r4 // src end - margin. 69 vmov.i8 q1, #255 // vector of all 1s used to emit 70.endm 71 72.macro clear_frame_and_return 73 pop {r1, r3} 74 str dst_ptr, [r1] 75 str src_ptr, [r3] 76 pop {r8-r11} 77 pop {r4-r7, pc} 78.endm 79 80.p2align 4 81_lz4_encode_2gb: 82 establish_frame 83L_next_match: 84 // Start searching for the next match, starting at the end of the last one. 85 // [We begin with mch_ptr = src_ptr - 1 because we pre-increment mch_ptr 86 // within the search loop itself]. Load the hash magic number in lr, and 87 // zero out mch_len (when we find a match, its length will initially be 88 // four, but we actually work with the match length minus four at all times). 89 ldr lr, L_hash 90 sub mch_ptr, src_ptr, #1 91 92L_match_candidate_loop: 93 // If mch_ptr >= src_end, we are near the end of the source buffer (remember 94 // that we subtracted margin from src_end, so we are *not* actually past the 95 // end just yet). 96 cmp mch_ptr, src_end 97 bhs L_trailing_literal 98 99 // Load the four-byte word starting at mch_ptr, and get the address of the 100 // corresponding row of the hash table. 101 ldr r9, [mch_ptr, #1]! 102 sub r8, mch_ptr, src_beg 103 mul r12, r9, lr 104 mvn r10, #0x7 105 and r12, r10, r12, lsr #17 // byte offset of table entry. 106 107 // Load offset and word from hash table row, then update with the new offset 108 // and word that we just computed. 109 ldrd r10,r11, [table, r12] 110 strd r8, r9, [table, r12] 111 112 // At this point, we only know that the hashes of the words match; check to 113 // see if the words themselves match. If not, move on to the next candidate. 114 cmp r9, r11 115 bne L_match_candidate_loop 116 117 // It's not enough for the words to match; the match distance must also be 118 // in the representable range (i.e. less than 0x10000). 119 sub mch_dis, r8, r10 120 add mch_ref, src_beg, r10 121 cmp mch_dis, #0x10000 122 bhs L_match_candidate_loop 123 124 // We have found a match; registers at this point are as follows: 125 // 126 // register symbolic name meaning 127 // r0 dst_ptr pointer into destination buffer where the 128 // match information will be stored. 129 // r1 dst_end pointer to the end of the destination buffer, 130 // less margin. 131 // r2 src_ptr pointer to the byte after the last match that 132 // we found, or to the point from which we 133 // started searching if this is the first match. 134 // r3 src_beg pointer to the actual start of the buffer. 135 // r4 src_end pointer to the end of the source buffer, less 136 // margin. 137 // r5 table address of hash table. 138 // r6 mch_ptr pointer to match. 139 // r8 mch_dis match distance ("D") 140 // r9 mch_len length of match less four ("M") 141 // r10 mch_ref pointer to match reference. 142 // r11 - 143 // r12 - 144 // lr - 145 // 146 // Attempt to grow the match backwards (typically we only grow backwards by 147 // a byte or two, if at all, so we use a byte-by-byte scan). 148 eor mch_len, mch_len 1490:cmp mch_ref, src_beg 150 cmpne mch_ptr, src_ptr 151 beq 1f 152 ldrb r11, [mch_ref, #-1] 153 ldrb r12, [mch_ptr, #-1] 154 cmp r11, r12 155 bne 1f 156 sub mch_ref, #1 157 sub mch_ptr, #1 158 add mch_len, #1 159 b 0b 160 161 // Now that we have the start of the match, we can compute the literal 162 // length. Then advance the mch and ref pointers to the end of the match 163 // and its reference. Because mch_len is the real match length minus four, 164 // we actually advance to four before the end of the match, but our loop 165 // to grow the matches uses pre-incremented loads with writeback, so this 166 // works out correctly. 167#define lit_len lr 1681:sub lit_len, mch_ptr, src_ptr 169 add mch_ptr, mch_len 170 add mch_ref, mch_len 171 172 // Now attempt to grow the match forwards. This is much more common, and 173 // there is a safety margin at the end of the buffer, so we grow forwards 174 // in four-byte chunks. 1750:ldr r11, [mch_ptr, #4]! 176 ldr r12, [mch_ref, #4]! 177 eors r11, r12 178 bne 1f 179 add mch_len, #4 180 cmp mch_ptr, src_end 181 blo 0b 182 b L_emit_match 183 // At least one of the bytes in the last comparison did not match. Identify 184 // which byte had the mismatch and compute the final length (less four). 1851:rev r11, r11 186 clz r11, r11 187 add mch_len, r11, lsr #3 188 189L_emit_match: 190 // Time to emit what we've found! 191 // 192 // register symbolic name meaning 193 // r0 dst_ptr pointer into destination buffer where the 194 // match information will be stored. 195 // r1 dst_end pointer to the end of the destination buffer, 196 // less margin. 197 // r2 src_ptr pointer to the byte after the last match that 198 // we found, or to the point from which we 199 // started searching if this is the first match. 200 // r3 src_beg pointer to the actual start of the buffer. 201 // r4 src_end pointer to the end of the source buffer, less 202 // margin. 203 // r5 table address of hash table. 204 // r6 - 205 // r8 mch_dis match distance ("D") 206 // r9 mch_len length of match ("M") 207 // r10 - 208 // r11 - 209 // r12 - 210 // lr lit_len literal length ("L") 211 // q1 vector of all ones 212 // 213 // Synthesize control byte under the assumption that L and M are both less 214 // than 15, jumping out of the fast path if one of them is not. 215 cmp lit_len, #15 216 orr r10, mch_len, lit_len, lsl #4 217 cmplo mch_len, #15 218 bhs L_emit_careful 219 // L and M are both less than 15, which means (a) we use the most compact 220 // encoding for the match and (b) we do not need to do a bounds check on 221 // the destination buffer before writing, only before continuing our search. 222 // Store the command byte. 223 strb r10, [dst_ptr], #1 224 // Copy literal. 225 vld1.8 q0, [src_ptr] 226 add src_ptr, lit_len 227 vst1.8 q0, [dst_ptr] 228 add dst_ptr, lit_len 229 // Restore "true" match length before updating src_ptr. 230 add mch_len, #4 231 // Store match distance (D) and update the source pointer. 232 strh r8, [dst_ptr], #2 233 add src_ptr, mch_len 234 // If we're not into the safety margin of the destination buffer, look for 235 // another match. 236 cmp dst_ptr, dst_end 237 blo L_next_match 238 // If we *are* into the safety margin of the destination buffer, we're done 239 // encoding this block; update the source and destination pointers and 240 // return. 241L_done: 242 clear_frame_and_return 243 244// Constant island 245L_hash: .long 2654435761 246L_magic: .long 0x80808081 247 248L_emit_careful: 249 // Either L or M is >= 15, which means that we don't get to use the compact 250 // encoding, and that we need to do extra bounds checks while writing. 251 // 252 // register symbolic name meaning 253 // r0 dst_ptr pointer into destination buffer where the 254 // match information will be stored. 255 // r1 dst_end pointer to the end of the destination buffer, 256 // less margin. 257 // r2 src_ptr pointer to the byte after the last match that 258 // we found, or to the point from which we 259 // started searching if this is the first match. 260 // r3 src_beg pointer to the actual start of the buffer. 261 // r4 src_end pointer to the end of the source buffer, less 262 // margin. 263 // r5 table address of hash table. 264 // r6 - 265 // r8 mch_dis match distance ("D") 266 // r9 mch_len length of match ("M") less four 267 // r10 - 268 // r11 - 269 // r12 - 270 // lr lit_len literal length ("L") 271 // q1 vector of all ones 272 // 273 // Start by creating the low 4 bits of the control word; M if M < 15, 0xf 274 // otherwise. We also load 0x80808081, which is the magic number for 275 // division by 255; this will be required later on. 276 ldr r12, L_magic 277 cmp mch_len, #15 278 mov r10, mch_len 279 movhs r10, #0x0f 280 subs r6, lit_len, #15 281 bhs L_large_L 282 // M is large, but L is < 15. This means we can use the simple approach 283 // for copying the literal with no bounds checks. 284 orr r10, lit_len, lsl #4 285 strb r10, [dst_ptr], #1 286 // Copy literal. 287 vld1.8 q0, [src_ptr] 288 add src_ptr, lit_len 289 vst1.8 q0, [dst_ptr] 290 add dst_ptr, lit_len 291 // Store match distance (D). 292 strh r8, [dst_ptr], #2 293 sub r6, mch_len, #15 294 b L_large_M 295 296L_large_L: 297 // L is large, M may or may not be. We need to encode extra literal length 298 // bytes and we need to do bounds checks while store both those byte and the 299 // literal itself. 300 orr r10, #0xf0 301 strb r10, [dst_ptr], #1 302 // How many extra literal bytes do we need to store? We need to store 303 // (L - 15)/255 extra literal bytes of 0xff, plus one more byte that is 304 // (L - 15)%255. Get these quantities via magic number multiplication: 305 // (L - 15)*0x80808081 >> (32 + 7) 306 umull r10, r11, r6, r12 307 mov r12, #255 308 lsr r10, r11, #7 // (L - 15) / 255 309 mls r11, r10, r12, r6 // (L - 15) % 255 310 ldr r12, L_magic // may need magic number again for M. 311 // Compute address dst_ptr will have after storing all 0xff bytes, and 312 // check that we won't exceed dst_end in doing so. 313 add r10, dst_ptr, r10 314 cmp r10, dst_end 315 bhs L_done 316 // There's enough space for all the 0xff bytes, so go ahead and store them. 3170:vst1.8 q1, [dst_ptr]! 318 cmp dst_ptr, r10 319 blo 0b 320 // Store the (L - 15) % 255 byte. 321 strb r11, [r10], #1 322 // Compute the address we'll have reached after storing all literal bytes. 323 // If that passes dst_end, we're done. 324 add dst_ptr, r10, lit_len 325 cmp dst_ptr, dst_end 326 bhs L_done 327 // Copy the literal. 3280:vld1.8 q0, [src_ptr]! 329 vst1.8 q0, [r10]! 330 subs r6, r10, dst_ptr 331 blo 0b 332 // Fixup src_ptr, store match distance (D), and check whether or not M is 333 // bigger than 14. If not, go find the next match. 334 strh r8, [dst_ptr], #2 335 sub src_ptr, r6 336 subs r6, mch_len, #15 337 bhs L_large_M 338 // M is small, so we're all done; we just need to update the source pointer 339 // and we can go look for the next match. 340 add mch_len, #4 341 add src_ptr, mch_len 342 b L_next_match 343 344L_large_M: 345 // Just like with large L, we split (M - 15) into (M - 15) / 255 and 346 // (M - 15) % 255 via magic number multiply. 347 umull r10, r11, r6, r12 348 mov r12, #255 349 lsr r10, r11, #7 // (M - 15) / 255 350 mls r11, r10, r12, r6 // (M - 15) % 255 351 // Compute address dst_ptr will have after storing all 0xff bytes, and 352 // check that we won't exceed dst_end in doing so. 353 add r10, dst_ptr, r10 354 cmp r10, dst_end 355 bhs L_done 356 // There's enough space for all the 0xff bytes, so go ahead and store them. 3570:vst1.8 q1, [dst_ptr]! 358 cmp dst_ptr, r10 359 blo 0b 360 // Store final M % 255 byte, update dst_ptr and src_ptr, and look for next 361 // match. 362 strb r11, [r10] 363 add mch_len, #4 364 add dst_ptr, r10, #1 365 add src_ptr, mch_len 366 b L_next_match 367 368L_trailing_literal: 369 // Check if skip_final_literals is set. 370 ldr r5, [sp, #52] 371 tst r5, r5 372 bne L_done 373 // Emit a trailing literal that covers the remainder of the source buffer, 374 // if we can do so without exceeding the bounds of the destination buffer. 375 add src_end, margin 376 sub lit_len, src_end, src_ptr 377 subs r6, lit_len, #15 378 bhs L_trailing_literal_long 379 lsl r10, lit_len, #4 380 strb r10, [dst_ptr], #1 381 vld1.8 q0, [src_ptr] 382 mov src_ptr, src_end 383 vst1.8 q0, [dst_ptr] 384 add dst_ptr, lit_len 385 b L_done 386 387L_trailing_literal_long: 388 ldr r12, L_magic 389 mov r10, #0xf0 390 add dst_end, margin 391 strb r10, [dst_ptr], #1 392 umull r10, r11, r6, r12 393 mov r12, #255 394 lsr r10, r11, #7 // (L - 15) / 255 395 mls r11, r10, r12, r6 // (L - 15) % 255 396 // We want to write out lit_len + (L - 15)/255 + 1 bytes. Check if we have 397 // space for all of them. 398 add r10, dst_ptr 399 add r12, r10, lit_len 400 cmp r12, dst_end 401 bhs L_done 402 // We have enough space, so go ahead and write them all out. Because we 403 // know that we have enough space, and that the literal is at least 15 bytes, 404 // we can write the block of 0xffs using vector stores, even without a 405 // safety margin. 4060:vst1.8 q1, [dst_ptr]! 407 cmp dst_ptr, r10 408 blo 0b 409 // Store the (L - 15) % 255 byte. 410 strb r11, [r10], #1 411 mov dst_ptr, r10 412 // Now store the literal itself; here we need to actually be somewhat 413 // careful to ensure that we don't write past the end of the destination 414 // buffer or read past the end of the source buffer. 415 subs lit_len, #16 416 blo 1f 4170:vld1.8 q0, [src_ptr]! 418 subs lit_len, #16 419 vst1.8 q0, [dst_ptr]! 420 bhs 0b 4211:adds lit_len, #16 422 beq L_done 4232:ldrb r6, [src_ptr], #1 424 subs lit_len, #1 425 strb r6, [dst_ptr], #1 426 bne 2b 427 b L_done 428 429#endif 430