/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29#include <vm/lz4_assembly_select.h> 30#include <vm/lz4_constants.h> 31#include <arm64/asm.h> 32 33#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64 34 35/* void lz4_encode_2gb(uint8_t ** dst_ptr, 36 size_t dst_size, 37 const uint8_t ** src_ptr, 38 const uint8_t * src_begin, 39 size_t src_size, 40 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES], 41 int skip_final_literals) */ 42 43.globl _lz4_encode_2gb 44 45#define dst_ptr x0 46#define dst_size x1 47#define src_ptr x2 48#define src_begin x3 49#define src_size x4 50#define hash_table x5 51#define skip_final_literals x6 52 53.text 54.p2align 4 55_lz4_encode_2gb: 56 57 // esteblish frame 58 ARM64_STACK_PROLOG 59 stp fp, lr, [sp, #-16]! 60 mov fp, sp 61 62 stp x19, x20, [sp, #-16]! 63 stp x21, x22, [sp, #-16]! 64 stp x23, x24, [sp, #-16]! 65 stp x25, x26, [sp, #-16]! 66 stp x27, x28, [sp, #-16]! 67 68 // constant registers 69 adr x7, L_constant 70 ldr w28, [x7, #4] // x28 = 0x80808081 (magic number to cmopute 1/255) 71 ldr w7, [x7] // x7 = LZ4_COMPRESS_HASH_MULTIPLY 72 mov x27, #-1 // x27 = 0xffffffffffffffff 73 dup.4s v1, w27 // q1 = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff} 74 75 76 // x9 - is current dst 77 // x10 - dst_end - safety_margin 78 ldr x9, [x0] // dst 79 add x10, x9, x1 // dst_end 80 sub x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // dst_end - safety_margin 81 cmp x10, x9 // if dst_size < safety_margin abort 82 b.lt L_done 83 84 // x11 - is current src 85 // x12 - is src_end - safety margin 86 ldr x11, [x2] // src 87 add x12, x11, x4 // src_end 88 sub x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // src_end - safety_margin 89 cmp x12, x11 // if src_size < safety_margin skip to trailing_literals 90 b.lt L_trailing_literals 91 92 93 // this block search for the next available match 94 // set match_begin to current src (which is also where last match ended) 95L_search_next_available_match: 96 mov x13, x11 // match_begin = src 97 sub x14, x13, x3 // match_postion = 
match_begin - src_begin 98 99 // compute hash value for the next 5 "quads" 100 // hash distance need to be 0 < D < 0x10000 101 102L_hash_match: 103 ldr x15, [x13] // match_first_4_bytes 104 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY 105 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index 106 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) 107 108 ldp w19, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) 109 stp w14, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) 110 111 add x26, x14, #1 // next_match pos 112 lsr x25, x15, #8 // next_match_first_4_bytes 113 umull x21, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY 114 lsr w21, w21, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index 115 add x21, x5, x21, lsl #3 // hash_table_entry ptr (hash + 8*index) 116 117 ldp w23, w24, [x21] // read entry values (w23 - pos, w24 - 4 bytes at pos) 118 stp w26, w25, [x21] // write entry values (w26 - next pos, w25 - next 4 bytes) 119 120 cmp w15, w22 121 b.ne L_try_next_match_0 // compare the 4 bytes to see if there is a match 122 sub w19, w14, w19 // x19 - match_dist (current_pos - match_pos) 123 cmp w19, #0x10000 124 ccmp w19, #0, #0xf, lo 125 b.eq L_try_next_match_0 // verify the 0 < dist < 0x10000 126 b L_found_valid_match 127 128L_try_next_match_0: 129 add x13, x13, #1 130 add x14, x14, #1 131 132 add x26, x14, #1 // next_match pos 133 lsr x15, x15, #16 // next_match_first_4_bytes 134 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY 135 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index 136 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) 137 138 ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) 139 stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) 140 141 cmp w25, w24 142 b.ne L_try_next_match_1 // compare the 4 
bytes to see if there is a match 143 sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos) 144 cmp w19, #0x10000 145 ccmp w19, #0, #0xf, lo 146 b.eq L_try_next_match_1 // verify the 0 < dist < 0x10000 147 b L_found_valid_match 148 149L_try_next_match_1: 150 add x13, x13, #1 151 add x14, x14, #1 152 153 add x26, x14, #1 // next_match pos 154 lsr x25, x15, #8 // next_match_first_4_bytes 155 umull x20, w7, w25 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY 156 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index 157 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) 158 159 ldp w23, w24, [x20] // read entry values (w23 - pos, w24 - 4 bytes at pos) 160 stp w26, w25, [x20] // write entry values (w26 - next pos, w25 - next 4 bytes) 161 162 cmp w15, w22 163 b.ne L_try_next_match_2 // compare the 4 bytes to see if there is a match 164 sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos) 165 cmp w19, #0x10000 166 ccmp w19, #0, #0xf, lo 167 b.eq L_try_next_match_2 // verify the 0 < dist < 0x10000 168 b L_found_valid_match 169 170L_try_next_match_2: 171 add x13, x13, #1 172 add x14, x14, #1 173 174 add x26, x14, #1 // next_match pos 175 lsr x15, x15, #16 // next_match_first_4_bytes 176 umull x20, w7, w15 // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY 177 lsr w20, w20, #LZ4_COMPRESS_HASH_SHIFT // use LZ4_COMPRESS_HASH_BITS MSbits as index 178 add x20, x5, x20, lsl #3 // hash_table_entry ptr (hash + 8*index) 179 180 ldp w21, w22, [x20] // read entry values (w19 - pos, w22 - 4 bytes at pos) 181 stp w26, w15, [x20] // write entry values (w14 - current pos, w15 - current 4 bytes) 182 183 cmp w25, w24 184 b.ne L_try_next_match_3 // compare the 4 bytes to see if there is a match 185 sub w19, w14, w23 // x19 - match_dist (current_pos - match_pos) 186 cmp w19, #0x10000 187 ccmp w19, #0, #0xf, lo 188 b.eq L_try_next_match_3 // verify the 0 < dist < 0x10000 189 b L_found_valid_match 190 191L_try_next_match_3: 192 add x13, 
x13, #1 193 add x14, x14, #1 194 195 cmp w15, w22 196 b.ne L_try_next_matchs // compare the 4 bytes to see if there is a match 197 sub w19, w14, w21 // x19 - match_dist (current_pos - match_pos) 198 cmp w19, #0x10000 199 ccmp w19, #0, #0xf, lo 200 b.eq L_try_next_matchs // verify the 0 < dist < 0x10000 201 b L_found_valid_match 202 203 // this block exapnd the valid match as much as possible 204 // first it try to expand the match forward 205 // next it try to expand the match backword 206L_found_valid_match: 207 add x20, x13, #4 // match_end = match_begin+4 (already confirmd the first 4 bytes) 208 sub x21, x20, x19 // ref_end = match_end - dist 209L_found_valid_match_expand_forward_loop: 210 ldr x22, [x20], #8 // load match_current_8_bytes (safe to load becasue of safety margin) 211 ldr x23, [x21], #8 // load ref_current_8_bytes 212 cmp x22, x23 213 b.ne L_found_valid_match_expand_forward_partial 214 cmp x20, x12 // check if match_end reached src_end 215 b.lo L_found_valid_match_expand_forward_loop 216 b L_found_valid_match_expand_backward 217L_found_valid_match_expand_forward_partial: 218 sub x20, x20, #8 // revert match_end by 8 and compute actual match of current 8 bytes 219 eor x22, x22, x23 // compare the bits using xor 220 rbit x22, x22 // revert the bits to use clz (the none equivalent bytes would have at least 1 set bit) 221 clz x22, x22 // after the revrse for every equal prefix byte clz would count 8 222 add x20, x20, x22, lsr #3 // add the actual number of matching bytes is (clz result)>>3 223L_found_valid_match_expand_backward: 224 sub x15, x13, x19 // ref_begin = match_begin - dist 225L_found_valid_match_expand_backward_loop: 226 cmp x13, x11 // check if match_begin reached src (previous match end) 227 ccmp x15, x3, #0xd, gt // check if ref_begin reached src_begin 228 b.le L_found_valid_match_emit_match 229 ldrb w22, [x13, #-1]! // load match_current_8_bytes (safe to load becasue of safety margin) 230 ldrb w23, [x15, #-1]! 
// load ref_current_8_bytes 231 cmp w22, w23 232 b.eq L_found_valid_match_expand_backward_loop 233 add x13, x13, #1 // revert x13, last compare didn't match 234 235 // this block write the match into dst 236 // it write the ML token [extra L tokens] [literals] <2byte dist> [extar M tokens] 237 // it update src & dst positions and progress to L_search_next_available_match 238L_found_valid_match_emit_match: 239 sub x21, x20, x13 // match_length - match_end - match_begin 240 sub x21, x21, #4 // match_length - 4 (first 4 bytes are guaranteed) 241 sub x22, x13, x11 // literals_length = match_begin - src // compute 242 sub x26, x10, x9 // dst_remaining_space = dst_end - dst 243 sub x26, x26, x22 // dst_remaining_space -= literals_length 244 subs x26, x26, #3 // dst_remaining_space -= 2_dist_bytes + L/M_token 245 b.lo L_done // exit if dst isn't sufficent 246 247 and x23, x21, #0xf // store M 4 LSbits 248 add x23, x23, x22, lsl #4 // add L 4 LSbits 249 add x15, x9, #1 // tmp_dst = dst + 1 250 cmp x22, #15 // if L >= 15 need to write more L tokens 251 b.lo L_found_valid_match_copy_literals 252 orr x23, x23, #0xf0 // update L/M token to be 0xfM 253 sub x24, x22, #15 // reduce 15 from number_of_literals 254 sub x26, x26, #1 // check if there is space for the extra L token 255 b.lo L_done 256 cmp x24, #255 // check if need to compute number of 255 tokens 257 b.lo L_found_valid_match_skip_L_255_tokens 258 umull x25, w24, w28 // x25 - (literals_to_token * 1_DIV_255_magic_number) 259 lsr x25, x25, #39 // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39 260 subs x26, x26, x25 // check if there is sufficent space for the 255_tokens 261 b.lo L_done 262 mov x13, #255 263 umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255) 264L_found_valid_match_L_255_tokens_loop: 265 str q1, [x15], #16 // store 16 255 tokens into dst_tmp. 
safe to store because dst has safety_margin 266 subs x25, x25, #16 // check if there are any 255 token left after current 16 267 b.hi L_found_valid_match_L_255_tokens_loop 268 add x15, x15, x25 // revert tmp_dst if written too many 255 tokens. 269L_found_valid_match_skip_L_255_tokens: 270 strb w24, [x15], #1 // write last L token 271L_found_valid_match_copy_literals: 272 ldr q0, [x11], #16 // load current 16 literals. (safe becasue src_end has safety margin) 273 str q0, [x15], #16 // store current 16 literals. (safe becasue dst_end has safety margin) 274 subs x22, x22, #16 275 b.gt L_found_valid_match_copy_literals 276 add x15, x15, x22 // revert tmp_dst if written too many literals 277 strh w19, [x15], #2 // store dist bytes 278 cmp x21, #15 // if M >= 15 need to write more M tokens 279 b.lo L_found_valid_match_finish_writing_match 280 orr x23, x23, #0xf // update L/M token to be 0xLf 281 sub x24, x21, #15 // reduce 15 from match_length 282 sub x26, x26, #1 // check if there is space for the extra M token 283 b.lo L_done 284 cmp x24, #255 // check if need to compute number of 255 tokens 285 b.lo L_found_valid_match_skip_M_255_tokens 286 umull x25, w24, w28 // x25 - (match_length * 1_DIV_255_magic_number) 287 lsr x25, x25, #39 // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39 288 subs x26, x26, x25 // check if there is sufficent space for the 255_tokens 289 b.lo L_done 290 mov x13, #255 291 umsubl x24, w25, w13, x24 // x24 - value_of_remainder_token = literals_to_token - (match_length*255) 292L_found_valid_match_M_255_tokens_loop: 293 str q1, [x15], #16 // store 16 255 tokens into dst_tmp. safe to store because dst has safety_margin 294 subs x25, x25, #16 // check if there are any 255 token left after current 16 295 b.hi L_found_valid_match_M_255_tokens_loop 296 add x15, x15, x25 // revert tmp_dst if written too many 255 tokens. 
297L_found_valid_match_skip_M_255_tokens: 298 strb w24, [x15], #1 // write last M token 299L_found_valid_match_finish_writing_match: 300 strb w23, [x9] // store first token of match in dst 301 mov x9, x15 // update dst to last postion written 302 mov x11, x20 // update src to match_end (last byte that was encoded) 303 cmp x11, x12 // check if src reached src_end 304 ccmp x9, x10, #9, lt // check if dst reached dst_end 305 b.ge L_trailing_literals 306 b L_search_next_available_match 307 // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 308 // attempted to hash three quad values from the end of each emited match 309 // this eneded up being slower and less compression (???) 310 // this block set match_begin and pos for next hash search and 311 // compute the hash values for the last 3 bytes of currently emited match 312 // only need to comute these hash becasue other "quads" were hashed when the original 313 // data was read. 314 315L_try_next_matchs: 316 add x13, x13, #1 // move to next match 317 add x14, x14, #1 // update next match pos 318 cmp x13, x12 // check match_begin didn't reach src_end 319 b.lo L_hash_match 320 321L_trailing_literals: 322 // unless skip_final_literals is set 323 // write the trailing bytes as literals 324 // traliing bytes include the whole src (with the safty margin) 325 // need to verify whole dst (withthe safty margin) has sufficent space 326 327 tst x6, x6 328 b.ne L_done // if skip_final_literals is set skip writing them 329 330 add x12, x12, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin 331 subs x13, x12, x11 // remaining_src 332 b.eq L_done // finish if there are 0 trailing literals 333 334 add x10, x10, #LZ4_GOFAST_SAFETY_MARGIN // add safety_margin 335 sub x14, x10, x9 // remaining dst (dst_end - dst) 336 sub x14, x14, #1 // 1 byte is needed at least to write literals token 337 subs x14, x14, x13 // finish if dst can't contain all remaining literals + 1 literals token 338 b.le L_done // (need to verify that 
it has room for literals tokens 339 340 cmp x13, #15 341 b.lt L_trailing_literals_store_less_than_15_literals 342 subs x14, x14, #1 // 1-extra byte is needed for literals tokens 343 b.mi L_done 344 mov w15, #0xf0 345 strb w15, [x9], #1 // write literals first token (Important !!! if 255 tokens exist but dst isn't sufficent need to revert dst by 1) 346 sub x15, x13, #15 347 cmp x15, #255 348 b.lo L_trailing_literals_no_255_tokens 349 umull x19, w15, w28 // x19 - (literals_to_token * 1_DIV_255_magic_number) 350 lsr x19, x19, #39 // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39 351 subs x14, x14, x19 352 b.mi L_revert_x9_and_done 353 mov x26, #255 354 umsubl x15, w26, w19, x15 // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255) 355L_tariling_literals_write_16_255_tokens: 356 str q1, [x9], #16 // store 16 255 tokens each iteration (this is safe becasue there is space for 15 or more literals + remainder token) 357 subs x19, x19, #16 358 b.gt L_tariling_literals_write_16_255_tokens 359 add x9, x9, x19 // fixes dst to actual number of tokens (x19 might not be a mulitple of 16) 360L_trailing_literals_no_255_tokens: 361 strb w15, [x9], #1 // store remainder_token 362 lsr x14, x13, #4 // check if there are more than 16 literals left to be written 363 tst x14, x14 364 b.eq L_trailing_literals_copy_less_than_16_literals 365L_trailing_literals_copy_16_literals: 366 ldr q0, [x11], #16 // load current_16_literals 367 str q0, [ x9], #16 // *dst16++ = current_16_literals 368 subs x14, x14, #1 369 b.gt L_trailing_literals_copy_16_literals 370 cmp x11, x12 371 b.lo L_trailing_literals_copy_less_than_16_literals 372 b L_done 373 374L_trailing_literals_store_less_than_15_literals: 375 lsl x14, x13, #4 // literals_only_token is 0xL0 (where L is 4 bits) 376 strb w14, [x9], #1 // *dst++ = literals_only_token 377L_trailing_literals_copy_less_than_16_literals: 378 ldrb w13, [x11], #1 // load current_literal 379 strb w13, [ x9], 
#1 // *dst++ = current_literal 380 cmp x11, x12 381 b.lo L_trailing_literals_copy_less_than_16_literals 382 383 // this block upadte dst & src pointers and remove frame 384L_done: 385 str x9, [x0] 386 str x11, [x2] 387 388 ldp x27, x28, [sp], #16 389 ldp x25, x26, [sp], #16 390 ldp x23, x24, [sp], #16 391 ldp x21, x22, [sp], #16 392 ldp x19, x20, [sp], #16 393 394 // clear frame 395 ldp fp, lr, [sp], #16 396 ARM64_STACK_EPILOG 397 398L_revert_x9_and_done: 399 sub x9, x9, #1 400 b L_done 401 402.p2align 2 403L_constant: 404.long LZ4_COMPRESS_HASH_MULTIPLY 405.long 0x80808081 406 407#endif 408 409