1/* 2 * Copyright (c) 2017 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28#include <vm/lz4_assembly_select.h> 29#include <arm64/asm.h> 30#if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 31 32/* 33 34 int64_t lz4_decode_asm( 35 uint8_t ** dst_ptr, *dst_ptr points to next output byte to write 36 uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst 37 uint8_t * dst_end, "relaxed" end of output buffer (see below) 38 const uint8_t ** src_ptr, *src_ptr points to next input byte to read 39 const uint8_t * src_end) "relaxed" end of input buffer (see below) 40 41 We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant. 42 We never read before dst_begin. 43 44 Return 0 on success, -1 on failure 45 On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction. 46 47*/ 48 49.globl _lz4_decode_asm 50 51#define dst x0 // arg0 52#define dst_begin x1 // arg1 53#define dst_end x2 // arg2 54#define src x3 // arg3 55#define src_end x4 // arg4 56 57#define w_n_matches w5 // lower 32 bits of n_matches 58#define n_matches x5 59#define n_literals x6 60#define copy_src x7 // match/literal copy source 61#define copy_dst x8 // match/literal copy destination 62 63#define w_aux1 w9 // lower 32 bits of aux1 64#define aux1 x9 65#define aux2 x10 66 67#define w_match_distance w11 // lower 32 bits of match_distance 68#define match_distance x11 69 70#define match_permtable x12 71#define match_disttable x13 72 73#define dst_good x19 74#define src_good x20 75 76.macro establish_frame 77 ARM64_STACK_PROLOG 78 stp fp, lr, [sp, #-16]! 79 mov fp, sp 80.endm 81 82.macro clear_frame_and_return 83 ldp fp, lr, [sp], #16 84 ARM64_STACK_EPILOG 85.endm 86 87// copy_1x16 SOURCE_ADDR DESTINATION_ADDR 88// Copy 16 bytes, clobber: q0 89.macro copy_1x16 90 ldr q0,[$0] 91 str q0,[$1] 92.endm 93 94// copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR 95// Copy 16 bytes, and increment both addresses by 16, clobber: q0 96.macro copy_1x16_and_increment 97 ldr q0,[$0],#16 98 str q0,[$1],#16 99.endm 100 101// copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR 102// Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0 103.macro copy_2x16_and_increment 104 ldr q0,[$0],#16 105 str q0,[$1],#16 106 ldr q0,[$0],#16 107 str q0,[$1],#16 108.endm 109 110// copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR 111// Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1 112.macro copy_1x32_and_increment 113 ldp q0,q1,[$0],#32 114 stp q0,q1,[$1],#32 115.endm 116 117// If we don't branch, src < src_end after this 118.macro check_src_end 119 cmp src,src_end 120 b.hs L_done // extremely unlikely, DONE when src >= src_end 121.endm 122 123// If we don't branch, dst < dst_end after this 124.macro check_dst_end 125 cmp dst,dst_end 126 b.hs L_done // extremely unlikely, DONE when dst >= dst_end 127.endm 128 129.text 130.p2align 4 131_lz4_decode_asm: 132 establish_frame 133 stp x19,x20,[sp,#-16]! // need to preserve these 134 stp src,dst,[sp,#-16]! // save src_ptr,dst_ptr on stack 135 ldr src,[src] // src = *src_ptr 136 ldr dst,[dst] // dst = *dst_ptr 137 adr match_permtable,L_match_permtable 138 adr match_disttable,L_match_disttable 139 140L_decode_command: 141 // Keep last known good positions in both streams 142 mov dst_good,dst 143 mov src_good,src 144 145 // Check limits 146 check_src_end 147 check_dst_end 148 149 // Decode 1-byte command 150 ldrb w_aux1,[src],#1 // read command byte LLLLMMMM 151 lsr n_literals,aux1,#4 // 0000LLLL. n_literals is now 0..15 152 and n_matches,aux1,#0xf // 0000MMMM. n_matches is now 0..15 153 add n_matches,n_matches,#4 // n_matches is now 4..19 154 155 // Test number of literals (do not test if n_literals==0, because branch prediction fails on it) 156 cmp n_literals,#14 157 b.ls L_copy_short_literal // 96% likely: n_literals in 0..14 158 // continue to decode_long_literal 159 160 // the number of literals is encoded on more bytes, we need to decode them 161L_decode_long_literal: 162 check_src_end // required here, since we may loop an arbitrarily high number of times 163 ldrb w_aux1,[src],#1 164 add n_literals,n_literals,aux1 165 cmp aux1,#255 166 b.eq L_decode_long_literal // extremely unlikely 167 // continue to copy_long_literal 168 169 // Copy literals, n_literals >= 15 170L_copy_long_literal: 171 mov copy_src,src // literal copy origin 172 mov copy_dst,dst // literal copy destination 173 add src,src,n_literals 174 add dst,dst,n_literals 175 check_src_end // required here, since n_literals can be arbitrarily high 176 check_dst_end 177 178 // fixed + loop 179 copy_1x32_and_increment copy_src,copy_dst 180L_copy_long_literal_loop: 181 copy_1x32_and_increment copy_src,copy_dst 182 cmp dst,copy_dst 183 b.hi L_copy_long_literal_loop // first test occurs after 64 bytes have been copied, and is unlikely to loop back 184 b L_expand_match 185 186 // Copy literals, n_literals <= 14: copy 16 bytes 187L_copy_short_literal: 188 copy_1x16 src,dst 189 add src,src,n_literals 190 add dst,dst,n_literals 191 // continue to expand match 192 193L_expand_match: 194 195 // Decode match distance 196 ldrh w_match_distance,[src],#2 // 16-bit distance 197 cbz match_distance,L_fail // distance == 0 is invalid 198 sub copy_src,dst,match_distance // copy_src is the match copy source 199 cmp copy_src,dst_begin 200 b.lo L_fail // copy_src < dst_begin: FAIL 201 mov copy_dst,dst // copy_dst is the match copy destination 202 add dst,dst,n_matches // dst is updated to be the byte after the match; n_matches <= 19 here 203 204 // Do we need to decode a long match? 205 cmp n_matches,#19 206 b.eq L_decode_long_match // unlikely, n_matches >= 19 encoded on more bytes 207 cmp n_matches,#16 208 b.hi L_long_match // unlikely, n_matches == 17 or 18 209 // continue to short match (most probable case) 210 211 // Copy match, n_matches <= 16 212L_short_match: 213 cmp match_distance,#15 214 b.ls L_copy_short_match_small_distance 215 216 // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes 217 copy_1x16 copy_src,copy_dst 218 b L_decode_command 219 220 // Copy match, n_matches <= 16, match_distance < 16: 221 // load shuffle table, and permute to replicate the pattern on 16 bytes 222L_copy_short_match_small_distance: 223 ldr q0,[copy_src] 224 add aux1,match_permtable,match_distance,lsl #5 // index in table 225 ldr q1,[aux1] // load only permutation for the low 16 bytes 226 tbl v0.16b,{v0.16b},v1.16b // low 16 bytes of pattern 227 str q0,[copy_dst] 228 b L_decode_command 229 230 // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them 231L_decode_long_match: 232 check_src_end // required here, since we may loop an arbitrarily high number of times 233 ldrb w_aux1,[src],#1 234 add dst,dst,aux1 235 cmp aux1,#255 236 b.eq L_decode_long_match // very unlikely 237 check_dst_end // required here, since dst was incremented by a arbitrarily high value 238 // continue to long_match 239 240 // n_matches > 16 241L_long_match: 242 cmp match_distance,#31 243 b.hi L_copy_long_match_32 244 cmp match_distance,#15 245 b.hi L_copy_long_match_16 246 247 // Copy match, n_matches >= 16, match_distance < 16: 248 // load shuffle table, and permute to replicate the pattern on 32 bytes 249L_copy_long_match_small_distance: 250 ldr q1,[copy_src] // 16 pattern bytes 251 add aux1,match_permtable,match_distance,lsl #5 // index in table 252 ldp q2,q3,[aux1] // load 32-byte permutation 253 tbl v0.16b,{v1.16b},v2.16b // low 16 bytes of pattern in q0 254 tbl v1.16b,{v1.16b},v3.16b // high 16 bytes of pattern in q1 255 ldrb w_aux1,[match_disttable,match_distance] // valid pattern length in aux1 256 // fixed 257 stp q0,q1,[copy_dst] 258 add copy_dst,copy_dst,aux1 259L_copy_long_match_small_distance_loop: 260 // loop 261 stp q0,q1,[copy_dst] 262 add copy_dst,copy_dst,aux1 263 stp q0,q1,[copy_dst] 264 add copy_dst,copy_dst,aux1 265 cmp dst,copy_dst 266 b.hi L_copy_long_match_small_distance_loop 267 b L_decode_command 268 269 // Copy match, n_matches >= 16, match_distance >= 32 270L_copy_long_match_32: 271 // fixed + loop 272 copy_1x16_and_increment copy_src,copy_dst 273L_copy_long_match_32_loop: 274 copy_1x32_and_increment copy_src,copy_dst 275 cmp dst,copy_dst 276 b.hi L_copy_long_match_32_loop 277 b L_decode_command 278 279 // Copy match, n_matches >= 16, match_distance >= 16 280L_copy_long_match_16: 281 // fixed + loop 282 copy_1x16_and_increment copy_src,copy_dst 283L_copy_long_match_16_loop: 284 copy_2x16_and_increment copy_src,copy_dst 285 cmp dst,copy_dst 286 b.hi L_copy_long_match_16_loop 287 b L_decode_command 288 289L_fail: 290 mov aux1,#-1 // FAIL 291 b L_exit 292 293L_done: 294 mov aux1,#0 // OK 295 // continue to L_exit 296 297L_exit: 298 ldp src,dst,[sp],#16 // get back src_ptr,dst_ptr from stack 299 str src_good,[src] // *src_ptr = src_good 300 str dst_good,[dst] // *dst_ptr = dst_good 301 mov x0,aux1 // x0 = return value 302 ldp x19,x20,[sp],#16 // restore 303 clear_frame_and_return 304 305// permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15 306// value(d)[i] = i%d for i = 0..31 307.p2align 6 308L_match_permtable: 309.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0 310.byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1 311.byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2 312.byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3 313.byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4 314.byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5 315.byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6 316.byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7 317.byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8 318.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9 319.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10 320.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11 321.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12 322.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13 323.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14 324.byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15 325 326// valid repeating pattern size, for each match_distance = 0 to 15 327// value(d) = 32 - (32%d), is the largest a multiple of d <= 32 328.p2align 6 329L_match_disttable: 330.byte 32,32,32,30 // 0 .. 3 331.byte 16,30,30,28 // 4 .. 7 332.byte 16,27,30,22 // 8 .. 11 333.byte 24,26,28,30 // 12 .. 15 334 335#endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 336