1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains arm64 hand optimized implementation of WKdm memory page decompressor. 31 32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word *scratch, __unused__ unsigned int words); 33 34 input : 35 src_buf : address of input compressed data buffer 36 dest_buf : address of output decompressed buffer 37 scratch : an 8-k bytes scratch mempro provided by the caller 38 words : this argument is not used in the implementation 39 (The 4th argument is, in fact, used by the Mostly Zero Value decoder) 40 41 output : 42 43 the input buffer is decompressed and the dest_buf is written with decompressed data. 44 45 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress arm64 assembly code WKdmCompress.s 46 47 The bit stream (*src_buf) consists of 48 a. 12 bytes header 49 b. 256 bytes for 1024 packed tags 50 c. (varying number of) words for new words not matched to dictionary word. 51 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 52 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 53 54 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) of the bit stream of c,d,e, respectively. 55 56 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows 57 58 for (i=0;i<1024;i++) { 59 tag = *next_tag++ 60 switch (tag) { 61 case 0 : *dest_buf++ = 0; break; 62 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; 63 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; 64 case 3 : *dest_buf++ = dictionary[*dict_index++]; break; 65 } 66 67 cclee, Nov 9, '12 68 69 Added zero page, single value page, sparse page, early abort optimizations 70 rsrini, 09/14/14 71*/ 72 73#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding 74 75#ifndef PAGES_SIZE_IN_KBYTES 76#define PAGES_SIZE_IN_KBYTES 4 77#endif 78 79#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16)) 80#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported" 81#endif 82 83#define scale (PAGES_SIZE_IN_KBYTES/4) 84 85 86 .align 4 87 .text 88 89/* 90 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes); 91*/ 92 93 .globl _WKdm_decompress_4k 94_WKdm_decompress_4k: 95 96 /* 97 -------- symbolizing registers -------- 98 the arm64 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names. 99 */ 100 101 #define src_buf x0 102 #define dest_buf x1 103 #define scratch x2 104 #define n_bytes x3 105 #define dictionary sp 106 #define rax x13 107 #define eax w13 108 #define rbx x4 109 #define ebx w4 110 #define rcx x5 111 #define ecx w5 112 #define rdx x6 113 #define edx w6 114 #define tags_counter x7 115 #define next_tag x12 116 #define r8 x8 117 #define r9 x9 118 #define r10 x10 119 #define r11 x11 120 #define r12 x12 121 122 /* 123 124 ------ scratch memory for local variables --------- 125 126 [sp,#0] : dictionary 127 [scratch,#0] : tempTagsArray 128 [scratch,#1024] : tempQPosArray 129 [scratch,#2048] : tempLowBitsArray 130 131 */ 132 133#if KERNEL 134 sub rax, sp, #96 135 sub sp, sp, #96 136 st1.4s {v0,v1,v2},[rax],#48 137 st1.4s {v3,v4,v5},[rax],#48 138#endif 139 140 sub sp, sp, #64 141 142 ldr eax, [src_buf] // read the 1st word from the header 143 mov ecx, #MZV_MAGIC 144 cmp eax, ecx // is the alternate packer used (i.e. is MZV page)? 145 b.ne L_default_decompressor // default decompressor was used 146 147 // Mostly Zero Page Handling... 148 // { 149 add src_buf, src_buf, 4 // skip the header 150 mov rax, dest_buf 151 mov rcx, #(PAGES_SIZE_IN_KBYTES*1024) // number of bytes to zero out 1521: 153 dc zva, rax // zero 64 bytes. since dest_buf is a page, it will be 4096 or 16384 byte aligned 154 add rax, rax, #64 155 dc zva, rax 156 add rax, rax, #64 157 dc zva, rax 158 add rax, rax, #64 159 dc zva, rax 160 add rax, rax, #64 161 subs rcx, rcx, #256 162 b.ne 1b 163 164 mov r12, #4 // current byte position in src to read from 165 mov rdx, #0 1662: 167 ldr eax, [src_buf], #4 // get the word 168 ldrh edx, [src_buf], #2 // get the index 169 str eax, [dest_buf, rdx] // store non-0 word in the destination buffer 170 add r12, r12, #6 // 6 more bytes processed 171 cmp r12, n_bytes // finished processing all the bytes? 172 b.ne 2b 173 b L_done 174 // } 175 176L_default_decompressor: 177 178 /* 179 ---------------------- set up registers and PRELOAD_DICTIONARY --------------------------------- 180 */ 181 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR 182 adrp rbx, _table_2bits@GOTPAGE 183 stp xzr, xzr, [dictionary, #0] 184 add r10, src_buf, #(12+256*scale) // TAGS_AREA_END 185 stp xzr, xzr, [dictionary, #16] 186 add rax, src_buf, #12 // TAGS_AREA_START 187 ldr rbx, [rbx, _table_2bits@GOTPAGEOFF] 188 stp xzr, xzr, [dictionary, #32] 189 mov rcx, scratch // tempTagsArray 190 stp xzr, xzr, [dictionary, #48] 191 ld1.4s {v0,v1},[rbx] 192 193 194 /* 195 ------------------------------ unpacking bit stream ---------------------------------- 196 */ 197 198 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); 199/* 200 unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes 201 for arm64, this can be done by 202 1. read the input 32-bit word into GPR w 203 2. duplicate GPR into 4 elements in a vector register v0 204 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6} 205 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303 206*/ 207 208L_WK_unpack_2bits: 209 ldr q5, [rax], #16 // read 4 32-bit words for 64 2-bit tags 210 dup.4s v2, v5[0] // duplicate to 4 elements 211 dup.4s v3, v5[1] // duplicate to 4 elements 212 dup.4s v4, v5[2] // duplicate to 4 elements 213 dup.4s v5, v5[3] // duplicate to 4 elements 214 ushl.4s v2, v2, v0 // v1 = {0, -2, -4, -6} 215 ushl.4s v3, v3, v0 // v1 = {0, -2, -4, -6} 216 ushl.4s v4, v4, v0 // v1 = {0, -2, -4, -6} 217 ushl.4s v5, v5, v0 // v1 = {0, -2, -4, -6} 218 and.16b v2, v2, v1 // v2 = {3,3,...,3} 219 and.16b v3, v3, v1 // v2 = {3,3,...,3} 220 and.16b v4, v4, v1 // v2 = {3,3,...,3} 221 and.16b v5, v5, v1 // v2 = {3,3,...,3} 222 cmp r10, rax // TAGS_AREA_END vs TAGS_AREA_START 223 st1.4s {v2,v3,v4,v5}, [rcx], #64 // write 64 tags into tempTagsArray 224 b.hi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits 225 226 227 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); 228 229 ldp w8, w9, [src_buf] // WKdm header qpos start and end 230 adrp rbx, _table_4bits@GOTPAGE 231 subs x14, r9, r8 // x14 = (QPOS_AREA_END - QPOS_AREA_START)/4 232 add r8, src_buf, r8, lsl #2 // QPOS_AREA_START 233 add r9, src_buf, r9, lsl #2 // QPOS_AREA_END 234 235 b.ls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits 236 ldr rbx, [rbx, _table_4bits@GOTPAGEOFF] 237 add rcx, scratch, #(1024*scale) // tempQPosArray 238 ld1.4s {v0,v1},[rbx] 239 subs w14, w14, #1 240 b.ls 2f // do loop of 2 only if w14 >= 5 241L_WK_unpack_4bits: 242 ldr d2, [r8], #8 // read a 32-bit word for 8 4-bit positions 243 subs w14, w14, #2 244 zip1.4s v2, v2, v2 245 ushl.4s v2, v2, v0 // v1 = {0, -4, 0, -4} 246 and.16b v2, v2, v1 // v2 = {15,15,...,15} 247 str q2, [rcx], #16 248 b.hi L_WK_unpack_4bits 2492: 250 adds w14, w14, #1 251 b.le 1f 252 253 ldr s3, [r8], #4 // read a 32-bit word for 8 4-bit positions 254 dup.2s v2, v3[0] // duplicate to 2 elements 255 ushl.2s v2, v2, v0 // v1 = {0, -4} 256 and.8b v2, v2, v1 // v2 = {15,15,...,15} 257 str d2, [rcx], #8 // write 16 tags into tempTagsArray 258 2591: 260 261 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); 262 263 ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset 264 add r8, src_buf, rax, lsl #2 // LOW_BITS_AREA_END 265 add rcx, scratch, #(2048*scale) // tempLowBitsArray 266#if (scale==1) 267 add r11, scratch, #(4096*scale-2) // final tenbits for the rare case 268#else 269 add r11, scratch, #(4096*scale) // final tenbits for the rare case 270 sub r11, r11, #2 271#endif 272 subs r8, r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END 273 b.ls 1f // if START>=END, skip L_WK_unpack_3_tenbits 274 275 adrp rbx, _table_10bits@GOTPAGE 276 ldr rbx, [rbx, _table_10bits@GOTPAGEOFF] 277 ld1.4s {v0,v1,v2,v3},[rbx] 278 279 /* 280 a very rare case : 1024 tenbits, 1023 + 1 -> 341 + final 1 that is padded with 2 zeros 281 since the scratch memory is 4k (2k for this section), we need to pay attention to the last case 282 so we don't overwrite to the scratch memory 283 284 we 1st do a single 3_tenbits, followed by 2x_3_tenbits loop, and detect whether the last 3_tenbits 285 hits the raee case 286 */ 287#if 1 288 subs r8, r8, #4 // pre-decrement by 8 289 ldr s4, [r9], #4 // read 32-bit words for 3 low 10-bits 290 zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. 291 ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. 292 ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. 293 and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. 294 and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets 295 orr.16b v4, v4, v5 // combine data 296 str d4, [rcx], #6 // write 3 low 10-bits 297 b.eq 1f 298#endif 299 300 subs r8, r8, #8 // pre-decrement by 8 301 b.lt L_WK_unpack_3_tenbits 302 303L_WK_unpack_2x_3_tenbits: 304 ldr d4, [r9], #8 // read 2 32-bit words for a pair of 3 low 10-bits 305 zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. 306 ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. 307 ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. 308 and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. 309 and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets 310 orr.16b v4, v4, v5 // combine data 311 ins v5.d[0], v4.d[1] 312 str d4, [rcx], #6 // write 3 low 10-bits 313 str d5, [rcx], #6 // write 3 low 10-bits 314 315 subs r8, r8, #8 316 b.ge L_WK_unpack_2x_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 317 318 tst r8, #4 319 b.eq 1f 320 321L_WK_unpack_3_tenbits: 322 ldr s4, [r9] // read 32-bit words for 3 low 10-bits 323 zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. 324 ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. 325 ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. 326 and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. 327 and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets 328 orr.16b v4, v4, v5 // combine data 329#if 0 330 str d4, [rcx] // write 3 low 10-bits 331#else 332 cmp rcx, r11 333 b.eq 2f 334 str d4, [rcx] // write 3 low 10-bits 335 b 1f 3362: 337 str h4, [rcx] // write final 1 low 10-bits 338#endif 3391: 340 341 /* 342 set up before going to the main decompress loop 343 */ 344 mov next_tag, scratch // tempTagsArray 345 add r8, scratch, #(1024*scale) // next_qpos 346 add r11, scratch, #(2048*scale) // tempLowBitsArray 347 adrp rbx, _hashLookupTable@GOTPAGE 348 mov tags_counter, #(1024*scale) // tag_area_end 349 ldr rbx, [rbx, _hashLookupTable@GOTPAGEOFF] 350 351 b L_next 352 353 .align 4,0x90 354L_ZERO_TAG: 355 /* 356 we can only get here if w9 = 0, meaning this is a zero tag 357 *dest_buf++ = 0; 358 */ 359 str w9, [dest_buf], #4 // *dest_buf++ = 0 360 subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end 361 b.ls L_done // if next_tag >= tag_area_end, we're done 362 363 /* WKdm decompress main loop */ 364L_next: 365 ldrb w9, [next_tag], #1 // new tag 366 cbz w9, L_ZERO_TAG 367 cmp w9, #2 // partial match tag ? 368 b.eq L_MISS_TAG 369 b.gt L_EXACT_TAG 370 371L_PARTIAL_TAG: 372 /* 373 this is a partial match: 374 dict_word = dictionary[*dict_index]; 375 dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; 376 */ 377 378 ldrb edx, [r8], #1 // qpos = *next_qpos++ 379 ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++ 380 ldr eax, [dictionary, rdx, lsl #2] // read dictionary word 381 bfm eax, ecx, #0, #9 // pad the lower 10-bits from *next_low_bits 382 str eax, [dictionary,rdx,lsl #2] // *dict_location = newly formed word 383 str eax, [dest_buf], #4 // *dest_buf++ = newly formed word 384 subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end 385 b.gt L_next // repeat loop until next_tag==tag_area_end 386 387L_done: 388 389 // release stack memory, restore registers, and return 390 add sp, sp, #64 // deallocate for dictionary 391#if KERNEL 392 ld1.4s {v0,v1,v2},[sp],#48 393 ld1.4s {v3,v4,v5},[sp],#48 394#endif 395 ret lr 396 397 .align 4,0x90 398L_MISS_TAG: 399 /* 400 this is a dictionary miss. 401 x = *new_word++; 402 k = (x>>10)&255; 403 k = hashTable[k]; 404 dictionary[k] = *dest_buf++ = x; 405 */ 406 ldr eax, [r10], #4 // w = *next_full_patt++ 407 ubfm edx, eax, #10, #17 // 8-bit hash table index 408 str eax, [dest_buf], #4 // *dest_buf++ = word 409 ldrb edx, [rbx, rdx] // qpos 410 str eax, [dictionary,rdx] // dictionary[qpos] = word 411 subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end 412 b.gt L_next // repeat the loop 413 b L_done // if next_tag >= tag_area_end, we're done 414 415 .align 4,0x90 416L_EXACT_TAG: 417 /* 418 this is an exact match; 419 *dest_buf++ = dictionary[*dict_index++]; 420 */ 421 ldrb eax, [r8], #1 // qpos = *next_qpos++ 422 ldr eax, [dictionary,rax,lsl #2] // w = dictionary[qpos] 423 str eax, [dest_buf], #4 // *dest_buf++ = w 424 subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end 425 b.gt L_next // repeat the loop 426 b L_done // if next_tag >= tag_area_end, we're done 427 428 429