1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains armv7 hand optimized implementation of WKdm memory page decompressor. 31 32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); 33 34 input : 35 src_buf : address of input compressed data buffer 36 dest_buf : address of output decompressed buffer 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller 38 words : this argument is not used in the implementation 39 40 output : 41 42 the input buffer is decompressed and the dest_buf is written with decompressed data. 43 44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress armv7 assembly code WKdmCompress.s 45 46 The bit stream (*src_buf) consists of 47 a. 12 bytes header 48 b. 256 bytes for 1024 packed tags 49 c. (varying number of) words for new words not matched to dictionary word. 50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 52 53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 54 55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows 56 57 for (i=0;i<1024;i++) { 58 tag = *next_tag++ 59 switch (tag) { 60 case 0 : *dest_buf++ = 0; break; 61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; 62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; 63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break; 64 } 65 66 cclee, 11/9/12 67 68 Added zero page, single value page, sparse page, early abort optimizations 69 rsrini, 09/14/14 70 71*/ 72 73 #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding 74 75 #define ZERO 0 76 #define PARTIAL_MATCH 1 77 #define MISS_TAG 2 78 #define MATCH 3 79 80 .text 81 .syntax unified 82 .align 4 83 84 // void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes); 85 86 .globl _WKdm_decompress_new 87_WKdm_decompress_new: 88 89 /* 90 -------- symbolizing registers -------- 91 the armv7 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names. 92 */ 93 94 #define src_buf r0 95 #define dest_buf r1 96 #define scratch r2 97 #define eax r3 98 #define ebx r4 99 #define hashTable r4 100 #define ecx r5 101 #define edx r6 102 #define n_bytes r8 103 #define next_tag r12 104 #define tags_counter lr 105 #define dictionary sp 106 #define v0 q0 107 #define v1 q1 108 #define v2 q2 109 #define v3 q3 110 #define v4 q4 111 #define v5 q5 112 113 // and scratch memory for local variables 114 115 // [sp,#0] : dictionary 116 // [scratch,#0] : tempTagsArray was 64 117 // [scratch,#1024] : tempQPosArray was 1088 118 // [scratch,#2048] : tempLowBitsArray was 2112 119 120 push {r7, lr} 121 mov r7, sp 122 push {r4-r6,r8-r11} 123#if KERNEL 124 sub ecx, sp, #96 125 sub sp, sp, #96 126 vst1.64 {q0,q1},[ecx]! 127 vst1.64 {q2,q3},[ecx]! 128 vst1.64 {q4,q5},[ecx]! 129#endif 130 sub sp, sp, #64 // allocate for dictionary 131 132 mov n_bytes, r3 // save the n_bytes passed as function args 133 ldr eax, [src_buf] // read the 1st word from the header 134 mov ecx, #MZV_MAGIC 135 cmp eax, ecx // is the alternate packer used (i.e. is MZV page)? 136 bne L_default_decompressor // default decompressor was used 137 138 // Mostly Zero Page Handling... 139 // { 140 add src_buf, src_buf, 4 // skip the header 141 mov eax, dest_buf 142 mov ecx, #4096 // number of bytes to zero out 143 mov r9, #0 144 mov r10, #0 145 mov r11, #0 146 mov r12, #0 1471: 148 subs ecx, ecx, #64 149 stmia eax!, {r9-r12} 150 stmia eax!, {r9-r12} 151 stmia eax!, {r9-r12} 152 stmia eax!, {r9-r12} 153 bne 1b 154 155 mov r12, #4 // current byte position in src to read from 1562: 157 ldr eax, [src_buf], #4 // get the word 158 ldrh edx, [src_buf], #2 // get the index 159 str eax, [dest_buf, edx] // store non-0 word in the destination buffer 160 add r12, r12, #6 // 6 more bytes processed 161 cmp r12, n_bytes // finished processing all the bytes? 162 bne 2b 163 b L_done 164 // } 165 166L_default_decompressor: 167 168 /* 169 ---------------------- set up registers and PRELOAD_DICTIONARY --------------------------------- 170 */ 171 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR 172 vmov.i32 q0, #0 173 mov r8, sp 174 adr ebx, _table_2bits 175 vst1.64 {q0}, [r8]! 176 add r10, src_buf, #268 // TAGS_AREA_END 177 vst1.64 {q0}, [r8]! 178 add eax, src_buf, #12 // TAGS_AREA_START 179 vst1.64 {q0}, [r8]! 180 mov ecx, scratch // tempTagsArray 181 vst1.64 {q0}, [r8]! 182 vld1.64 {q0,q1},[ebx,:128] 183 184 185 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); 186/* 187 unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes 188 for arm64, this can be done by 189 1. read the input 32-bit word into GPR w 190 2. duplicate GPR into 4 elements in a vector register v0 191 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6} 192 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303 193*/ 194 195L_WK_unpack_2bits: 196 vld1.64 {v5}, [eax]! // read 4 32-bit words for 64 2-bit tags 197 vdup.32 v2, d10[0] // duplicate to 4 elements 198 vdup.32 v3, d10[1] // duplicate to 4 elements 199 vdup.32 v4, d11[0] // duplicate to 4 elements 200 vdup.32 v5, d11[1] // duplicate to 4 elements 201 vshl.u32 v2, v2, v0 // v0 = {0, -2, -4, -6} 202 vshl.u32 v3, v3, v0 // v0 = {0, -2, -4, -6} 203 vshl.u32 v4, v4, v0 // v0 = {0, -2, -4, -6} 204 vshl.u32 v5, v5, v0 // v0 = {0, -2, -4, -6} 205 vand v2, v2, v1 // v1 = {3,3,...,3} 206 vand v3, v3, v1 // v1 = {3,3,...,3} 207 vand v4, v4, v1 // v1 = {3,3,...,3} 208 vand v5, v5, v1 // v1 = {3,3,...,3} 209 vst1.64 {v2,v3}, [ecx,:128]! // write 64 tags into tempTagsArray 210 cmp r10, eax // TAGS_AREA_END vs TAGS_AREA_START 211 vst1.64 {v4,v5}, [ecx,:128]! // write 64 tags into tempTagsArray 212 bhi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits 213 214 215 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); 216 217 ldm src_buf, {r8,r9} // WKdm header qpos start and end 218 adr ebx, _table_4bits 219 subs r12, r9, r8 // r12 = (QPOS_AREA_END - QPOS_AREA_START)/4 220 add r8, src_buf, r8, lsl #2 // QPOS_AREA_START 221 add r9, src_buf, r9, lsl #2 // QPOS_AREA_END 222 bls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits 223 add ecx, scratch, #1024 // tempQPosArray 224 vld1.64 {v0,v1},[ebx,:128] 225 226 subs r12, r12, #1 227 bls 2f // do loop of 2 only if w14 >= 5 228L_WK_unpack_4bits: 229 vld1.64 {d4}, [r8]! // read a 32-bit word for 8 4-bit positions 230 subs r12, r12, #2 231 vmov d5, d4 232 vzip.32 d4, d5 233 vshl.u32 v2, v2, v0 // v0 = {0, -4, 0, -4} 234 vand v2, v2, v1 // v1 = {15,15,...,15} 235 vst1.64 {q2}, [ecx,:128]! 236 bhi L_WK_unpack_4bits 2372: 238 adds r12, r12, #1 239 ble 1f 240 241 ldr r12, [r8], #4 // read a 32-bit word for 8 4-bit positions 242 vdup.32 d4, r12 // duplicate to 2 elements 243 vshl.u32 v2, v2, v0 // v0 = {0, -4} 244 vand v2, v2, v1 // v1 = {15,15,...,15} 245 vst1.64 {d4}, [ecx,:64]! // write 16 tags into tempTagsArray 246 2471: 248 249 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); 250 251 ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset 252 add r8, src_buf, eax, lsl #2 // LOW_BITS_AREA_END 253 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END 254 add ecx, scratch, #2048 // tempLowBitsArray 255 add edx, scratch, #4096 // last tenbits 256 bls 1f // if START>=END, skip L_WK_unpack_3_tenbits 257 258 adr ebx, _table_10bits 259 vld1.64 {v0,v1},[ebx,:128] 260 261 mov r11, #0x03ff 262L_WK_unpack_3_tenbits: 263 ldr r12, [r9], #4 // read a 32-bit word for 3 low 10-bits 264 and lr, r11, r12 265 strh lr, [ecx], #2 266 cmp ecx, edx 267 and lr, r11, r12, lsr #10 268 beq 1f 269 strh lr, [ecx], #2 270 and lr, r11, r12, lsr #20 271 strh lr, [ecx], #2 272 273 cmp r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END 274 bhi L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 275 2761: 277 /* 278 set up before going to the main decompress loop 279 */ 280 281 mov next_tag, scratch // tempTagsArray 282 add r8, scratch, #1024 // next_qpos 283 add r11, scratch, #2048 // tempLowBitsArray 284#if defined(KERNEL) && !SLIDABLE 285 adr hashTable, L_table 286 ldr hashTable, [hashTable] 287#else 288 ldr hashTable, L_table 289L_table0: 290 ldr hashTable, [pc, hashTable] 291#endif 292 mov tags_counter, #1024 // tags_counter 293 294 b L_next 295 296 .align 4,0x90 297L_ZERO_TAG: 298 /* 299 we can only get here if w9 = 0, meaning this is a zero tag 300 *dest_buf++ = 0; 301 */ 302 subs tags_counter,tags_counter,#1 // tags_counter-- 303 str r9, [dest_buf], #4 // *dest_buf++ = 0 304 ble L_done // if next_tag >= tag_area_end, we're done 305 306 /* WKdm decompress main loop */ 307L_next: 308 ldrb r9, [next_tag], #1 // new tag 309 cmp r9, #0 310 beq L_ZERO_TAG 311 cmp r9, #2 // partial match tag ? 312 beq L_MISS_TAG 313 bgt L_EXACT_TAG 314 315L_PARTIAL_TAG: 316 /* 317 this is a partial match: 318 dict_word = dictionary[*dict_index]; 319 dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; 320 */ 321 ldrb edx, [r8], #1 // qpos = *next_qpos++ 322 ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++ 323 ldr eax, [dictionary, edx, lsl #2] // read dictionary word 324 subs tags_counter,tags_counter,#1 // tags_counter-- 325 lsr eax, eax, #10 // clear lower 10 bits 326 orr eax, ecx, eax, lsl #10 // pad the lower 10-bits from *next_low_bits 327 str eax, [dictionary,edx,lsl #2] // *dict_location = newly formed word 328 str eax, [dest_buf], #4 // *dest_buf++ = newly formed word 329 bgt L_next // repeat loop until next_tag==tag_area_end 330 331L_done: 332 333 add sp, sp, #64 // deallocate for dictionary 334 335 // release stack memory, restore registers, and return 336#if KERNEL 337 vld1.64 {q0,q1},[sp]! 338 vld1.64 {q2,q3},[sp]! 339 vld1.64 {q4,q5},[sp]! 340#endif 341 pop {r4-r6,r8-r11} 342 pop {r7,pc} 343 344 .align 4,0x90 345L_MISS_TAG: 346 /* 347 this is a dictionary miss. 348 x = *new_word++; 349 k = (x>>10)&255; 350 k = hashTable[k]; 351 dictionary[k] = *dest_buf++ = x; 352 */ 353 subs tags_counter,tags_counter,#1 // tags_counter-- 354 ldr eax, [r10], #4 // w = *next_full_patt++ 355 lsr edx, eax, #10 // w>>10 356 str eax, [dest_buf], #4 // *dest_buf++ = word 357 and edx, edx, #0x0ff // 8-bit hash table index 358 ldrb edx, [ebx, edx] // qpos 359 str eax, [dictionary,edx] // dictionary[qpos] = word 360 bgt L_next // repeat the loop 361 b L_done // if next_tag >= tag_area_end, we're done 362 363 .align 4,0x90 364 365L_EXACT_TAG: 366 /* 367 this is an exact match; 368 *dest_buf++ = dictionary[*dict_index++]; 369 */ 370 371 ldrb eax, [r8], #1 // qpos = *next_qpos++ 372 subs tags_counter,tags_counter,#1 // tags_counter-- 373 ldr eax, [dictionary,eax,lsl #2] // w = dictionary[qpos] 374 str eax, [dest_buf], #4 // *dest_buf++ = w 375 bgt L_next // repeat the loop 376 b L_done // if next_tag >= tag_area_end, we're done 377 378 379 .align 4 380 381_table_2bits: 382 .word 0 383 .word -2 384 .word -4 385 .word -6 386 .word 0x03030303 387 .word 0x03030303 388 .word 0x03030303 389 .word 0x03030303 390 391_table_4bits: 392 .word 0 393 .word -4 394 .word 0 395 .word -4 396 .word 0x0f0f0f0f 397 .word 0x0f0f0f0f 398 .word 0x0f0f0f0f 399 .word 0x0f0f0f0f 400 401_table_10bits: 402 .word 0 403 .word -10 404 .word -20 405 .word 0 406 .word 1023 407 .word 1023 408 .word 1023 409 .word 0 410 411 412#if defined(KERNEL) && !SLIDABLE 413 .align 2 414L_table: 415 .long _hashLookupTable_new 416#else 417 .align 2 418L_table: 419 .long L_Tab$non_lazy_ptr-(L_table0+8) 420 421 .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers 422 .align 2 423L_Tab$non_lazy_ptr: 424 .indirect_symbol _hashLookupTable_new 425 .long 0 426#endif 427 428