1/* 2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor. 31 32 void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words); 33 34 input : 35 src_buf : address of input compressed data buffer 36 dest_buf : address of output decompressed buffer 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller 38 words : this argument is not used in the implementation 39 40 output : 41 42 the input buffer is decompressed and the dest_buf is written with decompressed data. 43 44 Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s 45 46 The bit stream (*src_buf) consists of 47 a. 12 bytes header 48 b. 256 bytes for 1024 packed tags 49 c. (varying number of) words for new words not matched to dictionary word. 50 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 51 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 52 53 where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 54 55 The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows 56 57 for (i=0;i<1024;i++) { 58 tag = *next_tag++ 59 switch (tag) { 60 case 0 : *dest_buf++ = 0; break; 61 case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; 62 case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; 63 case 3 : *dest_buf++ = dictionary[*dict_index++]; break; 64 } 65 66 cclee, 11/30/12 67 68 Added zero page, single value page, sparse page, early abort optimizations 69 rsrini, 09/14/14 70*/ 71 72#define MZV_MAGIC $17185 // magic value used to identify MZV page encoding 73 74 .text 75 76 .globl _WKdm_decompress_new 77_WKdm_decompress_new: 78 79 // save registers, and allocate stack memory for local variables 80 81 pushq %rbp 82 movq %rsp, %rbp 83 pushq %r12 84 pushq %r13 85 pushq %rbx 86 87 subq $(64+8+16), %rsp 88 89 movl 0(%rdi), %eax // read the 1st word from the header 90 cmpl MZV_MAGIC, %eax // is the alternate packer used (i.e. is MZV page)? 91 jne L_default_decompressor // default decompressor was used 92 93 // Mostly Zero Page Handling... 94 // { 95 movq $0, %rax 961: // Zero out the entire page 97 movq $0, 0(%rsi, %rax) 98 movq $0, 8(%rsi, %rax) 99 movq $0, 16(%rsi, %rax) 100 movq $0, 24(%rsi, %rax) 101 movq $0, 32(%rsi, %rax) 102 movq $0, 40(%rsi, %rax) 103 movq $0, 48(%rsi, %rax) 104 movq $0, 56(%rsi, %rax) 105 addq $64, %rax 106 cmpq $4096, %rax 107 jne 1b 108 109 movq $4, %r12 // current byte position in src to read from 1102: 111 movl 0(%rdi, %r12), %eax // get the word 112 movzwq 4(%rdi, %r12), %rdx // get the index 113 movl %eax, 0(%rsi, %rdx) // store non-0 word in the destination buffer 114 addq $6, %r12 // 6 more bytes processed 115 cmpl %ecx, %r12d // finished processing all the bytes? 116 jne 2b 117 jmp L_done 118 // } 119 120L_default_decompressor: 121 122 movq %rsi, %r12 // dest_buf 123 movq %rdx, %r13 // scratch_buf 124 125 // PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp) 126 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR 127#if 1 128 movl $0, 0(%rsp) 129 movl $0, 4(%rsp) 130 movl $0, 8(%rsp) 131 movl $0, 12(%rsp) 132 movl $0, 16(%rsp) 133 movl $0, 20(%rsp) 134 movl $0, 24(%rsp) 135 movl $0, 28(%rsp) 136 movl $0, 32(%rsp) 137 movl $0, 36(%rsp) 138 movl $0, 40(%rsp) 139 movl $0, 44(%rsp) 140 movl $0, 48(%rsp) 141 movl $0, 52(%rsp) 142 movl $0, 56(%rsp) 143 movl $0, 60(%rsp) 144#else 145 mov $0x100000001, %rax 146 mov %rax, (%rsp) 147 mov %rax, 8(%rsp) 148 mov %rax, 16(%rsp) 149 mov %rax, 24(%rsp) 150 mov %rax, 32(%rsp) 151 mov %rax, 40(%rsp) 152 mov %rax, 48(%rsp) 153 mov %rax, 56(%rsp) 154#endif 155 156 // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); 157 158 leaq 268(%rdi), %r10 // TAGS_AREA_END 159 leaq 12(%rdi), %rax // TAGS_AREA_START 160 movq %r13, %rsi // tempTagsArray 161 cmpq %rax, %r10 // TAGS_AREA_END vs TAGS_AREA_START 162 jbe 1f // if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits 163 movq %r13, %rcx // next_word 164 xorl %r8d, %r8d // i = 0 165 mov $(50529027<<32)+50529027, %r9 166L_WK_unpack_2bits: 167 movl 12(%rdi,%r8, 4), %eax 168 movl 12(%rdi,%r8, 4), %edx 169 shrl $2, %eax 170 shlq $32, %rax 171 orq %rdx, %rax 172 movq %rax, %rdx 173 shrq $4, %rax 174 andq %r9, %rdx 175 andq %r9, %rax 176 incq %r8 // i++ 177 movq %rdx, (%rcx) 178 movq %rax, 8(%rcx) 179 addq $16, %rcx // next_tags += 16 180 cmpq $64, %r8 // i vs 64 181 jne L_WK_unpack_2bits // repeat loop until i==64 1821: 183 184 185 // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); 186 187 mov 4(%rdi), %eax // WKdm header qpos end 188 leaq (%rdi,%rax,4), %r9 // QPOS_AREA_END 189 mov 0(%rdi), %eax // WKdm header qpos start 190 leaq (%rdi,%rax,4), %r8 // QPOS_AREA_START 191 leaq 1024(%r13), %rbx // tempQPosArray 192 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START 193 jbe 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits 194 leaq 8(%rbx), %rcx // next_qpos 195 196 mov $(252645135<<32)+252645135, %r11 197L_WK_unpack_4bits: 198 movl (%r8), %eax // w = *next_word 199 movl %eax, %edx // w 200 shlq $28, %rax 201 orq %rdx, %rax 202 addq $4, %r8 // next_word++ 203 andq %r11, %rax 204 movq %rax, -8(%rcx) 205 addq $8, %rcx // next_qpos+=8 206 cmpq %r8, %r9 // QPOS_AREA_END vs QPOS_AREA_START 207 ja L_WK_unpack_4bits // repeat loop until QPOS_AREA_END <= QPOS_AREA_START 208 209 2101: 211 212 // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); 213 214 movl 8(%rdi), %eax // LOW_BITS_AREA_END offset 215 leaq (%rdi,%rax,4), %rdi // LOW_BITS_AREA_END 216 leaq 2048(%r13), %r11 // tempLowBitsArray 217 leaq 4094(%r13), %r13 // final tenbits addr 218 sub %r9, %rdi // LOW_BITS_AREA_START vs LOW_BITS_AREA_END 219 jle 1f // if START>=END, skip L_WK_unpack_3_tenbits 220 movq %r11, %rcx // next_low_bits 221L_WK_unpack_3_tenbits: 222 movl (%r9), %eax // w = *next_word, 0:c:b:a 223 movl $(1023<<10), %edx 224 movl $(1023<<20), %r8d 225 andl %eax, %edx // b << 10 226 andl %eax, %r8d // c << 20 227 andq $1023, %rax 228 shll $6, %edx 229 shlq $12, %r8 230 orl %edx, %eax 231 orq %r8, %rax 232 cmp %r13, %rcx 233 je 2f 234 mov %rax, (%rcx) 235 jmp 3f 2362: mov %ax, (%rcx) 2373: 238 addq $4, %r9 // next_word++ 239 addq $6, %rcx // next_low_bits += 3 240 sub $4, %rdi 241 jg L_WK_unpack_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word 2421: 243 244 245 #define next_qpos %rbx 246 #define hash %r8 247 #define tags_counter %edi 248 #define dest_buf %r12 249 #define next_full_patt %r10 250 251 leaq _hashLookupTable_new(%rip), hash // hash look up table 252 movl $1024, tags_counter // tags_counter 253 jmp L_next 254 255 .align 4,0x90 256L_nonpartital: 257 jl L_ZERO_TAG 258 cmpb $2, -1(%rsi) 259 je L_MISS_TAG 260 261L_EXACT_TAG: 262 movzbl (next_qpos), %eax // qpos = *next_qpos 263 incq next_qpos // next_qpos++ 264 decl tags_counter // tags_counter-- 265 movl (%rsp,%rax,4), %eax // w = dictionary[qpos] 266 movl %eax, -4(dest_buf) // *dest_buf = w 267 je L_done 268 269L_next: 270 incq %rsi // next_tag++ 271 addq $4, dest_buf 272 cmpb $1, -1(%rsi) 273 jne L_nonpartital 274 275L_PARTIAL_TAG: 276 movzbl (next_qpos),%edx // qpos = *next_qpos 277 incq next_qpos // next_qpos++ 278 movl (%rsp,%rdx,4), %eax // read dictionary word 279 andl $-1024, %eax // clear lower 10 bits 280 or (%r11), %ax // pad the lower 10-bits from *next_low_bits 281 addq $2, %r11 // next_low_bits++ 282 decl tags_counter // tags_counter-- 283 movl %eax, (%rsp,%rdx,4) // *dict_location = newly formed word 284 movl %eax, -4(dest_buf) // *dest_buf = newly formed word 285 jg L_next // repeat loop until next_tag==tag_area_end 286 287L_done: 288 289 // release stack memory, restore registers, and return 290 291 addq $(64+8+16), %rsp 292 popq %rbx 293 popq %r13 294 popq %r12 295 leave 296 ret 297 298 .align 4,0x90 299L_MISS_TAG: 300 movl (next_full_patt), %edx // w = *next_full_patt 301 movl (next_full_patt), %eax // w = *next_full_patt 302 shrl $10, %edx // w>>10 303 addq $4, next_full_patt // next_full_patt++ 304 movzbl %dl, %edx // 8-bit hash table index 305 movl %eax, -4(dest_buf) // *dest_buf = word 306 movzbl (hash,%rdx),%edx // qpos 307 decl tags_counter // tags_counter-- 308 movl %eax, (%rsp,%rdx) // dictionary[qpos] = word 309 jg L_next // repeat the loop 310 jmp L_done 311 312 .align 4,0x90 313L_ZERO_TAG: 314 decl tags_counter // tags_counter-- 315 movl $0, -4(dest_buf) // *dest_buf = 0 316 jg L_next // repeat the loop 317 jmp L_done 318 319 320