1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains arm64 hand optimized implementation of WKdm memory page compressor. 31 32 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); 33 34 input : 35 src_buf : address of input page (length = 1024 words) 36 dest_buf : address of output buffer (may not be 16-byte aligned) 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller, 38 bytes_budget : a given byte target in compression 39 40 output : 41 42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data 43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget. 44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area. 45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined. 46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault. 47 48 The 4th argument bytes_budget is the target compress budget in bytes. 49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes. 50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed). 51 52 WKdm Compression algorithm is briefly stated as follows: 53 54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function. 55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits). 56 a. k = (x>>10)&255; // 8-bit hash table index 57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed 58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated 59 60 Each input word x is classified/tagged into 4 classes : 61 0 : x = 0 62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word 63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss 64 3 : (x == dict_word), the exact input word is in the dictionary 65 66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word. 67 0 : 2-bits tag (32->2 compression) 68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression) 69 2 : 2-bits tag + 32-bits new word (32->34 expansion) 70 3 : 2-bits tag + 4-bits dict_index (32->6 compression) 71 72 It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6). 73 74 the output bit stream (*dest_buf) consists of 75 a. 12 bytes header 76 b. 256 bytes for 1024 packed tags 77 c. (varying number of) words for new words not matched to dictionary word. 78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 80 81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word. 83 84 85 The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer. 86 87 The temp buffers are 88 89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing 90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing 91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing 92 93 Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to 94 the destination buffer. 95 96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags. 97 98 Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass. 99 100 byte_count -= 12 + 256; // bit budget minus header and tags 101 102 whenever an input word is classified as class 103 104 2 : byte_count -= 4; 105 106 the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0). 107 108 without showing the bit budget management, the pseudo code is given as follows: 109 110 uint8_t *tags=tempTagsArray; 111 uint8_t *dict=tempQPosArray; 112 uint8_t *partial=tempLowBitsArray; 113 114 for (i=0;i<1024;i++) { 115 x = *src_buf++; 116 if (x == 0) { // zero, 2-bits tag 117 *tags++ = 0; 118 } else { 119 120 // find dict_index and dict_word from x 121 k = (x>>10)&255; 122 dict_index = hashTable[k]; 123 dict_word = dictionary[dict_index]; 124 125 if (dict_word == x) { // exactly match 126 // 2-bits tag + 4-bits table index 127 *tags++ = 3; 128 *dict++ = dict_index; 129 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched 130 // 2-bits tag + 4-bits table index + 10-bits lower partial 131 *tags++ = 1; 132 *dict++ = dict_index; 133 *partial++ = x &0x3ff; 134 dictionary[dict_index] = x; 135 } else { // not matched 136 // 2-bits tag + 32-bits new word 137 *tags++ = 2; 138 *new_word++ = x; 139 dictionary[dict_index] = x; 140 } 141 } 142 } 143 144 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf: 145 146 1. 1024 tags are packed into 256 bytes right after the 12-bytes header 147 2. dictionary indices (4-bits each) are packed into are right after the new words section 148 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section. 149 150 cclee, 11/9/12 151 152 Added zero page, single value page, sparse page, early abort optimizations 153 rsrini, 09/14/14 154*/ 155 156#ifndef PAGES_SIZE_IN_KBYTES 157#define PAGES_SIZE_IN_KBYTES 4 158#endif 159 160#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16)) 161#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported" 162#endif 163 164 165 .text 166 .align 4 167 168/* 169 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); 170*/ 171 172.globl _WKdm_compress_4k 173_WKdm_compress_4k: 174 175/* 176 ------------------------- symbolizing register use ----------------------------------- 177*/ 178 #define src_buf x0 179 #define next_input_word x0 180 #define dest_buf x1 181 #define scratch x2 182 #define byte_count x3 183 #define next_tag x4 184 #define tempTagsArray x2 // scratch 185 #define dictionary x5 186 #define remaining x6 187 #define next_full_patt x7 188 #define dict_location x8 189 #define wdict_location w8 190 #define next_qp x9 191 #define hashTable x10 192 #define tempQPosArray x11 193 #define next_low_bits x12 194 195/* 196 this arm64 assembly code is ported from x86_64 assembly code, 197 therefore need such symbolization to quickly reuse the x86_64 assembly code 198 for these intermediate/temporary register use 199*/ 200 #define rax x13 201 #define eax w13 202 #define rcx x14 203 #define ecx w14 204 #define rdx x15 205 #define edx w15 206 #define rdi x0 /* after some point, x0/rdi becomes free other usage */ 207 208 209/* 210 ------------------------- scratch memory -------------------------------------- 211 212 need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray) 213 total 6208 bytes 214 [sp,#0] : dictionary 215 [scratch,#0] : tempTagsArray 216 [scratch,#1024] : tempQPosArray 217 [scratch,#2048] : tempLowBitsArray 218*/ 219 220#define scale (PAGES_SIZE_IN_KBYTES/4) 221 222#define SV_RETURN 0 // return value when SV, ZV page is found 223#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding 224#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096] 225#define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words 226#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data 227#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing .. 228 // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096 229#if CHKPT_BYTES > 4096 230 #error CHKPT_BYTES must be <= 4096 231#endif 232#if CHKPT_BYTES < 4 233 #error CHKPT_BYTES must be >= 4 234#endif 235 236#if KERNEL 237 sub sp, sp, #64 238 st1.4s {v0,v1,v2,v3},[sp] 239#endif 240 241 sub sp, sp, #64 // allocate for dictionary 242 mov dictionary, sp // use x5 to point to sp, so we can use sub xd, xn, sp 243 244 sub sp, sp, #64 // allocate space for saving callee-saved registers 245 mov x15, sp 246 stp x20, x21, [x15, #0] // save x20, x21 247 stp x22, x23, [x15, #16] // save x22, x23 248 stp x24, x25, [x15, #32] // save x24, x25 249 stp x26, x27, [x15, #48] // save x26, x27 250 251/* 252 ------- entwined statck space allocation, registers set up, and PRELOAD_DICTIONARY ------------------- 253*/ 254 255 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO 256 // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES 257 mov next_tag, tempTagsArray // &tempTagsArray[0] 258 add next_qp, scratch, #(1024*scale) // next_qp 259 mov remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint 260 add next_full_patt, dest_buf, #(12+256*scale) // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 261 sub byte_count, byte_count, #(12+256*scale) // bit_count - header - tags 262 add next_low_bits, scratch, #(2048*scale) // &tempLowBitsArray[0] 263 stp xzr, xzr, [dictionary, #0] // initialize dictionary 264 adrp hashTable, _hashLookupTable@GOTPAGE 265 stp xzr, xzr, [dictionary, #16] // initialize dictionary 266 stp xzr, xzr, [dictionary, #32] // initialize dictionary 267 ldr hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF] 268 stp xzr, xzr, [dictionary, #48] // initialize dictionary 269 270#define EARLYCHECK 0 271#define NORMAL 1 272 273#define mode w20 274#define start_next_full_patt x21 275#define start_next_input_word x22 276#define start_next_low_bits x23 277#define r11 x24 278#define r13 x25 279#define byte_budget x26 280#define start_next_qp tempQPosArray 281 282 add tempQPosArray, scratch, #(1024*scale) // &tempQPosArray[0] 283 mov mode, EARLYCHECK // indicate we are yet to evaluate the early aborts 284 mov start_next_full_patt, next_full_patt // remember the start of next_full_patt 285 mov start_next_input_word, next_input_word // remember the start of next_input_word 286 mov start_next_low_bits, next_low_bits // remember the start of next_low_bit 287 add byte_budget, byte_count, #(12+256*scale) // remember the byte budget 288 289 b L_loop 290 291 .align 4, 0x90 292 293 /* we've just detected a zero input word in edx */ 294L_RECORD_ZERO: 295 strb edx, [next_tag], #1 // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0 296 subs remaining, remaining, #1 // remaing--; 297 b.le CHECKPOINT // if remaining = 0, break 298 299 /* -------------- scan/tag pass loop ------------------------- */ 300L_loop: 301 302 /* load new input word to edx */ 303 ldr edx, [next_input_word], #4 304 cbz edx, L_RECORD_ZERO // if (input_word==0) RECORD_ZERO 305 306 /* 307 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location 308 */ 309 ubfm eax, edx, #10, #17 310 ldrb wdict_location, [hashTable, rax] // HASH_TO_DICT_BYTE_OFFSET(input_word) 311 ldr eax, [dictionary, dict_location] // dict_word = *dict_location; 312 313 /* detect whether we match input to its corresponding dictionary word */ 314 eor eax, eax, edx // dict_word vs input_word 315 cbz eax, L_RECORD_EXACT // if identical, RECORD_EXACT 316 lsr eax, eax, #10 // HIGH_BITS(dict_word^input_word) 317 cbz eax, L_RECORD_PARTIAL // if identical, RECORD_PARTIAL 318 319L_RECORD_MISS: 320/* 321 if we are here, the input word can not be derived from the dictionary, 322 we write the input word as a new word, 323 and update the dictionary with this new word 324*/ 325 subs byte_count, byte_count, #4 // byte_count -= 4 326 b.le L_budgetExhausted // return -1 to signal this page is not compressable 327 str edx, [next_full_patt], #4 // *next_full_patt++ = input_word; 328 mov eax, #2 // tag for MISS 329 subs remaining, remaining, #1 // remaing--; 330 str edx, [dictionary, dict_location] // *dict_location = input_word 331 strb eax, [next_tag], #1 // *next_tag++ = 2 for miss 332 b.gt L_loop // // if remaining > 0, repeat 333 b CHECKPOINT 334 335L_done_search: 336 337 // SET_QPOS_AREA_START(dest_buf,next_full_patt); 338 /* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */ 339 340 sub rax, next_full_patt, dest_buf // next_full_patt - dest_buf 341 lsr eax, eax, #2 // offset in 4-bytes 342 str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf 343 344 /* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/ 345 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); 346 347 add rdi, dest_buf, #12 // dest_buf 348 mov rcx, tempTagsArray // &tempTagsArray[0] 349 350L_pack_2bits: 351 ld1.2s {v0,v1,v2,v3},[rcx],#32 352 353 shl.2d v1,v1,#4 354 shl.2d v3,v3,#4 355 356 orr.8b v0, v0, v1 357 orr.8b v2, v2, v3 358 359 ushr.2d v1, v0, #30 360 ushr.2d v3, v2, #30 361 362 orr.8b v0, v0, v1 363 orr.8b v2, v2, v3 364 365 zip1.2s v0, v0, v2 366 st1.2s {v0},[rdi],#8 367 cmp next_tag, rcx 368 b.hi L_pack_2bits 369 370 /* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */ 371 372 /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */ 373 sub rax, next_qp, tempQPosArray // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray; 374 add eax, eax, #7 // num_bytes_to_pack+7 375 lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3 376 add rcx, tempQPosArray, rax, lsl #3 // endQPosArray = tempQPosArray + 2*num_source_words 377 lsl rax, rax, #2 378 subs byte_count, byte_count, rax 379 b.lt L_budgetExhausted 380 381 cmp rcx, next_qp // endQPosArray vs next_qp 382 b.ls 2f // if (next_qp >= endQPosArray) skip the following zero paddings 383 sub rax, rcx, next_qp 384 mov edx, #0 385 tst eax, #4 386 b.eq 1f 387 str edx, [next_qp], #4 3881: tst eax, #2 389 b.eq 1f 390 strh edx, [next_qp], #2 3911: tst eax, #1 392 b.eq 2f 393 strb edx, [next_qp], #1 3942: 395 mov rdi, next_full_patt // next_full_patt 396 cmp rcx, tempQPosArray // endQPosArray vs tempQPosArray 397 ldr eax, [dest_buf] 398 b.ls L20 // if (endQPosArray <= tempQPosArray) skip the following 399 mov rdx, tempQPosArray // tempQPosArray 400 401 /* packing 4-bits dict indices into dest_buf */ 402L_pack_4bits: 403 ldr rax, [rdx], #8 // src_next[1]:src_next[0] 404 orr rax, rax, rax, lsr #28 // eax = src_next[0] | (src_next[1] << 4) 405 cmp rcx, rdx // source_end vs src_next 406 str eax, [rdi], #4 // *dest_next++ = temp; 407 b.hi L_pack_4bits // while (src_next < source_end) repeat the loop 408 409 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); 410 sub rax, rdi, dest_buf // boundary_tmp - dest_buf 411 lsr eax, eax, #2 // boundary_tmp - dest_buf in words 412L20: 413 str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf 414 415 416 417 /* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */ 418 419 add rcx, scratch, #(2048*scale) // tempLowBitsArray 420 sub rdx, next_low_bits, rcx // next_low_bits - tempLowBitsArray (in bytes) 421 lsr rdx, rdx, #1 // num_tenbits_to_pack (in half-words) 422 subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3 423 b.lt 1f // if num_tenbits_to_pack < 3, skip the following loop 4240: 425 subs byte_count, byte_count, #4 // byte_count -= 4 426 b.le L_budgetExhausted // return -1 to signal this page is not compressable 427 subs edx, edx, #3 // num_tenbits_to_pack-=3 428 ldr rax, [rcx], #6 429 bfm rax, rax, #58, #9 // pack 1st toward 2nd 430 bfm rax, rax, #58, #25 // pack 1st/2nd toward 3rd 431 lsr rax, rax, #12 432 str eax, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word 433 b.ge 0b // if no less than 3 elements, back to loop head 434 4351: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3 436 b.eq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following 437 subs byte_count, byte_count, #4 // byte_count -= 4 438 b.le L_budgetExhausted // return -1 to signal this page is not compressable 439 ldrh eax,[rcx] // w0 440 subs edx, edx, #1 // num_tenbits_to_pack-- 441 b.eq 2f // 442 ldrh edx, [rcx, #2] // w1 443 orr eax, eax, edx, lsl #10 // w0 | (w1<<10) 444 4452: str eax, [rdi], #4 // write the final dest_buf word 446 4473: sub rax, rdi, dest_buf // boundary_tmp - dest_buf 448 lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words 449 str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) 450 lsl w0, eax, #2 // boundary_tmp - dest_buf in terms of bytes 451 452L_done: 453 454 // restore registers and return 455 mov x15, sp 456 ldp x20, x21, [x15, #0] // restore x20, x21 457 ldp x22, x23, [x15, #16] // restore x22, x23 458 ldp x24, x25, [x15, #32] // restore x24, x25 459 ldp x26, x27, [x15, #48] // restore x26, x27 460 add sp, sp, #128 // deallocate for dictionary + saved register space 461 462#if KERNEL 463 ld1.4s {v0,v1,v2,v3},[sp],#64 464#endif 465 ret lr 466 467 .align 4 468L_budgetExhausted: 469 mov x0, #-1 470 b L_done 471 472 473 .align 4,0x90 474L_RECORD_EXACT: 475/* 476 we have an exact match of the input word to its corresponding dictionary word 477 write tag/dict_index to the temorary buffers 478*/ 479 mov eax, #3 480 lsr w14, wdict_location, #2 // divide by 4 for word offset 481 subs remaining, remaining, #1 // remaing--; 482 strb eax, [next_tag], #1 // *next_tag++ = 3 for exact 483 strb w14, [next_qp], #1 // *next_qp = word offset (4-bit) 484 b.gt L_loop 485 b CHECKPOINT // if remaining = 0, break 486 487 .align 4,0x90 488L_RECORD_PARTIAL: 489/* 490 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word 491 write tag/dict_index/low 10 bits to the temorary buffers 492*/ 493 mov ecx, #1 494 strb ecx, [next_tag], #1 // *next_tag++ = 1 for partial matched 495 str edx, [dictionary, dict_location] // *dict_location = input_word; 496 subs remaining, remaining, #1 // remaing--; 497 lsr eax, wdict_location, #2 // offset in 32-bit word 498 and edx, edx, #1023 // lower 10 bits 499 strb eax, [next_qp], #1 // update *next_qp++ 500 strh edx, [next_low_bits], #2 // save next_low_bits++ 501 b.gt L_loop 502 503CHECKPOINT: 504 505 cbz mode, L_check_compression_ratio // if this this an early abort check.. 506 507L_check_zero_page: 508 509 cmp start_next_full_patt, next_full_patt // check if any dictionary misses in page 510 b.ne L_check_single_value_page 511 512 cmp start_next_qp, next_qp // check if any partial or exact dictionary matches 513 b.ne L_check_single_value_page 514 515 mov x0, #SV_RETURN // Magic return value 516 b L_done 517 518L_check_single_value_page: 519 520 sub rax, next_full_patt, start_next_full_patt // get # dictionary misses 521 lsr rax, rax, #2 522 523 sub r11, next_qp, start_next_qp // get # dictionary hits (exact + partial) 524 525 sub r13, next_low_bits, start_next_low_bits // get # dictionary partial hits 526 lsr r13, r13, #1 527 528 // Single value page if one of the follwoing is true: 529 // partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss) 530 // partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial) 531 // 532 cbnz r13, 1f // were there 0 partial hits? 533 534 cmp r11, #(256*PAGES_SIZE_IN_KBYTES - 1) // were there 1023 dictionary hits 535 b.ne 1f 536 537 cmp rax, #1 // was there exacly 1 dictionary miss? 538 b.ne 1f 539 540 ldrb edx, [tempTagsArray] // read the very 1st tag 541 cmp edx, #2 // was the very 1st tag a miss? 542 b.eq L_is_single_value_page 543 5441: 545 cmp r13, #1 // was there 1 partial hit? 546 b.ne L_check_mostly_zero 547 548 cmp r11, #(256*PAGES_SIZE_IN_KBYTES) // were there 1024 dictionary hits 549 b.ne L_check_mostly_zero 550 551 ldrb edx, [tempTagsArray] // read the very 1st tag 552 cmp edx, #1 // was the very 1st tag a partial? 553 b.ne L_check_mostly_zero 554 555L_is_single_value_page: 556 557 mov x0, #SV_RETURN // Magic return value 558 b L_done 559 560L_check_mostly_zero: 561 // how much space will the sparse packer take? 562 add rax, rax, r11 // rax += (next_qp - start_next_qp) 563 mov rdx, #6 564 mov rcx, #4 565 madd r11, rax, rdx, rcx // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header 566 567 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits 568 mov rdx, #1365 569 mul rax, rax, rdx 570 571 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses 572 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) 573 574 sub rdx, next_qp, start_next_qp 575 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2 576 add rax, rax, #(12+256*scale) // rax += bytes taken by the header + tags 577 578 cmp rax, r11 // is the default packer the better option? 579 b.lt L_done_search 580 581 cmp r11, byte_budget // can the sparse packer fit into the given budget? 582 b.gt L_budgetExhausted 583 584L_sparse_packer: 585 mov edx, #MZV_MAGIC 586 str edx, [dest_buf], #4 // header to indicate a sparse packer 587 588 mov rdx, #0 // rdx = byte offset in src of non-0 word 5891: 590 ldr rax, [start_next_input_word, rdx] // rax = read dword 591 cbnz rax, 5f // is dword != 0 5923: 593 add rdx, rdx, #8 // 8 more bytes have been processed 5944: 595 cmp rdx, #(4096*scale) // has the entire page been processed 596 b.ne 1b 597 mov x0, r11 // store the size of the compressed stream 598 b L_done 599 6005: 601 cbz eax, 6f // is lower word == 0 602 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer 603 strh edx, [dest_buf], #2 // store the byte index 6046: 605 lsr rax, rax, 32 // get the upper word into position 606 cbz eax, 3b // is dword == 0 607 add rdx, rdx, #4 608 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer 609 strh edx, [dest_buf], #2 // store the byte index 610 add rdx, rdx, #4 611 b 4b 612 613L_check_compression_ratio: 614 615 mov mode, NORMAL 616 mov remaining, #((1024 - CHKPT_WORDS)*scale) // remaining input words to process 617 cbz remaining, CHECKPOINT // if there are no remaining words to process 618 619 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits 620 mov rdx, #1365 621 mul rax, rax, rdx 622 623 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses 624 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) 625 626 sub rdx, next_qp, start_next_qp 627 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2 628 subs rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale) 629 // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES 630 631 b.gt L_budgetExhausted // if rax is > 0, we need to early abort 632 b L_loop // we are done 633