1/* 2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved. 3 * 4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ 5 * 6 * This file contains Original Code and/or Modifications of Original Code 7 * as defined in and that are subject to the Apple Public Source License 8 * Version 2.0 (the 'License'). You may not use this file except in 9 * compliance with the License. The rights granted to you under the License 10 * may not be used to create, or enable the creation or redistribution of, 11 * unlawful or unlicensed copies of an Apple operating system, or to 12 * circumvent, violate, or enable the circumvention or violation of, any 13 * terms of an Apple operating system software license agreement. 14 * 15 * Please obtain a copy of the License at 16 * http://www.opensource.apple.com/apsl/ and read it before using this file. 17 * 18 * The Original Code and all software distributed under the License are 19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, 21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, 22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 23 * Please see the License for the specific language governing rights and 24 * limitations under the License. 25 * 26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ 27 */ 28 29/* 30 This file contains arm64 hand optimized implementation of WKdm memory page compressor. 31 32 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); 33 34 input : 35 src_buf : address of input page (length = 1024 words) 36 dest_buf : address of output buffer (may not be 16-byte aligned) 37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller, 38 bytes_budget : a given byte target in compression 39 40 output : 41 42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data 43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget. 44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area. 45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined. 46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault. 47 48 The 4th argument bytes_budget is the target compress budget in bytes. 49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes. 50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed). 51 52 WKdm Compression algorithm is briefly stated as follows: 53 54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function. 55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits). 56 a. k = (x>>10)&255; // 8-bit hash table index 57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed 58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated 59 60 Each input word x is classified/tagged into 4 classes : 61 0 : x = 0 62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word 63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss 64 3 : (x == dict_word), the exact input word is in the dictionary 65 66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word. 67 0 : 2-bits tag (32->2 compression) 68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression) 69 2 : 2-bits tag + 32-bits new word (32->34 expansion) 70 3 : 2-bits tag + 4-bits dict_index (32->6 compression) 71 72 It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6). 73 74 the output bit stream (*dest_buf) consists of 75 a. 12 bytes header 76 b. 256 bytes for 1024 packed tags 77 c. (varying number of) words for new words not matched to dictionary word. 78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) 79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) 80 81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively. 82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word. 83 84 85 The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer. 86 87 The temp buffers are 88 89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing 90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing 91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing 92 93 Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to 94 the destination buffer. 95 96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags. 97 98 Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass. 99 100 byte_count -= 12 + 256; // bit budget minus header and tags 101 102 whenever an input word is classified as class 103 104 2 : byte_count -= 4; 105 106 the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0). 107 108 without showing the bit budget management, the pseudo code is given as follows: 109 110 uint8_t *tags=tempTagsArray; 111 uint8_t *dict=tempQPosArray; 112 uint8_t *partial=tempLowBitsArray; 113 114 for (i=0;i<1024;i++) { 115 x = *src_buf++; 116 if (x == 0) { // zero, 2-bits tag 117 *tags++ = 0; 118 } else { 119 120 // find dict_index and dict_word from x 121 k = (x>>10)&255; 122 dict_index = hashTable[k]; 123 dict_word = dictionary[dict_index]; 124 125 if (dict_word == x) { // exactly match 126 // 2-bits tag + 4-bits table index 127 *tags++ = 3; 128 *dict++ = dict_index; 129 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched 130 // 2-bits tag + 4-bits table index + 10-bits lower partial 131 *tags++ = 1; 132 *dict++ = dict_index; 133 *partial++ = x &0x3ff; 134 dictionary[dict_index] = x; 135 } else { // not matched 136 // 2-bits tag + 32-bits new word 137 *tags++ = 2; 138 *new_word++ = x; 139 dictionary[dict_index] = x; 140 } 141 } 142 } 143 144 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf: 145 146 1. 1024 tags are packed into 256 bytes right after the 12-bytes header 147 2. dictionary indices (4-bits each) are packed into are right after the new words section 148 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section. 149 150 cclee, 11/9/12 151 152 Added zero page, single value page, sparse page, early abort optimizations 153 rsrini, 09/14/14 154*/ 155 156#if KERNEL 157#include <machine/asm.h> 158#endif /* KERNEL */ 159 160#define PAGES_SIZE_IN_KBYTES 16 161 162#ifndef PAGES_SIZE_IN_KBYTES 163#define PAGES_SIZE_IN_KBYTES 4 164#endif 165 166#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16)) 167#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported" 168#endif 169 170 171 .text 172 .align 4 173 174/* 175 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget); 176*/ 177 178.globl _WKdm_compress_16k 179_WKdm_compress_16k: 180 181/* 182 ------------------------- symbolizing register use ----------------------------------- 183*/ 184 #define src_buf x0 185 #define next_input_word x0 186 #define dest_buf x1 187 #define scratch x2 188 #define byte_count x3 189 #define next_tag x4 190 #define tempTagsArray x2 // scratch 191 #define dictionary x5 192 #define remaining x6 193 #define next_full_patt x7 194 #define dict_location x8 195 #define wdict_location w8 196 #define next_qp x9 197 #define hashTable x10 198 #define tempQPosArray x11 199 #define next_low_bits x12 200 201/* 202 this arm64 assembly code is ported from x86_64 assembly code, 203 therefore need such symbolization to quickly reuse the x86_64 assembly code 204 for these intermediate/temporary register use 205*/ 206 #define rax x13 207 #define eax w13 208 #define rcx x14 209 #define ecx w14 210 #define rdx x15 211 #define edx w15 212 #define rdi x0 /* after some point, x0/rdi becomes free other usage */ 213 214 215/* 216 ------------------------- scratch memory -------------------------------------- 217 218 need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray) 219 total 6208 bytes 220 [sp,#0] : dictionary 221 [scratch,#0] : tempTagsArray 222 [scratch,#1024] : tempQPosArray 223 [scratch,#2048] : tempLowBitsArray 224*/ 225 226#define scale (PAGES_SIZE_IN_KBYTES/4) 227 228#define SV_RETURN 0 // return value when SV, ZV page is found 229#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding 230#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096] 231#define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words 232#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data 233#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing .. 234 // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096 235#if CHKPT_BYTES > 4096 236 #error CHKPT_BYTES must be <= 4096 237#endif 238#if CHKPT_BYTES < 4 239 #error CHKPT_BYTES must be >= 4 240#endif 241 242#if KERNEL 243 ARM64_PROLOG 244 sub sp, sp, #64 245 st1.4s {v0,v1,v2,v3},[sp] 246#endif 247 248 sub sp, sp, #64 // allocate for dictionary 249 mov dictionary, sp // use x5 to point to sp, so we can use sub xd, xn, sp 250 251 sub sp, sp, #64 // allocate space for saving callee-saved registers 252 mov x15, sp 253 stp x20, x21, [x15, #0] // save x20, x21 254 stp x22, x23, [x15, #16] // save x22, x23 255 stp x24, x25, [x15, #32] // save x24, x25 256 stp x26, x27, [x15, #48] // save x26, x27 257 258/* 259 ------- entwined statck space allocation, registers set up, and PRELOAD_DICTIONARY ------------------- 260*/ 261 262 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO 263 // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES 264 mov next_tag, tempTagsArray // &tempTagsArray[0] 265 add next_qp, scratch, #(1024*scale) // next_qp 266 mov remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint 267 add next_full_patt, dest_buf, #(12+256*scale) // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4 268 sub byte_count, byte_count, #(12+256*scale) // bit_count - header - tags 269 add next_low_bits, scratch, #(2048*scale) // &tempLowBitsArray[0] 270 stp xzr, xzr, [dictionary, #0] // initialize dictionary 271 adrp hashTable, _hashLookupTable@GOTPAGE 272 stp xzr, xzr, [dictionary, #16] // initialize dictionary 273 stp xzr, xzr, [dictionary, #32] // initialize dictionary 274 ldr hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF] 275 stp xzr, xzr, [dictionary, #48] // initialize dictionary 276 277#define EARLYCHECK 0 278#define NORMAL 1 279 280#define mode w20 281#define start_next_full_patt x21 282#define start_next_input_word x22 283#define start_next_low_bits x23 284#define r11 x24 285#define r13 x25 286#define byte_budget x26 287#define start_next_qp tempQPosArray 288 289 add tempQPosArray, scratch, #(1024*scale) // &tempQPosArray[0] 290 mov mode, EARLYCHECK // indicate we are yet to evaluate the early aborts 291 mov start_next_full_patt, next_full_patt // remember the start of next_full_patt 292 mov start_next_input_word, next_input_word // remember the start of next_input_word 293 mov start_next_low_bits, next_low_bits // remember the start of next_low_bit 294 add byte_budget, byte_count, #(12+256*scale) // remember the byte budget 295 296 b L_loop 297 298 .align 4, 0x90 299 300 /* we've just detected a zero input word in edx */ 301L_RECORD_ZERO: 302 strb edx, [next_tag], #1 // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0 303 subs remaining, remaining, #1 // remaing--; 304 b.le CHECKPOINT // if remaining = 0, break 305 306 /* -------------- scan/tag pass loop ------------------------- */ 307L_loop: 308 309 /* load new input word to edx */ 310 ldr edx, [next_input_word], #4 311 cbz edx, L_RECORD_ZERO // if (input_word==0) RECORD_ZERO 312 313 /* 314 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location 315 */ 316 ubfm eax, edx, #10, #17 317 ldrb wdict_location, [hashTable, rax] // HASH_TO_DICT_BYTE_OFFSET(input_word) 318 ldr eax, [dictionary, dict_location] // dict_word = *dict_location; 319 320 /* detect whether we match input to its corresponding dictionary word */ 321 eor eax, eax, edx // dict_word vs input_word 322 cbz eax, L_RECORD_EXACT // if identical, RECORD_EXACT 323 lsr eax, eax, #10 // HIGH_BITS(dict_word^input_word) 324 cbz eax, L_RECORD_PARTIAL // if identical, RECORD_PARTIAL 325 326L_RECORD_MISS: 327/* 328 if we are here, the input word can not be derived from the dictionary, 329 we write the input word as a new word, 330 and update the dictionary with this new word 331*/ 332 subs byte_count, byte_count, #4 // byte_count -= 4 333 b.le L_budgetExhausted // return -1 to signal this page is not compressable 334 str edx, [next_full_patt], #4 // *next_full_patt++ = input_word; 335 mov eax, #2 // tag for MISS 336 subs remaining, remaining, #1 // remaing--; 337 str edx, [dictionary, dict_location] // *dict_location = input_word 338 strb eax, [next_tag], #1 // *next_tag++ = 2 for miss 339 b.gt L_loop // // if remaining > 0, repeat 340 b CHECKPOINT 341 342L_done_search: 343 344 // SET_QPOS_AREA_START(dest_buf,next_full_patt); 345 /* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */ 346 347 sub rax, next_full_patt, dest_buf // next_full_patt - dest_buf 348 lsr eax, eax, #2 // offset in 4-bytes 349 str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf 350 351 /* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/ 352 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS); 353 354 add rdi, dest_buf, #12 // dest_buf 355 mov rcx, tempTagsArray // &tempTagsArray[0] 356 357L_pack_2bits: 358 ld1.2s {v0,v1,v2,v3},[rcx],#32 359 360 shl.2d v1,v1,#4 361 shl.2d v3,v3,#4 362 363 orr.8b v0, v0, v1 364 orr.8b v2, v2, v3 365 366 ushr.2d v1, v0, #30 367 ushr.2d v3, v2, #30 368 369 orr.8b v0, v0, v1 370 orr.8b v2, v2, v3 371 372 zip1.2s v0, v0, v2 373 st1.2s {v0},[rdi],#8 374 cmp next_tag, rcx 375 b.hi L_pack_2bits 376 377 /* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */ 378 379 /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */ 380 sub rax, next_qp, tempQPosArray // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray; 381 add eax, eax, #7 // num_bytes_to_pack+7 382 lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3 383 add rcx, tempQPosArray, rax, lsl #3 // endQPosArray = tempQPosArray + 2*num_source_words 384 lsl rax, rax, #2 385 subs byte_count, byte_count, rax 386 b.lt L_budgetExhausted 387 388 cmp rcx, next_qp // endQPosArray vs next_qp 389 b.ls 2f // if (next_qp >= endQPosArray) skip the following zero paddings 390 sub rax, rcx, next_qp 391 mov edx, #0 392 tst eax, #4 393 b.eq 1f 394 str edx, [next_qp], #4 3951: tst eax, #2 396 b.eq 1f 397 strh edx, [next_qp], #2 3981: tst eax, #1 399 b.eq 2f 400 strb edx, [next_qp], #1 4012: 402 mov rdi, next_full_patt // next_full_patt 403 cmp rcx, tempQPosArray // endQPosArray vs tempQPosArray 404 ldr eax, [dest_buf] 405 b.ls L20 // if (endQPosArray <= tempQPosArray) skip the following 406 mov rdx, tempQPosArray // tempQPosArray 407 408 /* packing 4-bits dict indices into dest_buf */ 409L_pack_4bits: 410 ldr rax, [rdx], #8 // src_next[1]:src_next[0] 411 orr rax, rax, rax, lsr #28 // eax = src_next[0] | (src_next[1] << 4) 412 cmp rcx, rdx // source_end vs src_next 413 str eax, [rdi], #4 // *dest_next++ = temp; 414 b.hi L_pack_4bits // while (src_next < source_end) repeat the loop 415 416 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp); 417 sub rax, rdi, dest_buf // boundary_tmp - dest_buf 418 lsr eax, eax, #2 // boundary_tmp - dest_buf in words 419L20: 420 str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf 421 422 423 424 /* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */ 425 426 add rcx, scratch, #(2048*scale) // tempLowBitsArray 427 sub rdx, next_low_bits, rcx // next_low_bits - tempLowBitsArray (in bytes) 428 lsr rdx, rdx, #1 // num_tenbits_to_pack (in half-words) 429 subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3 430 b.lt 1f // if num_tenbits_to_pack < 3, skip the following loop 4310: 432 subs byte_count, byte_count, #4 // byte_count -= 4 433 b.le L_budgetExhausted // return -1 to signal this page is not compressable 434 subs edx, edx, #3 // num_tenbits_to_pack-=3 435 ldr rax, [rcx], #6 436 bfm rax, rax, #58, #9 // pack 1st toward 2nd 437 bfm rax, rax, #58, #25 // pack 1st/2nd toward 3rd 438 lsr rax, rax, #12 439 str eax, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word 440 b.ge 0b // if no less than 3 elements, back to loop head 441 4421: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3 443 b.eq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following 444 subs byte_count, byte_count, #4 // byte_count -= 4 445 b.le L_budgetExhausted // return -1 to signal this page is not compressable 446 ldrh eax,[rcx] // w0 447 subs edx, edx, #1 // num_tenbits_to_pack-- 448 b.eq 2f // 449 ldrh edx, [rcx, #2] // w1 450 orr eax, eax, edx, lsl #10 // w0 | (w1<<10) 451 4522: str eax, [rdi], #4 // write the final dest_buf word 453 4543: sub rax, rdi, dest_buf // boundary_tmp - dest_buf 455 lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words 456 str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp) 457 lsl w0, eax, #2 // boundary_tmp - dest_buf in terms of bytes 458 459L_done: 460 461 // restore registers and return 462 mov x15, sp 463 ldp x20, x21, [x15, #0] // restore x20, x21 464 ldp x22, x23, [x15, #16] // restore x22, x23 465 ldp x24, x25, [x15, #32] // restore x24, x25 466 ldp x26, x27, [x15, #48] // restore x26, x27 467 add sp, sp, #128 // deallocate for dictionary + saved register space 468 469#if KERNEL 470 ld1.4s {v0,v1,v2,v3},[sp],#64 471#endif 472 ret lr 473 474 .align 4 475L_budgetExhausted: 476 mov x0, #-1 477 b L_done 478 479 480 .align 4,0x90 481L_RECORD_EXACT: 482/* 483 we have an exact match of the input word to its corresponding dictionary word 484 write tag/dict_index to the temorary buffers 485*/ 486 mov eax, #3 487 lsr w14, wdict_location, #2 // divide by 4 for word offset 488 subs remaining, remaining, #1 // remaing--; 489 strb eax, [next_tag], #1 // *next_tag++ = 3 for exact 490 strb w14, [next_qp], #1 // *next_qp = word offset (4-bit) 491 b.gt L_loop 492 b CHECKPOINT // if remaining = 0, break 493 494 .align 4,0x90 495L_RECORD_PARTIAL: 496/* 497 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word 498 write tag/dict_index/low 10 bits to the temorary buffers 499*/ 500 mov ecx, #1 501 strb ecx, [next_tag], #1 // *next_tag++ = 1 for partial matched 502 str edx, [dictionary, dict_location] // *dict_location = input_word; 503 subs remaining, remaining, #1 // remaing--; 504 lsr eax, wdict_location, #2 // offset in 32-bit word 505 and edx, edx, #1023 // lower 10 bits 506 strb eax, [next_qp], #1 // update *next_qp++ 507 strh edx, [next_low_bits], #2 // save next_low_bits++ 508 b.gt L_loop 509 510CHECKPOINT: 511 512 cbz mode, L_check_compression_ratio // if this this an early abort check.. 513 514L_check_zero_page: 515 516 cmp start_next_full_patt, next_full_patt // check if any dictionary misses in page 517 b.ne L_check_single_value_page 518 519 cmp start_next_qp, next_qp // check if any partial or exact dictionary matches 520 b.ne L_check_single_value_page 521 522 mov x0, #SV_RETURN // Magic return value 523 b L_done 524 525L_check_single_value_page: 526 527 sub rax, next_full_patt, start_next_full_patt // get # dictionary misses 528 lsr rax, rax, #2 529 530 sub r11, next_qp, start_next_qp // get # dictionary hits (exact + partial) 531 532 sub r13, next_low_bits, start_next_low_bits // get # dictionary partial hits 533 lsr r13, r13, #1 534 535 // Single value page if one of the follwoing is true: 536 // partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss) 537 // partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial) 538 // 539 cbnz r13, 1f // were there 0 partial hits? 540 541 cmp r11, #(256*PAGES_SIZE_IN_KBYTES - 1) // were there 1023 dictionary hits 542 b.ne 1f 543 544 cmp rax, #1 // was there exacly 1 dictionary miss? 545 b.ne 1f 546 547 ldrb edx, [tempTagsArray] // read the very 1st tag 548 cmp edx, #2 // was the very 1st tag a miss? 549 b.eq L_is_single_value_page 550 5511: 552 cmp r13, #1 // was there 1 partial hit? 553 b.ne L_check_mostly_zero 554 555 cmp r11, #(256*PAGES_SIZE_IN_KBYTES) // were there 1024 dictionary hits 556 b.ne L_check_mostly_zero 557 558 ldrb edx, [tempTagsArray] // read the very 1st tag 559 cmp edx, #1 // was the very 1st tag a partial? 560 b.ne L_check_mostly_zero 561 562L_is_single_value_page: 563 564 mov x0, #SV_RETURN // Magic return value 565 b L_done 566 567L_check_mostly_zero: 568 // how much space will the sparse packer take? 569 add rax, rax, r11 // rax += (next_qp - start_next_qp) 570 mov rdx, #6 571 mov rcx, #4 572 madd r11, rax, rdx, rcx // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header 573 574 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits 575 mov rdx, #1365 576 mul rax, rax, rdx 577 578 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses 579 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) 580 581 sub rdx, next_qp, start_next_qp 582 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2 583 add rax, rax, #(12+256*scale) // rax += bytes taken by the header + tags 584 585 cmp rax, r11 // is the default packer the better option? 586 b.lt L_done_search 587 588 cmp r11, byte_budget // can the sparse packer fit into the given budget? 589 b.gt L_budgetExhausted 590 591L_sparse_packer: 592 mov edx, #MZV_MAGIC 593 str edx, [dest_buf], #4 // header to indicate a sparse packer 594 595 mov rdx, #0 // rdx = byte offset in src of non-0 word 5961: 597 ldr rax, [start_next_input_word, rdx] // rax = read dword 598 cbnz rax, 5f // is dword != 0 5993: 600 add rdx, rdx, #8 // 8 more bytes have been processed 6014: 602 cmp rdx, #(4096*scale) // has the entire page been processed 603 b.ne 1b 604 mov x0, r11 // store the size of the compressed stream 605 b L_done 606 6075: 608 cbz eax, 6f // is lower word == 0 609 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer 610 strh edx, [dest_buf], #2 // store the byte index 6116: 612 lsr rax, rax, 32 // get the upper word into position 613 cbz eax, 3b // is dword == 0 614 add rdx, rdx, #4 615 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer 616 strh edx, [dest_buf], #2 // store the byte index 617 add rdx, rdx, #4 618 b 4b 619 620L_check_compression_ratio: 621 622 mov mode, NORMAL 623 mov remaining, #((1024 - CHKPT_WORDS)*scale) // remaining input words to process 624 cbz remaining, CHECKPOINT // if there are no remaining words to process 625 626 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits 627 mov rdx, #1365 628 mul rax, rax, rdx 629 630 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses 631 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt) 632 633 sub rdx, next_qp, start_next_qp 634 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2 635 subs rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale) 636 // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES 637 638 b.gt L_budgetExhausted // if rax is > 0, we need to early abort 639 b L_loop // we are done 640