xref: /xnu-11215.41.3/osfmk/x86_64/WKdmCompress_new.s (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains x86_64 hand optimized implementation of WKdm memory page compressor.
31
32 	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
33
34	input :
35		src_buf : address of input page (length = 1024 words)
36		dest_buf : address of output buffer (may not be 16-byte aligned)
37		scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
38		bytes_budget : a given byte target in compression
39
40	output :
41
42		if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
43		o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
44		During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
45		When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
46		The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
47
48 The 4th argument bytes_budget is the target compress budget in bytes.
49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
51
52 WKdm Compression algorithm is briefly stated as follows:
53
54	There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
55	For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
56		a. k = (x>>10)&255;						// 8-bit hash table index
57		b. dict_index = hashTable[k];			// 4-bit dictionary index, hashTable[] is fixed
58		c. dict_word = dictionary[dict_index];	// 32-bit dictionary word, dictionary[] is dynamically updated
59
60 	Each input word x is classified/tagged into 4 classes :
61		0 : x = 0
62		1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
63  		2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
64  		3 : (x == dict_word), the exact input word is in the dictionary
65
66	For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
67		0 : 2-bits tag (32->2 compression)
68		1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
69		2 : 2-bits tag + 32-bits new word (32->34 expansion)
70		3 : 2-bits tag + 4-bits dict_index (32->6 compression)
71
72	It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
73
74	the output bit stream (*dest_buf) consists of
75		a. 12 bytes header
76		b. 256 bytes for 1024 packed tags
77		c. (varying number of) words for new words not matched to dictionary word.
78		d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
79		e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
80
81	the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
82	Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
83
84
85	The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
86
87	The temp buffers are
88
89		uint8_t 	tempTagsArray[1024];			// temporary saving for tags before final packing
90		uint8_t 	tempQPosArray[1024];			// temporary saving for dict_indices before final packing
91		uint16_t 	tempLowBitsArray[1024];			// temporary saving for partially matched lower 10 bits before final packing
92
93	Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
94	the destination buffer.
95
96		uint32_t	*new_word = dest_buf+3+64;		// 3 words for header, 64 words for tags, new words come right after the tags.
97
98	Now since we are given a byte budget for this compressor, we can monitor the byte usage on the fly in the scanning and tagging pass.
99
100	bytes_budget -= 12 + 256; // header and tags (1024 * 2 /8 = 256 bytes)
101
102	whenever an input word is classified as class
103
104		2 : bytes_budget-=4; if (bytes_budget<=0) exit -1;
105
106	when writing the 8 4-bits/3 10-bits, monitor bytes_budget and exit -1 when byte_budget <=0;
107
108	without showing the bit budget management, the pseudo code is given as follows:
109
110	uint8_t 	*tags=tempTagsArray;
111	uint8_t 	*dict=tempQPosArray;
112	uint8_t 	*partial=tempLowBitsArray;
113
114	for (i=0;i<1024;i++) {
115			x = *src_buf++;
116			if (x == 0) {		// zero, 2-bits tag
117					*tags++ = 0;
118			} else {
119
120				// find dict_index and dict_word from x
121				k = (x>>10)&255;
122				dict_index = hashTable[k];
123				dict_word = dictionary[dict_index];
124
125				if (dict_word == x) { // exactly match
126					// 2-bits tag + 4-bits table index
127					*tags++ = 3;
128					*dict++ = dict_index;
129				} else if (((x^dict_word)>>10)==0) {	// 22 higher bits matched
130					// 2-bits tag + 4-bits table index + 10-bits lower partial
131					*tags++ = 1;
132                    *dict++ = dict_index;
133					*partial++ = x &0x3ff;
134					dictionary[dict_index] = x;
135				} else {	// not matched
136					// 2-bits tag + 32-bits new word
137					*tags++ = 2;
138					*new_word++ = x;
139					dictionary[dict_index] = x;
140				}
141			}
142	}
143
144	after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
145
146		1. 1024 tags are packed into 256 bytes right after the 12-bytes header
147		2. dictionary indices (4-bits each) are packed into are right after the new words section
148		3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
149
150 	cclee, 11/30/12
151
152    Added zero page, single value page, sparse page, early abort optimizations
153    rsrini, 09/14/14
154
155*/
156
157	.text
158	.align 4,0x90
159
160#define SV_RETURN           $0                      // return value when SV, ZV page is found
161#define MZV_MAGIC           $17185                  // magic value used to identify MZV page encoding
162#define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
163#define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
164#define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
165                                                    //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
166
167#if CHKPT_BYTES > 4096
168    #error CHKPT_BYTES must be <= 4096
169#endif
170#if CHKPT_BYTES < 4
171    #error CHKPT_BYTES must be >= 4
172#endif
173
174.globl _WKdm_compress_new
175_WKdm_compress_new:
176	pushq	%rbp
177	movq	%rsp, %rbp
178	pushq	%r15
179	pushq	%r14
180	pushq	%r13
181	pushq	%r12
182	pushq	%rbx
183	subq	$(48+64), %rsp
184
185	#define	tempTagsArray       64(%rsp)
186	#define	tempLowBitsArray	72(%rsp)
187
188    #define start_next_full_patt  80(%rsp)
189    #define start_next_input_word 88(%rsp)
190    #define byte_budget           96(%rsp)
191    #define start_next_qp         tempQPosArray
192    #define start_next_low_bits   tempLowBitsArray
193
194	#define	next_tag			%r8
195	#define	next_input_word		%rdi
196	#define	end_of_input		%r13
197	#define	next_full_patt		%rbx
198	#define	dict_location		%rcx
199	#define	next_qp				%r10
200    #define checkpoint          %r11
201	#define	dictionary			%rsp
202	#define	dest_buf			%r12
203	#define	hashTable			%r14
204	#define tempQPosArray		%r15
205	#define	next_low_bits		%rsi
206	#define	byte_count			%r9d
207
208	movq	%rsi, %r12						// dest_buf
209
210	movq	%rdx, tempTagsArray 			// &tempTagsArray[0]
211	movq	%rdx, next_tag					// next_tag always points to the one following the current tag
212
213	leaq	1024(%rdx), tempQPosArray		// &tempQPosArray[0]
214	movq	tempQPosArray, next_qp			// next_qp
215
216    leaq    CHKPT_BYTES(%rdi), checkpoint   // checkpoint = src_buf + CHKPT_BYTES
217	leaq	4096(%rdi), end_of_input		// end_of_input = src_buf + num_input_words
218	leaq	268(%rsi), %rbx					// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
219
220	movl	%ecx, byte_count
221	subl	$(12+256), byte_count			// header + tags
222	jle		L_budgetExhausted
223
224                                            // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
225                                            // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
226	// PRELOAD_DICTIONARY;
227	movl	$0, 0(dictionary)
228	movl	$0, 4(dictionary)
229	movl	$0, 8(dictionary)
230	movl	$0, 12(dictionary)
231	movl	$0, 16(dictionary)
232	movl	$0, 20(dictionary)
233	movl	$0, 24(dictionary)
234	movl	$0, 28(dictionary)
235	movl	$0, 32(dictionary)
236	movl	$0, 36(dictionary)
237	movl	$0, 40(dictionary)
238	movl	$0, 44(dictionary)
239	movl	$0, 48(dictionary)
240	movl	$0, 52(dictionary)
241	movl	$0, 56(dictionary)
242	movl	$0, 60(dictionary)
243
244	leaq	2048(%rdx), %rax				// &tempLowBitsArray[0]
245	movq	%rax, tempLowBitsArray			// save for later reference
246	movq	%rax, next_low_bits				// next_low_bits
247
248	leaq	_hashLookupTable_new(%rip), hashTable	// hash look up table
249
250    movq    next_full_patt, start_next_full_patt
251    movq    next_input_word, start_next_input_word
252    movl    %ecx, byte_budget               // save the byte budget
253
254
255	jmp		L_scan_loop
256
257	.align 4,0x90
258L_RECORD_ZERO:
259	movb	$0, -1(next_tag)						// *next_tag = ZERO;
260	addq	$4, next_input_word 					// next_input_word++;
261	cmpq	next_input_word, checkpoint             // checkpoint time?
262	je		CHECKPOINT
263
264L_scan_loop:
265	movl	(next_input_word), %edx
266	incq	next_tag								// next_tag++
267	testl	%edx, %edx
268	je		L_RECORD_ZERO							// if (input_word==0) RECORD_ZERO
269	movl	%edx, %eax								// a copy of input_word
270	shrl	$10, %eax								// input_high_bits = HIGH_BITS(input_word);
271	movzbl	%al, %eax								// 8-bit index to the Hash Table
272	movsbq	(hashTable,%rax),%rax					// HASH_TO_DICT_BYTE_OFFSET(input_word)
273	leaq	(dictionary, %rax), dict_location		// ((char*) dictionary) + HASH_TO_DICT_BYTE_OFFSET(input_word));
274	movl	(dict_location), %eax					// dict_word = *dict_location;
275	addq	$4, next_input_word						// next_input_word++
276	cmpl	%eax, %edx								// dict_word vs input_word
277	je		L_RECORD_EXACT							// if identical, RECORD_EXACT
278	xorl	%edx, %eax
279	shrl	$10, %eax								// HIGH_BITS(dict_word)
280	je		L_RECORD_PARTIAL						// if identical, RECORD_PARTIAL
281
282L_RECORD_MISS:
283	movl	%edx, (next_full_patt)					// *next_full_patt = input_word;
284	addq	$4, next_full_patt						// next_full_patt++
285	movl	%edx, (dict_location)					// *dict_location = input_word
286	movb	$2, -1(next_tag)						// *next_tag = 2 for miss
287	subl	$4, byte_count							// fill in a new 4-bytes word
288	jle		L_budgetExhausted
289	cmpq	next_input_word, checkpoint             // checkpoint time?
290	jne     L_scan_loop
291	jmp	    CHECKPOINT
292
293L_done_search:
294
295	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
296	movq	next_full_patt, %rax					// next_full_patt
297	subq	dest_buf, %rax							// next_full_patt - dest_buf
298	sarq	$2, %rax								// offset in 4-bytes
299	movl	%eax, %r13d								// r13d = (next_full_patt - dest_buf)
300	movl	%eax, 0(dest_buf)						// dest_buf[0] = next_full_patt - dest_buf
301	decq	next_tag
302	cmpq	next_tag, tempTagsArray					// &tempTagsArray[0] vs next_tag
303	jae		L13										// if (&tempTagsArray[0] >= next_tag), skip the following
304
305	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
306
307	movq	dest_buf, %rdi							// dest_buf
308	movq	tempTagsArray, %rcx						// &tempTagsArray[0]
309
310	.align 4,0x90
311L_pack_2bits:
312	movq	8(%rcx), %rax							// w3
313	addq	$16, %rcx								// tempTagsArray += 16;
314	shlq	$4, %rax
315	addq	$4, %rdi								// dest_buf += 4;
316	orq		-16(%rcx), %rax							// w3
317	movq	%rax, %rdx
318	shrq	$30, %rax
319	orl		%edx, %eax
320	cmpq	%rcx, next_tag							// cmp next_tag vs dest_buf
321	movl	%eax, 8(%rdi)							// save at *(dest_buf + HEADER_SIZE_IN_WORDS)
322	ja		L_pack_2bits							// if (next_tag > dest_buf) repeat L_pack_2bits
323
324	/* Pack the queue positions into the area just after the full words. */
325
326L13:
327	mov		next_qp, %rax							// next_qp
328	sub		tempQPosArray, %rax						// num_bytes_to_pack = next_qp - (char *) tempQPosArray;
329	addl	$7, %eax								// num_bytes_to_pack+7
330	shrl	$3, %eax								// num_packed_words = (num_bytes_to_pack + 7) >> 3
331
332	shll	$2, %eax								// turn into bytes
333	subl	%eax, byte_count						//
334	jl		L_budgetExhausted
335	shrl	$1, %eax 								// num_source_words = num_packed_words * 2;
336
337	leaq	(tempQPosArray,%rax,4), %rcx			// endQPosArray = tempQPosArray + num_source_words
338	cmpq	%rcx, next_qp							// next_qp vs endQPosArray
339	jae		L16										// if (next_qp >= endQPosArray) skip the following zero paddings
340	movq	%rcx, %rax
341	subq	next_qp, %rax
342	subl	$4, %eax
343	jl		1f
344	.align 4,0x90
3450:	movl	$0, (next_qp)
346	addq	$4, next_qp
347	subl	$4, %eax
348	jge		0b
3491:	testl	$2, %eax
350	je		1f
351	movw	$0, (next_qp)
352	addq	$2, next_qp
3531:	testl	$1, %eax
354	je		1f
355	movb	$0, (next_qp)
356	addq	$1, next_qp
3571:
358L16:
359	movq	next_full_patt, %rdi					// next_full_patt
360	cmpq	tempQPosArray, %rcx						// endQPosArray vs tempQPosArray
361	jbe		L20										// if (endQPosArray <= tempQPosArray) skip the following
362	movq	tempQPosArray, %rdx						// tempQPosArray
363
364	/* byte_count -= (rcx - tempQPosArray)/2 */
365
366	.align 4,0x90
367L_pack_4bits:
368	movl	4(%rdx), %eax							// src_next[1]
369	addq	$8, %rdx								// src_next += 2;
370	sall	$4, %eax								// (src_next[1] << 4)
371	addq	$4, %rdi								// dest_next++;
372	orl		-8(%rdx), %eax							// temp = src_next[0] | (src_next[1] << 4)
373	cmpq	%rdx, %rcx								// source_end vs src_next
374	movl	%eax, -4(%rdi)							// dest_next[0] = temp;
375	ja		L_pack_4bits							// while (src_next < source_end) repeat the loop
376
377	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
378	movq	%rdi, %rax								// boundary_tmp
379	subq	dest_buf, %rax							// boundary_tmp - dest_buf
380	movq	%rax, %r13								// boundary_tmp - dest_buf
381	shrq	$2, %r13								// boundary_tmp - dest_buf in words
382L20:
383	movl	%r13d, 4(dest_buf)						// dest_buf[1] = boundary_tmp - dest_buf
384
385	movq	tempLowBitsArray, %rcx					// tempLowBitsArray
386	movq	next_low_bits, %rbx						// next_low_bits
387	subq	%rcx, %rbx								// next_low_bits - tempLowBitsArray (in bytes)
388	sarq	$1, %rbx								// num_tenbits_to_pack (in half-words)
389
390	#define	size	%ebx
391
392	subl	$3, size								// pre-decrement num_tenbits_to_pack by 3
393	jl		1f										// if num_tenbits_to_pack < 3, skip the following loop
394
395	.align	4,0x90
3960:
397	movzwl	4(%rcx), %eax							// w2
398	addq	$6, %rcx								// next w0/w1/w2 triplet
399	sall	$10, %eax								// w1 << 10
400	or		-4(%rcx), %ax							// w1
401	addq	$4, %rdi								// dest_buf++
402	sall	$10, %eax								// w1 << 10
403	or		-6(%rcx), %ax							// (w0) | (w1<<10) | (w2<<20)
404	subl	$4, byte_count							// fill in a new 4-bytes word
405	jle		L_budgetExhausted
406	subl	$3, size								// num_tenbits_to_pack-=3
407	movl	%eax, -4(%rdi)							// pack w0,w1,w2 into 1 dest_buf word
408	jge		0b										// if no less than 3 elements, back to loop head
409
4101: 	addl	$3, size								// post-increment num_tenbits_to_pack by 3
411	je		3f										// if num_tenbits_to_pack is a multiple of 3, skip the following
412	movzwl	(%rcx), %eax							// w0
413	subl	$1, size								// num_tenbits_to_pack--
414	je		2f										//
415	movzwl	2(%rcx), %edx							// w1
416	sall	$10, %edx								// w1 << 10
417	orl		%edx, %eax								// w0 | (w1<<10)
4182:
419	subl	$4, byte_count							// fill in a new 4-bytes word
420	jle		L_budgetExhausted
421	movl	%eax, (%rdi)							// write the final dest_buf word
422	addq	$4, %rdi								// dest_buf++
423
4243:	movq	%rdi, %rax								// boundary_tmp
425	subq	dest_buf, %rax							// boundary_tmp - dest_buf
426	shrq	$2, %rax								// boundary_tmp - dest_buf in terms of words
427	movl	%eax, 8(dest_buf)						// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
428	shlq	$2, %rax								// boundary_tmp - dest_buf in terms of bytes
429
430L_done:
431	// restore registers and return
432	addq	$(48+64), %rsp
433	popq	%rbx
434	popq	%r12
435	popq	%r13
436	popq	%r14
437	popq	%r15
438	leave
439	ret
440
441    .align  4
442L_budgetExhausted:
443	mov		$-1, %rax
444	jmp		L_done
445
446
447	.align 4,0x90
448L_RECORD_EXACT:
449	subq	dictionary, %rcx					// dict_location - dictionary
450	sarq	$2, %rcx							// divide by 4 for word offset
451	movb	$3, -1(next_tag)					// *next_tag = 3 for exact
452	movb	%cl, (next_qp)						// *next_qp = word offset (4-bit)
453	incq	next_qp								// next_qp++
454	cmpq	next_input_word, checkpoint         // checkpoint time?
455	jne     L_scan_loop
456	jmp	    CHECKPOINT
457
458	.align 4,0x90
459L_RECORD_PARTIAL:
460	movq	%rcx, %rax							// dict_location
461	movb	$1, -1(next_tag)					// *next_tag = 1 for partial matched
462	subq	dictionary, %rax					// dict_location - dictionary
463	movl	%edx, (%rcx)						// *dict_location = input_word;
464	sarq	$2, %rax							// offset in 32-bit word
465	movb	%al, (next_qp)						// update *next_qp
466	andl	$1023, %edx							// lower 10 bits
467	incq	next_qp								// next_qp++
468	mov		%dx, (next_low_bits)				// save next_low_bits
469	addq	$2, next_low_bits					// next_low_bits++
470	cmpq	next_input_word, checkpoint         // checkpoint time?
471	jne     L_scan_loop
472
473CHECKPOINT:
474
475    cmpq	end_of_input, checkpoint            // end of buffer or compression ratio check?
476    jne     L_check_compression_ratio
477
478L_check_zero_page:
479                                                // check if any dictionary misses in page
480    cmpq    start_next_full_patt, next_full_patt
481    jne     L_check_single_value_page
482
483    cmpq    start_next_qp, next_qp              // check if any partial or exact dictionary matches
484    jne     L_check_single_value_page
485
486    mov     SV_RETURN, %rax                     // Magic return value
487    jmp     L_done
488
489L_check_single_value_page:
490
491    movq    next_full_patt, %rax                // get # dictionary misses
492    subq    start_next_full_patt, %rax
493    shrq    $2, %rax
494
495    movq    next_qp, %r11                       // get # dictionary hits (exact + partial)
496    subq    start_next_qp, %r11
497
498    movq    next_low_bits, %r13                 // get # dictionary partial hits
499    subq    start_next_low_bits, %r13
500    shrq    $1, %r13
501
502    movq    tempTagsArray, %r14                 // get the address of the first tag
503
504    // Single value page if one of the follwoing is true:
505    //  partial == 0 AND hits == 1023 AND miss == 1 AND tag[0] == 2 (i.e. miss)
506    //  partial == 1 AND hits == 1024 AND tag[0] == 1 (i.e. partial)
507    //
508    cmpq    $0, %r13                            // were there 0 partial hits?
509    jne     1f
510
511    cmpq    $1023, %r11                         // were there 1023 dictionary hits
512    jne     1f
513
514    cmpq    $1, %rax                            // was there exacly 1 dictionary miss?
515    jne     1f
516
517    cmpb    $2, 0(%r14)                         // was the very 1st tag a miss?
518    je      L_is_single_value_page
519
5201:
521    cmpq    $1, %r13                            // was there 1 partial hit?
522    jne     L_check_mostly_zero
523
524    cmpq    $1024, %r11                         // were there 1024 dictionary hits
525    jne     L_check_mostly_zero
526
527    cmpb    $1, 0(%r14)                         // was the very 1st tag a partial?
528    jne     L_check_mostly_zero
529
530L_is_single_value_page:
531
532    mov     SV_RETURN, %rax                     // Magic return value
533    jmp     L_done
534
535L_check_mostly_zero:
536                                                // how much space will the sparse packer take?
537    addq    %r11, %rax                          // rax += (next_qp - start_next_qp)
538    movq    $6, %rdx
539    mulq    %rdx                                // rax *= 6 (i.e. 4 byte word + 2 byte offset)
540    addq    $4, %rax                            // rax += 4 byte for header
541    movq    %rax, %r11
542                                                // how much space will the defaut packer take?
543    movq    next_low_bits, %rax
544    subq    start_next_low_bits, %rax           // get bytes consumed by lower-10 bits
545    movq    $1365, %rdx
546    mulq    %rdx
547    shrq    $11, %rax                           // rax = 2/3*(next_low_bits - start_next_low_bits)
548    movq    next_full_patt, %rdx
549    subq    start_next_full_patt, %rdx          // get bytes consumed by dictionary misses
550    addq    %rdx, %rax                          // rax += (next_full_patt - start_next_full_patt)
551    movq    next_qp, %rdx
552    subq    start_next_qp, %rdx
553    shrq    $1, %rdx                            // get bytes consumed by dictionary hits
554    addq    %rdx, %rax                          // rax += (next_qp - start_next_qp)/2
555    addq    $(12+256), %rax                     // rax += bytes taken by the header + tags
556
557    cmpq    %r11, %rax                          // is default packer the better option?
558    jb      L_done_search
559
560    cmpl    byte_budget, %r11d                  // can the sparse packer fit into the given budget?
561    ja      L_budgetExhausted
562
563L_sparse_packer:
564
565    movl    MZV_MAGIC, 0(dest_buf)              // header to indicate a sparse packer
566    addq    $4, dest_buf
567
568    movq    $0, %rdx                            // rdx = byte offset in src of non-0 word
569    movq    start_next_input_word, %r8
5701:
571    movq    0(%r8, %rdx), %rax                  // rax = read dword
572	testq	%rax, %rax                          // is dword == 0
573    jne     5f
5743:
575    addq    $8, %rdx                            // 8 more bytes have been processed
5764:
577    cmpq    $4096, %rdx
578    jne     1b
579    movq    %r11, %rax                          // store the size of the compressed stream
580    jmp     L_done
581
5825:
583    testl   %eax, %eax                          // is lower word == 0
584    je      6f
585    movl    %eax, 0(dest_buf)                   // store the non-0 word in the dest buffer
586    mov     %dx, 4(dest_buf)                    // store the byte index
587    addq    $6, dest_buf
5886:
589    shrq    $32, %rax                           // get the upper word into position
590    testl   %eax, %eax                          // is upper word == 0
591    je      3b
592    addq    $4, %rdx
593    movl    %eax, 0(dest_buf)                   // store the word in the dest buffer
594    mov     %dx, 4(dest_buf)                    // store the byte index
595    addq    $6, dest_buf
596    addq    $4, %rdx
597    jmp     4b
598
599L_check_compression_ratio:
600
601    movq    end_of_input, checkpoint            // checkpoint = end of buffer
602
603    movq    next_low_bits, %rax
604    subq    start_next_low_bits, %rax           // get bytes consumed by lower-10 bits
605    movq    $1365, %rdx
606    mulq    %rdx
607    shrq    $11, %rax                           // rax = 2/3*(next_low_bits - start_next_low_bits)
608
609    movq    next_full_patt, %rdx
610    subq    start_next_full_patt, %rdx          // get bytes consumed by dictionary misses
611    addq    %rdx, %rax                          // rax += (next_full_patt - start_next_full_patt)
612
613    movq    next_qp, %rdx
614    subq    start_next_qp, %rdx
615    shrq    $1, %rdx
616    addq    %rdx, %rax                          // rax += (next_qp - start_next_qp)/2
617
618    addq    $CHKPT_TAG_BYTES, %rax              // rax += bytes taken by the tags
619    cmpq    $CHKPT_SHRUNK_BYTES, %rax
620    ja      L_budgetExhausted                   // compressed size exceeds budget
621    jmp     L_scan_loop
622
623