xref: /xnu-11215.41.3/osfmk/arm64/WKdmCompress_16k.s (revision 33de042d024d46de5ff4e89f2471de6608e37fa4)
1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains arm64 hand optimized implementation of WKdm memory page compressor.
31
32 	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
33
34	input :
35		src_buf : address of input page (length = 1024 words)
36		dest_buf : address of output buffer (may not be 16-byte aligned)
37		scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
38		bytes_budget : a given byte target in compression
39
40	output :
41
42		if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
43		o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
44		During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
45		When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
46		The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
47
48 The 4th argument bytes_budget is the target compress budget in bytes.
49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
51
52 WKdm Compression algorithm is briefly stated as follows:
53
54	There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
55	For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
56		a. k = (x>>10)&255;						// 8-bit hash table index
57		b. dict_index = hashTable[k];			// 4-bit dictionary index, hashTable[] is fixed
58		c. dict_word = dictionary[dict_index];	// 32-bit dictionary word, dictionary[] is dynamically updated
59
60 	Each input word x is classified/tagged into 4 classes :
61		0 : x = 0
62		1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
63  		2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
64  		3 : (x == dict_word), the exact input word is in the dictionary
65
66	For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
67		0 : 2-bits tag (32->2 compression)
68		1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
69		2 : 2-bits tag + 32-bits new word (32->34 expansion)
70		3 : 2-bits tag + 4-bits dict_index (32->6 compression)
71
72	It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
73
74	the output bit stream (*dest_buf) consists of
75		a. 12 bytes header
76		b. 256 bytes for 1024 packed tags
77		c. (varying number of) words for new words not matched to dictionary word.
78		d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
79		e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
80
81	the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
82	Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
83
84
85	The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
86
87	The temp buffers are
88
89		uint8_t 	tempTagsArray[1024];			// temporary saving for tags before final packing
90		uint8_t 	tempQPosArray[1024];			// temporary saving for dict_indices before final packing
91		uint16_t 	tempLowBitsArray[1024];			// temporary saving for partially matched lower 10 bits before final packing
92
93	Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
94	the destination buffer.
95
96		uint32_t	*new_word = dest_buf+3+64;		// 3 words for header, 64 words for tags, new words come right after the tags.
97
98	Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass.
99
100	byte_count -= 12 + 256;		// bit budget minus header and tags
101
102	whenever an input word is classified as class
103
104		2 : byte_count -= 4;
105
106	the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0).
107
108	without showing the bit budget management, the pseudo code is given as follows:
109
110	uint8_t 	*tags=tempTagsArray;
111	uint8_t 	*dict=tempQPosArray;
112	uint8_t 	*partial=tempLowBitsArray;
113
114	for (i=0;i<1024;i++) {
115			x = *src_buf++;
116			if (x == 0) {		// zero, 2-bits tag
117					*tags++ = 0;
118			} else {
119
120				// find dict_index and dict_word from x
121				k = (x>>10)&255;
122				dict_index = hashTable[k];
123				dict_word = dictionary[dict_index];
124
125				if (dict_word == x) { // exactly match
126					// 2-bits tag + 4-bits table index
127					*tags++ = 3;
128					*dict++ = dict_index;
129				} else if (((x^dict_word)>>10)==0) {	// 22 higher bits matched
130					// 2-bits tag + 4-bits table index + 10-bits lower partial
131					*tags++ = 1;
132                    *dict++ = dict_index;
133					*partial++ = x &0x3ff;
134					dictionary[dict_index] = x;
135				} else {	// not matched
136					// 2-bits tag + 32-bits new word
137					*tags++ = 2;
138					*new_word++ = x;
139					dictionary[dict_index] = x;
140				}
141			}
142	}
143
144	after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
145
146		1. 1024 tags are packed into 256 bytes right after the 12-bytes header
147		2. dictionary indices (4-bits each) are packed into are right after the new words section
148		3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
149
150 	cclee, 11/9/12
151
152    Added zero page, single value page, sparse page, early abort optimizations
153    rsrini, 09/14/14
154*/
155
156#if KERNEL
157#include <machine/asm.h>
158#endif /* KERNEL */
159
160#define PAGES_SIZE_IN_KBYTES    16
161
162#ifndef PAGES_SIZE_IN_KBYTES
163#define PAGES_SIZE_IN_KBYTES    4
164#endif
165
166#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))
167#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
168#endif
169
170
171	.text
172	.align 4
173
174/*
175	int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
176*/
177
178.globl _WKdm_compress_16k
179_WKdm_compress_16k:
180
181/*
182	 -------------------------       symbolizing register use          -----------------------------------
183*/
184	#define	src_buf				x0
185	#define	next_input_word		x0
186	#define	dest_buf			x1
187	#define	scratch				x2
188	#define	byte_count			x3
189	#define	next_tag			x4
190	#define	tempTagsArray		x2		// scratch
191	#define	dictionary			x5
192	#define	remaining			x6
193	#define	next_full_patt		x7
194	#define	dict_location		x8
195	#define	wdict_location		w8
196	#define	next_qp				x9
197	#define	hashTable			x10
198	#define tempQPosArray		x11
199	#define	next_low_bits		x12
200
201/*
202	this arm64 assembly code is ported from x86_64 assembly code,
203	therefore need such symbolization to quickly reuse the x86_64 assembly code
204	for these intermediate/temporary register use
205*/
206	#define	rax					x13
207	#define	eax					w13
208	#define	rcx					x14
209	#define	ecx					w14
210	#define	rdx					x15
211	#define	edx					w15
212	#define	rdi					x0			/* after some point, x0/rdi becomes free other usage */
213
214
215/*
216		-------------------------    scratch  memory  --------------------------------------
217
218	need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray)
219	total 6208 bytes
220	[sp,#0]         : dictionary
221	[scratch,#0]    : tempTagsArray
222	[scratch,#1024] : tempQPosArray
223	[scratch,#2048] : tempLowBitsArray
224*/
225
226#define	scale	(PAGES_SIZE_IN_KBYTES/4)
227
228#define SV_RETURN           0                       // return value when SV, ZV page is found
229#define MZV_MAGIC           17185                   // magic value used to identify MZV page encoding
230#define CHKPT_BYTES         416                     // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
231#define CHKPT_WORDS         (CHKPT_BYTES/4)         // checkpoint bytes in words
232#define CHKPT_TAG_BYTES     (CHKPT_BYTES/16)        // size of the tags for  CHKPT_BYTES of data
233#define CHKPT_SHRUNK_BYTES  426                     // for early aborts: max size of compressed stream to allow further processing ..
234                                                    //      .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
235#if CHKPT_BYTES > 4096
236    #error CHKPT_BYTES must be <= 4096
237#endif
238#if CHKPT_BYTES < 4
239    #error CHKPT_BYTES must be >= 4
240#endif
241
242#if KERNEL
243	ARM64_PROLOG
244    sub     sp, sp, #64
245    st1.4s  {v0,v1,v2,v3},[sp]
246#endif
247
248    sub     sp, sp, #64					// allocate for dictionary
249	mov		dictionary, sp				// use x5 to point to sp, so we can use sub xd, xn, sp
250
251    sub     sp, sp, #64                 // allocate space for saving callee-saved registers
252	mov		x15, sp
253    stp     x20, x21, [x15, #0]         // save x20, x21
254    stp     x22, x23, [x15, #16]        // save x22, x23
255    stp     x24, x25, [x15, #32]        // save x24, x25
256    stp     x26, x27, [x15, #48]        // save x26, x27
257
258/*
259		-------  entwined statck space allocation, registers set up, and PRELOAD_DICTIONARY -------------------
260*/
261
262                                            // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
263                                            // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
264	mov		next_tag, tempTagsArray			// &tempTagsArray[0]
265	add		next_qp, scratch, #(1024*scale)	// next_qp
266	mov		remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint
267	add		next_full_patt, dest_buf, #(12+256*scale) 	// dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
268	sub		byte_count, byte_count, #(12+256*scale)	// bit_count - header - tags
269	add		next_low_bits, scratch, #(2048*scale)	// &tempLowBitsArray[0]
270	stp		xzr, xzr, [dictionary, #0]		// initialize dictionary
271	adrp    hashTable, _hashLookupTable@GOTPAGE
272	stp		xzr, xzr, [dictionary, #16]		// initialize dictionary
273	stp		xzr, xzr, [dictionary, #32]		// initialize dictionary
274    ldr 	hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF]
275	stp		xzr, xzr, [dictionary, #48]		// initialize dictionary
276
277#define EARLYCHECK              0
278#define NORMAL                  1
279
280#define mode                    w20
281#define start_next_full_patt    x21
282#define start_next_input_word   x22
283#define start_next_low_bits     x23
284#define r11                     x24
285#define r13                     x25
286#define byte_budget             x26
287#define start_next_qp           tempQPosArray
288
289	add		tempQPosArray, scratch, #(1024*scale)	    // &tempQPosArray[0]
290    mov     mode, EARLYCHECK                            // indicate we are yet to evaluate the early aborts
291    mov     start_next_full_patt, next_full_patt        // remember the start of next_full_patt
292    mov     start_next_input_word, next_input_word      // remember the start of next_input_word
293    mov     start_next_low_bits, next_low_bits          // remember the start of next_low_bit
294    add     byte_budget, byte_count, #(12+256*scale)    // remember the byte budget
295
296	b		L_loop
297
298	.align	4, 0x90
299
300	/* we've just detected a zero input word in edx */
301L_RECORD_ZERO:
302	strb	edx, [next_tag], #1				// *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0
303	subs	remaining, remaining, #1		// remaing--;
304	b.le	CHECKPOINT   					// if remaining = 0, break
305
306	/* --------------    scan/tag pass loop -------------------------  */
307L_loop:
308
309	/* load new input word to edx */
310	ldr		edx, [next_input_word], #4
311	cbz		edx, L_RECORD_ZERO							// if (input_word==0) RECORD_ZERO
312
313	/*
314		now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
315	*/
316	ubfm	eax, edx, #10, #17
317	ldrb	wdict_location, [hashTable, rax]		// HASH_TO_DICT_BYTE_OFFSET(input_word)
318	ldr		eax, [dictionary, dict_location]		// dict_word = *dict_location;
319
320	/* detect whether we match input to its corresponding dictionary word */
321	eor		eax, eax, edx							// dict_word vs input_word
322	cbz		eax, L_RECORD_EXACT						// if identical, RECORD_EXACT
323	lsr		eax, eax, #10							// HIGH_BITS(dict_word^input_word)
324	cbz		eax, L_RECORD_PARTIAL					// if identical, RECORD_PARTIAL
325
326L_RECORD_MISS:
327/*
328	if we are here, the input word can not be derived from the dictionary,
329	we write the input word as a new word,
330	and update the dictionary with this new word
331*/
332	subs	byte_count, byte_count, #4				// byte_count -= 4
333	b.le	L_budgetExhausted						// return -1 to signal this page is not compressable
334	str		edx, [next_full_patt], #4				// *next_full_patt++ = input_word;
335	mov		eax, #2									// tag for MISS
336	subs	remaining, remaining, #1				// remaing--;
337	str		edx, [dictionary, dict_location]		// *dict_location = input_word
338	strb	eax, [next_tag], #1						// *next_tag++ = 2 for miss
339	b.gt	L_loop									// // if remaining > 0, repeat
340    b       CHECKPOINT
341
342L_done_search:
343
344	// SET_QPOS_AREA_START(dest_buf,next_full_patt);
345	/* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */
346
347	sub		rax, next_full_patt, dest_buf			// next_full_patt - dest_buf
348	lsr		eax, eax, #2							// offset in 4-bytes
349	str		eax, [dest_buf]							// dest_buf[0] = next_full_patt - dest_buf
350
351	/* --------------------------     packing 1024 tags into 256 bytes ----------------------------------------*/
352	// boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
353
354	add		rdi, dest_buf, #12						// dest_buf
355	mov		rcx, tempTagsArray						// &tempTagsArray[0]
356
357L_pack_2bits:
358	ld1.2s  {v0,v1,v2,v3},[rcx],#32
359
360	shl.2d	v1,v1,#4
361	shl.2d	v3,v3,#4
362
363	orr.8b	v0, v0, v1
364	orr.8b	v2, v2, v3
365
366	ushr.2d	v1, v0, #30
367	ushr.2d	v3, v2, #30
368
369	orr.8b	v0, v0, v1
370	orr.8b	v2, v2, v3
371
372	zip1.2s	v0, v0, v2
373	st1.2s  {v0},[rdi],#8
374	cmp		next_tag, rcx
375	b.hi	L_pack_2bits
376
377	/* ---------------------------------      packing 4-bits dict indices into dest_buf ----------------------------------   */
378
379	/* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
380	sub		rax, next_qp, tempQPosArray				// eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
381	add		eax, eax, #7							// num_bytes_to_pack+7
382	lsr		eax, eax, #3							// num_packed_words = (num_bytes_to_pack + 7) >> 3
383	add		rcx, tempQPosArray, rax, lsl #3			// endQPosArray = tempQPosArray + 2*num_source_words
384	lsl		rax, rax, #2
385	subs	byte_count, byte_count, rax
386	b.lt	L_budgetExhausted
387
388	cmp		rcx, next_qp							// endQPosArray vs next_qp
389	b.ls	2f 										// if (next_qp >= endQPosArray) skip the following zero paddings
390	sub		rax, rcx, next_qp
391	mov		edx, #0
392	tst		eax, #4
393	b.eq	1f
394	str		edx, [next_qp], #4
3951:	tst		eax, #2
396	b.eq	1f
397	strh	edx, [next_qp], #2
3981:	tst		eax, #1
399	b.eq	2f
400	strb	edx, [next_qp], #1
4012:
402	mov		rdi, next_full_patt						// next_full_patt
403	cmp		rcx, tempQPosArray						// endQPosArray vs tempQPosArray
404	ldr		eax, [dest_buf]
405	b.ls	L20										// if (endQPosArray <= tempQPosArray) skip the following
406	mov		rdx, tempQPosArray						// tempQPosArray
407
408	/* packing 4-bits dict indices into dest_buf */
409L_pack_4bits:
410	ldr		rax, [rdx], #8							// src_next[1]:src_next[0]
411	orr		rax, rax, rax, lsr #28					// eax = src_next[0] | (src_next[1] << 4)
412	cmp		rcx, rdx								// source_end vs src_next
413	str		eax, [rdi], #4							// *dest_next++ = temp;
414	b.hi	L_pack_4bits							// while (src_next < source_end) repeat the loop
415
416	// SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
417	sub		rax, rdi, dest_buf						// boundary_tmp - dest_buf
418	lsr		eax, eax, #2							// boundary_tmp - dest_buf in words
419L20:
420	str		eax, [dest_buf,#4]						// dest_buf[1] = boundary_tmp - dest_buf
421
422
423
424	/*  --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[]   ----------------------------------------- */
425
426	add		rcx, scratch, #(2048*scale)				// tempLowBitsArray
427    sub		rdx, next_low_bits, rcx					// next_low_bits - tempLowBitsArray (in bytes)
428	lsr		rdx, rdx, #1							// num_tenbits_to_pack (in half-words)
429	subs	edx, edx, #3							// pre-decrement num_tenbits_to_pack by 3
430	b.lt	1f										// if num_tenbits_to_pack < 3, skip the following loop
4310:
432	subs	byte_count, byte_count, #4				// byte_count -= 4
433	b.le	L_budgetExhausted						// return -1 to signal this page is not compressable
434	subs	edx, edx, #3							// num_tenbits_to_pack-=3
435	ldr		rax, [rcx], #6
436	bfm		rax, rax, #58, #9						// pack 1st toward 2nd
437	bfm		rax, rax, #58, #25						// pack 1st/2nd toward 3rd
438	lsr		rax, rax, #12
439	str		eax, [rdi], #4							// pack w0,w1,w2 into 1 dest_buf word
440	b.ge	0b										// if no less than 3 elements, back to loop head
441
4421: 	adds	edx, edx, #3							// post-increment num_tenbits_to_pack by 3
443	b.eq	3f										// if num_tenbits_to_pack is a multiple of 3, skip the following
444	subs	byte_count, byte_count, #4				// byte_count -= 4
445	b.le	L_budgetExhausted						// return -1 to signal this page is not compressable
446	ldrh	eax,[rcx]								// w0
447	subs	edx, edx, #1							// num_tenbits_to_pack--
448	b.eq	2f										//
449	ldrh	edx, [rcx, #2]							// w1
450	orr		eax, eax, edx, lsl #10					// w0 | (w1<<10)
451
4522:	str		eax, [rdi], #4							// write the final dest_buf word
453
4543:	sub		rax, rdi, dest_buf						// boundary_tmp - dest_buf
455	lsr		eax, eax, #2							// boundary_tmp - dest_buf in terms of words
456	str		eax, [dest_buf, #8]						// SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
457	lsl		w0, eax, #2								// boundary_tmp - dest_buf in terms of bytes
458
459L_done:
460
461	// restore registers and return
462	mov		x15, sp
463    ldp     x20, x21, [x15, #0]             // restore x20, x21
464    ldp     x22, x23, [x15, #16]            // restore x22, x23
465    ldp     x24, x25, [x15, #32]            // restore x24, x25
466    ldp     x26, x27, [x15, #48]            // restore x26, x27
467    add     sp, sp, #128					// deallocate for dictionary + saved register space
468
469#if KERNEL
470	ld1.4s  {v0,v1,v2,v3},[sp],#64
471#endif
472	ret		lr
473
474    .align  4
475L_budgetExhausted:
476    mov     x0, #-1
477    b       L_done
478
479
480	.align 4,0x90
481L_RECORD_EXACT:
482/*
483		we have an exact match of the input word to its corresponding dictionary word
484		write tag/dict_index to the temorary buffers
485*/
486	mov		eax, #3
487	lsr		w14, wdict_location, #2				// divide by 4 for word offset
488	subs	remaining, remaining, #1			// remaing--;
489	strb	eax, [next_tag], #1					// *next_tag++ = 3 for exact
490	strb	w14, [next_qp], #1					// *next_qp = word offset (4-bit)
491	b.gt	L_loop
492	b		CHECKPOINT   						// if remaining = 0, break
493
494	.align 4,0x90
495L_RECORD_PARTIAL:
496/*
497		we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
498		write tag/dict_index/low 10 bits to the temorary buffers
499*/
500	mov		ecx, #1
501	strb	ecx, [next_tag], #1					// *next_tag++ = 1 for partial matched
502	str		edx, [dictionary, dict_location]	// *dict_location = input_word;
503	subs	remaining, remaining, #1			// remaing--;
504	lsr		eax, wdict_location, #2				// offset in 32-bit word
505	and		edx, edx, #1023						// lower 10 bits
506	strb	eax, [next_qp], #1					// update *next_qp++
507	strh	edx, [next_low_bits], #2			// save next_low_bits++
508	b.gt	L_loop
509
510CHECKPOINT:
511
512    cbz     mode, L_check_compression_ratio             // if this this an early abort check..
513
514L_check_zero_page:
515
516    cmp     start_next_full_patt, next_full_patt        // check if any dictionary misses in page
517    b.ne    L_check_single_value_page
518
519    cmp     start_next_qp, next_qp                      // check if any partial or exact dictionary matches
520    b.ne    L_check_single_value_page
521
522    mov     x0, #SV_RETURN                              // Magic return value
523    b       L_done
524
525L_check_single_value_page:
526
527    sub     rax, next_full_patt, start_next_full_patt   // get # dictionary misses
528    lsr     rax, rax, #2
529
530    sub     r11, next_qp, start_next_qp                 // get # dictionary hits (exact + partial)
531
532    sub     r13, next_low_bits, start_next_low_bits     // get # dictionary partial hits
533    lsr     r13, r13, #1
534
535    // Single value page if one of the follwoing is true:
536    //  partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss)
537    //  partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial)
538    //
539    cbnz    r13, 1f                                     // were there 0 partial hits?
540
541    cmp     r11, #(256*PAGES_SIZE_IN_KBYTES - 1)        // were there 1023 dictionary hits
542    b.ne    1f
543
544    cmp     rax, #1                                     // was there exacly 1 dictionary miss?
545    b.ne    1f
546
547    ldrb    edx, [tempTagsArray]                        // read the very 1st tag
548    cmp     edx, #2                                     // was the very 1st tag a miss?
549    b.eq    L_is_single_value_page
550
5511:
552    cmp     r13, #1                                     // was there 1 partial hit?
553    b.ne    L_check_mostly_zero
554
555    cmp     r11, #(256*PAGES_SIZE_IN_KBYTES)           // were there 1024 dictionary hits
556    b.ne    L_check_mostly_zero
557
558    ldrb    edx, [tempTagsArray]                        // read the very 1st tag
559    cmp     edx, #1                                     // was the very 1st tag a partial?
560    b.ne    L_check_mostly_zero
561
562L_is_single_value_page:
563
564    mov     x0, #SV_RETURN                              // Magic return value
565    b       L_done
566
567L_check_mostly_zero:
568                                                        // how much space will the sparse packer take?
569    add     rax, rax, r11                               // rax += (next_qp - start_next_qp)
570    mov     rdx, #6
571    mov     rcx, #4
572    madd    r11, rax, rdx, rcx                          // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
573
574    sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
575    mov     rdx, #1365
576    mul     rax, rax, rdx
577
578    sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
579    add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
580
581    sub     rdx, next_qp, start_next_qp
582    add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
583    add     rax, rax, #(12+256*scale)                   // rax += bytes taken by the header + tags
584
585    cmp     rax, r11                                    // is the default packer the better option?
586    b.lt    L_done_search
587
588    cmp     r11, byte_budget                            // can the sparse packer fit into the given budget?
589    b.gt    L_budgetExhausted
590
591L_sparse_packer:
592    mov     edx, #MZV_MAGIC
593    str     edx, [dest_buf], #4                         // header to indicate a sparse packer
594
595    mov     rdx, #0                                     // rdx = byte offset in src of non-0 word
5961:
597    ldr     rax, [start_next_input_word, rdx]           // rax = read dword
598    cbnz    rax, 5f                                     // is dword != 0
5993:
600    add     rdx, rdx, #8                                // 8 more bytes have been processed
6014:
602    cmp     rdx, #(4096*scale)                          // has the entire page been processed
603    b.ne    1b
604    mov     x0, r11                                     // store the size of the compressed stream
605    b       L_done
606
6075:
608    cbz     eax, 6f                                     // is lower word == 0
609    str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
610    strh    edx, [dest_buf], #2                         // store the byte index
6116:
612    lsr     rax, rax, 32                                // get the upper word into position
613    cbz     eax, 3b                                     // is dword == 0
614    add     rdx, rdx, #4
615    str     eax, [dest_buf], #4                         // store the non-0 word in the dest buffer
616    strh    edx, [dest_buf], #2                         // store the byte index
617    add     rdx, rdx, #4
618    b       4b
619
620L_check_compression_ratio:
621
622    mov     mode, NORMAL
623	mov		remaining, #((1024 - CHKPT_WORDS)*scale)    // remaining input words to process
624    cbz     remaining, CHECKPOINT                       // if there are no remaining words to process
625
626    sub     rax, next_low_bits, start_next_low_bits     // get bytes consumed by lower-10 bits
627    mov     rdx, #1365
628    mul     rax, rax, rdx
629
630    sub     rdx, next_full_patt, start_next_full_patt   // get bytes consumed by dictionary misses
631    add     rax, rdx, rax, lsr #11                      // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
632
633    sub     rdx, next_qp, start_next_qp
634    add     rax, rax, rdx, lsr #1                       // rax += (next_qp - start_next_qp)/2
635    subs    rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale)
636                                                        // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES
637
638    b.gt    L_budgetExhausted                           // if rax is > 0, we need to early abort
639    b       L_loop                                      // we are done
640