xref: /xnu-10002.61.3/osfmk/x86_64/WKdmDecompress_new.s (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains x86_64 hand optimized implementation of WKdm memory page decompressor.
31
32	void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
33
34	input :
35		src_buf : address of input compressed data buffer
36		dest_buf : address of output decompressed buffer
37		scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38		words : this argument is not used in the implementation
39
40	output :
41
42		the input buffer is decompressed and the dest_buf is written with decompressed data.
43
44	Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress x86_64 assembly code WKdmCompress.s
45
46	The bit stream (*src_buf) consists of
47		a. 12 bytes header
48		b. 256 bytes for 1024 packed tags
49		c. (varying number of) words for new words not matched to dictionary word.
50		d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51		e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
52
53	where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
54
55	The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
56
57		for (i=0;i<1024;i++) {
58			tag = *next_tag++
59			switch (tag) {
60				case 0 : *dest_buf++ = 0; break;
61				case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62				case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63				case 3 : *dest_buf++ = dictionary[*dict_index++];  break;
64			}
65
66 	cclee, 11/30/12
67
68    Added zero page, single value page, sparse page, early abort optimizations
69    rsrini, 09/14/14
70*/
71
72#define MZV_MAGIC           $17185      // magic value used to identify MZV page encoding
73
74	.text
75
76	.globl _WKdm_decompress_new
77_WKdm_decompress_new:
78
79	// save registers, and allocate stack memory for local variables
80
81	pushq	%rbp
82	movq	%rsp, %rbp
83	pushq	%r12
84	pushq	%r13
85	pushq	%rbx
86
87	subq	$(64+8+16), %rsp
88
89    movl    0(%rdi), %eax               // read the 1st word from the header
90    cmpl    MZV_MAGIC, %eax             // is the alternate packer used (i.e. is MZV page)?
91    jne     L_default_decompressor      // default decompressor was used
92
93                                        // Mostly Zero Page Handling...
94                                        // {
95    movq    $0, %rax
961:                                      // Zero out the entire page
97    movq    $0, 0(%rsi, %rax)
98    movq    $0, 8(%rsi, %rax)
99    movq    $0, 16(%rsi, %rax)
100    movq    $0, 24(%rsi, %rax)
101    movq    $0, 32(%rsi, %rax)
102    movq    $0, 40(%rsi, %rax)
103    movq    $0, 48(%rsi, %rax)
104    movq    $0, 56(%rsi, %rax)
105    addq    $64, %rax
106    cmpq    $4096, %rax
107    jne     1b
108
109    movq    $4, %r12                    // current byte position in src to read from
1102:
111    movl    0(%rdi, %r12), %eax         // get the word
112    movzwq  4(%rdi, %r12), %rdx         // get the index
113    movl    %eax, 0(%rsi, %rdx)         // store non-0 word in the destination buffer
114    addq    $6, %r12                    // 6 more bytes processed
115    cmpl    %ecx, %r12d                 // finished processing all the bytes?
116    jne     2b
117    jmp     L_done
118                                        // }
119
120L_default_decompressor:
121
122	movq	%rsi, %r12					// dest_buf
123	movq	%rdx, %r13					// scratch_buf
124
125	// PRELOAD_DICTONARY; dictionary starting address : starting address 0(%rsp)
126    // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
127#if 1
128	movl	$0, 0(%rsp)
129	movl	$0, 4(%rsp)
130	movl	$0, 8(%rsp)
131	movl	$0, 12(%rsp)
132	movl	$0, 16(%rsp)
133	movl	$0, 20(%rsp)
134	movl	$0, 24(%rsp)
135	movl	$0, 28(%rsp)
136	movl	$0, 32(%rsp)
137	movl	$0, 36(%rsp)
138	movl	$0, 40(%rsp)
139	movl	$0, 44(%rsp)
140	movl	$0, 48(%rsp)
141	movl	$0, 52(%rsp)
142	movl	$0, 56(%rsp)
143	movl	$0, 60(%rsp)
144#else
145	mov		$0x100000001, %rax
146	mov		%rax, (%rsp)
147	mov		%rax, 8(%rsp)
148	mov		%rax, 16(%rsp)
149	mov		%rax, 24(%rsp)
150	mov		%rax, 32(%rsp)
151	mov		%rax, 40(%rsp)
152	mov		%rax, 48(%rsp)
153	mov		%rax, 56(%rsp)
154#endif
155
156	// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
157
158	leaq	268(%rdi), %r10				// TAGS_AREA_END
159	leaq	12(%rdi), %rax				// TAGS_AREA_START
160	movq	%r13, %rsi					// tempTagsArray
161	cmpq	%rax, %r10					// TAGS_AREA_END vs TAGS_AREA_START
162	jbe		1f							// if TAGS_AREA_END <= TAGS_AREA_START, skip L_WK_unpack_2bits
163	movq	%r13, %rcx					// next_word
164	xorl	%r8d, %r8d					// i = 0
165	mov		$(50529027<<32)+50529027, %r9
166L_WK_unpack_2bits:
167	movl	12(%rdi,%r8, 4), %eax
168	movl	12(%rdi,%r8, 4), %edx
169	shrl	$2, %eax
170	shlq	$32, %rax
171	orq		%rdx, %rax
172	movq	%rax, %rdx
173	shrq	$4, %rax
174	andq	%r9, %rdx
175	andq	%r9, %rax
176	incq	%r8							// i++
177	movq	%rdx, (%rcx)
178	movq	%rax, 8(%rcx)
179	addq	$16, %rcx					// next_tags += 16
180	cmpq	$64, %r8					// i vs 64
181	jne		L_WK_unpack_2bits			// repeat loop until i==64
1821:
183
184
185	// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
186
187	mov		4(%rdi), %eax				// WKdm header qpos end
188	leaq	(%rdi,%rax,4), %r9			// QPOS_AREA_END
189	mov		0(%rdi), %eax				// WKdm header qpos start
190	leaq	(%rdi,%rax,4), %r8			// QPOS_AREA_START
191	leaq	1024(%r13), %rbx			// tempQPosArray
192	cmpq	%r8, %r9					// QPOS_AREA_END vs QPOS_AREA_START
193	jbe		1f							// if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
194	leaq	8(%rbx), %rcx				// next_qpos
195
196	mov		$(252645135<<32)+252645135, %r11
197L_WK_unpack_4bits:
198	movl	(%r8), %eax					// w = *next_word
199	movl	%eax, %edx					// w
200	shlq	$28, %rax
201	orq		%rdx, %rax
202	addq	$4, %r8						// next_word++
203	andq	%r11, %rax
204	movq	%rax, -8(%rcx)
205	addq	$8, %rcx					// next_qpos+=8
206	cmpq	%r8, %r9					// QPOS_AREA_END vs QPOS_AREA_START
207	ja		L_WK_unpack_4bits			// repeat loop until QPOS_AREA_END <= QPOS_AREA_START
208
209
2101:
211
212	// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
213
214	movl	8(%rdi), %eax				// LOW_BITS_AREA_END offset
215	leaq	(%rdi,%rax,4), %rdi			// LOW_BITS_AREA_END
216	leaq	2048(%r13), %r11			// tempLowBitsArray
217	leaq	4094(%r13), %r13			// final tenbits addr
218	sub		%r9, %rdi					// LOW_BITS_AREA_START vs LOW_BITS_AREA_END
219	jle		1f							// if START>=END, skip L_WK_unpack_3_tenbits
220	movq	%r11, %rcx					// next_low_bits
221L_WK_unpack_3_tenbits:
222	movl	(%r9), %eax					// w = *next_word, 0:c:b:a
223	movl	$(1023<<10), %edx
224	movl	$(1023<<20), %r8d
225	andl	%eax, %edx					// b << 10
226	andl	%eax, %r8d					// c << 20
227	andq	$1023, %rax
228	shll	$6, %edx
229	shlq	$12, %r8
230	orl		%edx, %eax
231	orq		%r8, %rax
232	cmp		%r13, %rcx
233	je		2f
234	mov		%rax, (%rcx)
235	jmp		3f
2362:	mov		%ax, (%rcx)
2373:
238	addq	$4, %r9						// next_word++
239	addq	$6, %rcx					// next_low_bits += 3
240	sub		$4, %rdi
241	jg		L_WK_unpack_3_tenbits		// repeat loop if LOW_BITS_AREA_END > next_word
2421:
243
244
245	#define	next_qpos		%rbx
246	#define	hash			%r8
247	#define	tags_counter	%edi
248	#define	dest_buf		%r12
249	#define next_full_patt	%r10
250
251	leaq	_hashLookupTable_new(%rip), hash	// hash look up table
252	movl	$1024, tags_counter				// tags_counter
253	jmp		L_next
254
255	.align 4,0x90
256L_nonpartital:
257	jl		L_ZERO_TAG
258	cmpb	$2, -1(%rsi)
259	je		L_MISS_TAG
260
261L_EXACT_TAG:
262	movzbl	(next_qpos), %eax				// qpos = *next_qpos
263	incq	next_qpos						// next_qpos++
264	decl	tags_counter					// tags_counter--
265	movl	(%rsp,%rax,4), %eax				// w = dictionary[qpos]
266	movl	%eax, -4(dest_buf)				// *dest_buf = w
267	je		L_done
268
269L_next:
270	incq	%rsi							// next_tag++
271	addq	$4, dest_buf
272	cmpb	$1, -1(%rsi)
273	jne		L_nonpartital
274
275L_PARTIAL_TAG:
276	movzbl	(next_qpos),%edx				// qpos = *next_qpos
277	incq	next_qpos						// next_qpos++
278	movl	(%rsp,%rdx,4), %eax				// read dictionary word
279	andl	$-1024, %eax					// clear lower 10 bits
280	or		(%r11), %ax						// pad the lower 10-bits from *next_low_bits
281	addq	$2, %r11						// next_low_bits++
282	decl	tags_counter					// tags_counter--
283	movl	%eax, (%rsp,%rdx,4)				// *dict_location = newly formed word
284	movl	%eax, -4(dest_buf)				// *dest_buf = newly formed word
285	jg		L_next							// repeat loop until next_tag==tag_area_end
286
287L_done:
288
289	// release stack memory, restore registers, and return
290
291	addq	$(64+8+16), %rsp
292	popq	%rbx
293	popq	%r13
294	popq	%r12
295	leave
296	ret
297
298	.align 4,0x90
299L_MISS_TAG:
300	movl	(next_full_patt), %edx			// w = *next_full_patt
301	movl	(next_full_patt), %eax			// w = *next_full_patt
302	shrl	$10, %edx						// w>>10
303	addq	$4, next_full_patt				// next_full_patt++
304	movzbl	%dl, %edx						// 8-bit hash table index
305	movl	%eax, -4(dest_buf)				// *dest_buf = word
306	movzbl	(hash,%rdx),%edx				// qpos
307	decl	tags_counter					// tags_counter--
308	movl	%eax, (%rsp,%rdx)				// dictionary[qpos] = word
309	jg		L_next							// repeat the loop
310	jmp		L_done
311
312	.align 4,0x90
313L_ZERO_TAG:
314	decl	tags_counter					// tags_counter--
315	movl	$0, -4(dest_buf)					// *dest_buf = 0
316	jg		L_next							// repeat the loop
317	jmp		L_done
318
319
320