xref: /xnu-8020.101.4/osfmk/arm/WKdmDecompress_new.s (revision e7776783b89a353188416a9a346c6cdb4928faad)
1/*
2 * Copyright (c) 2000-2013 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains armv7 hand optimized implementation of WKdm memory page decompressor.
31
32	void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, __unused__ unsigned int words);
33
34	input :
35		src_buf : address of input compressed data buffer
36		dest_buf : address of output decompressed buffer
37		scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller
38		words : this argument is not used in the implementation
39
40	output :
41
42		the input buffer is decompressed and the dest_buf is written with decompressed data.
43
44	Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress armv7 assembly code WKdmCompress.s
45
46	The bit stream (*src_buf) consists of
47		a. 12 bytes header
48		b. 256 bytes for 1024 packed tags
49		c. (varying number of) words for new words not matched to dictionary word.
50		d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
51		e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
52
53	where the header (of 3 words) specifies the ending boundaries (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
54
55	The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows
56
57		for (i=0;i<1024;i++) {
58			tag = *next_tag++
59			switch (tag) {
60				case 0 : *dest_buf++ = 0; break;
61				case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break;
62				case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break;
63				case 3 : *dest_buf++ = dictionary[*dict_index++];  break;
64			}
65
66 	cclee, 11/9/12
67
68    Added zero page, single value page, sparse page, early abort optimizations
69    rsrini, 09/14/14
70
71*/
72
73    #define MZV_MAGIC           17185      // magic value used to identify MZV page encoding
74
75	#define	ZERO				0
76	#define	PARTIAL_MATCH		1
77	#define	MISS_TAG			2
78	#define	MATCH				3
79
80	.text
81	.syntax unified
82	.align	4
83
84	// void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes);
85
86	.globl _WKdm_decompress_new
87_WKdm_decompress_new:
88
89	/*
90			--------   symbolizing registers --------
91			the armv7 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names.
92	*/
93
94	#define	src_buf			r0
95	#define	dest_buf		r1
96	#define	scratch			r2
97	#define	eax				r3
98	#define	ebx				r4
99	#define	hashTable		r4
100	#define	ecx				r5
101	#define	edx				r6
102    #define n_bytes         r8
103	#define	next_tag		r12
104	#define	tags_counter	lr
105	#define	dictionary		sp
106	#define	v0		q0
107	#define	v1		q1
108	#define	v2		q2
109	#define	v3		q3
110	#define	v4		q4
111	#define	v5		q5
112
113	// and scratch memory for local variables
114
115    // [sp,#0] : dictionary
116    // [scratch,#0] : tempTagsArray		was 64
117    // [scratch,#1024] : tempQPosArray  was 1088
118    // [scratch,#2048] : tempLowBitsArray was 2112
119
120	push	{r7, lr}
121	mov		r7, sp
122	push	{r4-r6,r8-r11}
123#if KERNEL
124	sub		ecx, sp, #96
125	sub		sp, sp, #96
126	vst1.64	{q0,q1},[ecx]!
127	vst1.64	{q2,q3},[ecx]!
128	vst1.64	{q4,q5},[ecx]!
129#endif
130	sub		sp, sp, #64			// allocate for dictionary
131
132    mov     n_bytes, r3                         // save the n_bytes passed as function args
133    ldr     eax, [src_buf]                      // read the 1st word from the header
134    mov     ecx, #MZV_MAGIC
135    cmp     eax, ecx                            // is the alternate packer used (i.e. is MZV page)?
136    bne     L_default_decompressor              // default decompressor was used
137
138                                                // Mostly Zero Page Handling...
139                                                // {
140    add     src_buf, src_buf, 4                 // skip the header
141    mov     eax, dest_buf
142    mov     ecx, #4096                          // number of bytes to zero out
143    mov     r9, #0
144    mov     r10, #0
145    mov     r11, #0
146    mov     r12, #0
1471:
148    subs    ecx, ecx, #64
149    stmia   eax!, {r9-r12}
150    stmia   eax!, {r9-r12}
151    stmia   eax!, {r9-r12}
152    stmia   eax!, {r9-r12}
153    bne     1b
154
155    mov     r12, #4                             // current byte position in src to read from
1562:
157    ldr     eax, [src_buf], #4                  // get the word
158    ldrh    edx, [src_buf], #2                  // get the index
159    str     eax, [dest_buf, edx]                // store non-0 word in the destination buffer
160    add     r12, r12, #6                        // 6 more bytes processed
161    cmp     r12, n_bytes                        // finished processing all the bytes?
162    bne     2b
163    b       L_done
164                                                // }
165
166L_default_decompressor:
167
168    /*
169			---------------------- set up registers and PRELOAD_DICTIONARY ---------------------------------
170	*/
171    // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR
172	vmov.i32	q0, #0
173	mov		r8, sp
174	adr		ebx, _table_2bits
175    vst1.64	{q0}, [r8]!
176	add		r10, src_buf, #268			// TAGS_AREA_END
177    vst1.64	{q0}, [r8]!
178	add		eax, src_buf, #12			// TAGS_AREA_START
179    vst1.64	{q0}, [r8]!
180	mov		ecx, scratch				// tempTagsArray
181    vst1.64	{q0}, [r8]!
182	vld1.64	{q0,q1},[ebx,:128]
183
184
185	// WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray);
186/*
187	unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes
188    for arm64, this can be done by
189		1. read the input 32-bit word into GPR w
190    	2. duplicate GPR into 4 elements in a vector register v0
191    	3. ushl.4s vd, v0, vshift   where vshift = {0, -2, -4, -6}
192    	4. and.4s  vd, vd, vmask    where vmask = 0x03030303030303030303030303030303
193*/
194
195L_WK_unpack_2bits:
196	vld1.64	{v5}, [eax]!				// read 4 32-bit words for 64 2-bit tags
197	vdup.32	v2, d10[0]					// duplicate to 4 elements
198	vdup.32	v3, d10[1]					// duplicate to 4 elements
199	vdup.32	v4, d11[0]					// duplicate to 4 elements
200	vdup.32	v5, d11[1]					// duplicate to 4 elements
201	vshl.u32	v2, v2, v0				// v0 = {0, -2, -4, -6}
202	vshl.u32	v3, v3, v0				// v0 = {0, -2, -4, -6}
203	vshl.u32	v4, v4, v0				// v0 = {0, -2, -4, -6}
204	vshl.u32	v5, v5, v0				// v0 = {0, -2, -4, -6}
205	vand	v2, v2, v1					// v1 = {3,3,...,3}
206	vand	v3, v3, v1					// v1 = {3,3,...,3}
207	vand	v4, v4, v1					// v1 = {3,3,...,3}
208	vand	v5, v5, v1					// v1 = {3,3,...,3}
209	vst1.64	{v2,v3}, [ecx,:128]!		// write 64 tags into tempTagsArray
210	cmp		r10, eax					// TAGS_AREA_END vs TAGS_AREA_START
211	vst1.64	{v4,v5}, [ecx,:128]!		// write 64 tags into tempTagsArray
212	bhi	L_WK_unpack_2bits				// if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits
213
214
215	// WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray);
216
217	ldm		src_buf, {r8,r9}			// WKdm header qpos start and end
218	adr		ebx, _table_4bits
219	subs	r12, r9, r8					// r12 = (QPOS_AREA_END - QPOS_AREA_START)/4
220	add		r8, src_buf, r8, lsl #2		// QPOS_AREA_START
221	add		r9, src_buf, r9, lsl #2		// QPOS_AREA_END
222	bls		1f							// if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits
223	add		ecx, scratch, #1024			// tempQPosArray
224	vld1.64	{v0,v1},[ebx,:128]
225
226	subs	r12, r12, #1
227	bls		2f							// do loop of 2 only if w14 >= 5
228L_WK_unpack_4bits:
229	vld1.64	{d4}, [r8]!					// read a 32-bit word for 8 4-bit positions
230	subs	r12, r12, #2
231	vmov	d5, d4
232	vzip.32	d4, d5
233	vshl.u32	v2, v2, v0				// v0 = {0, -4, 0, -4}
234	vand	v2, v2, v1					// v1 = {15,15,...,15}
235	vst1.64	{q2}, [ecx,:128]!
236	bhi		L_WK_unpack_4bits
2372:
238	adds	r12, r12, #1
239	ble	1f
240
241	ldr		r12, [r8], #4				// read a 32-bit word for 8 4-bit positions
242	vdup.32	d4, r12						// duplicate to 2 elements
243	vshl.u32	v2, v2, v0				// v0 = {0, -4}
244	vand	v2, v2, v1					// v1 = {15,15,...,15}
245	vst1.64	{d4}, [ecx,:64]!			// write 16 tags into tempTagsArray
246
2471:
248
249	// WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray);
250
251	ldr		eax, [src_buf,#8]			// LOW_BITS_AREA_END offset
252	add		r8, src_buf, eax, lsl #2	// LOW_BITS_AREA_END
253	cmp		r8, r9						// LOW_BITS_AREA_START vs LOW_BITS_AREA_END
254	add		ecx, scratch, #2048			// tempLowBitsArray
255	add		edx, scratch, #4096			// last tenbits
256	bls		1f							// if START>=END, skip L_WK_unpack_3_tenbits
257
258	adr		ebx, _table_10bits
259	vld1.64	{v0,v1},[ebx,:128]
260
261	mov		r11, #0x03ff
262L_WK_unpack_3_tenbits:
263	ldr		r12, [r9], #4				// read a 32-bit word for 3 low 10-bits
264	and		lr, r11, r12
265	strh	lr, [ecx], #2
266	cmp		ecx, edx
267	and		lr, r11, r12, lsr #10
268	beq		1f
269	strh	lr, [ecx], #2
270	and		lr, r11, r12, lsr #20
271	strh	lr, [ecx], #2
272
273	cmp		r8, r9						// LOW_BITS_AREA_START vs LOW_BITS_AREA_END
274	bhi		L_WK_unpack_3_tenbits		// repeat loop if LOW_BITS_AREA_END > next_word
275
2761:
277	/*
278		set up before going to the main decompress loop
279	*/
280
281	mov		next_tag, scratch			// tempTagsArray
282	add		r8, scratch, #1024			// next_qpos
283	add		r11, scratch, #2048			// tempLowBitsArray
284#if defined(KERNEL) && !SLIDABLE
285    adr     hashTable, L_table
286    ldr     hashTable, [hashTable]
287#else
288    ldr     hashTable, L_table
289L_table0:
290    ldr     hashTable, [pc, hashTable]
291#endif
292	mov		tags_counter, #1024			// tags_counter
293
294	b		L_next
295
296	.align 4,0x90
297L_ZERO_TAG:
298	/*
299		we can only get here if w9 = 0, meaning this is a zero tag
300		*dest_buf++ = 0;
301	*/
302	subs	tags_counter,tags_counter,#1	// tags_counter--
303	str		r9, [dest_buf], #4				// *dest_buf++ = 0
304	ble		L_done							// if next_tag >= tag_area_end, we're done
305
306	/* WKdm decompress main loop */
307L_next:
308	ldrb	r9, [next_tag], #1				// new tag
309	cmp		r9, #0
310	beq		L_ZERO_TAG
311	cmp		r9, #2							// partial match tag ?
312	beq		L_MISS_TAG
313	bgt		L_EXACT_TAG
314
315L_PARTIAL_TAG:
316	/*
317			this is a partial match:
318				dict_word = dictionary[*dict_index];
319				dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++;
320	*/
321	ldrb	edx, [r8], #1					// qpos = *next_qpos++
322	ldrh	ecx, [r11], #2					// lower 10-bits from *next_low_bits++
323	ldr		eax, [dictionary, edx, lsl #2]	// read dictionary word
324	subs	tags_counter,tags_counter,#1	// tags_counter--
325	lsr		eax, eax, #10					// clear lower 10 bits
326	orr		eax, ecx, eax, lsl #10			// pad the lower 10-bits from *next_low_bits
327	str		eax, [dictionary,edx,lsl #2]	// *dict_location = newly formed word
328	str		eax, [dest_buf], #4				// *dest_buf++ = newly formed word
329	bgt		L_next							// repeat loop until next_tag==tag_area_end
330
331L_done:
332
333	add		sp, sp, #64			// deallocate for dictionary
334
335	// release stack memory, restore registers, and return
336#if KERNEL
337	vld1.64	{q0,q1},[sp]!
338	vld1.64	{q2,q3},[sp]!
339	vld1.64	{q4,q5},[sp]!
340#endif
341	pop		{r4-r6,r8-r11}
342	pop		{r7,pc}
343
344	.align 4,0x90
345L_MISS_TAG:
346	/*
347		this is a dictionary miss.
348			x = *new_word++;
349			k = (x>>10)&255;
350			k = hashTable[k];
351			dictionary[k] = *dest_buf++ = x;
352	*/
353	subs	tags_counter,tags_counter,#1	// tags_counter--
354	ldr		eax, [r10], #4					// w = *next_full_patt++
355	lsr		edx, eax, #10					// w>>10
356	str		eax, [dest_buf], #4				// *dest_buf++ = word
357	and		edx, edx, #0x0ff				// 8-bit hash table index
358	ldrb	edx, [ebx, edx]					// qpos
359	str		eax, [dictionary,edx]			// dictionary[qpos] = word
360	bgt		L_next							// repeat the loop
361	b		L_done							// if next_tag >= tag_area_end, we're done
362
363	.align 4,0x90
364
365L_EXACT_TAG:
366	/*
367			this is an exact match;
368			*dest_buf++ = dictionary[*dict_index++];
369	*/
370
371	ldrb	eax, [r8], #1					// qpos = *next_qpos++
372	subs	tags_counter,tags_counter,#1	// tags_counter--
373	ldr		eax, [dictionary,eax,lsl #2]	// w = dictionary[qpos]
374	str		eax, [dest_buf], #4				// *dest_buf++ = w
375	bgt		L_next							// repeat the loop
376	b		L_done							// if next_tag >= tag_area_end, we're done
377
378
379	.align 4
380
381_table_2bits:
382	.word	0
383	.word	-2
384	.word	-4
385	.word	-6
386	.word	0x03030303
387	.word	0x03030303
388	.word	0x03030303
389	.word	0x03030303
390
391_table_4bits:
392	.word	0
393	.word	-4
394	.word	0
395	.word	-4
396	.word	0x0f0f0f0f
397	.word	0x0f0f0f0f
398	.word	0x0f0f0f0f
399	.word	0x0f0f0f0f
400
401_table_10bits:
402	.word	0
403	.word	-10
404	.word	-20
405	.word	0
406	.word	1023
407	.word	1023
408	.word	1023
409	.word	0
410
411
412#if defined(KERNEL) && !SLIDABLE
413    .align  2
414L_table:
415    .long   _hashLookupTable_new
416#else
417    .align  2
418L_table:
419    .long   L_Tab$non_lazy_ptr-(L_table0+8)
420
421     .section    __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
422    .align  2
423L_Tab$non_lazy_ptr:
424    .indirect_symbol    _hashLookupTable_new
425    .long   0
426#endif
427
428