xref: /xnu-10002.61.3/osfmk/vm/lz4.c (revision 0f4c859e951fba394238ab619495c4e1d54d0f34)
1 /*
2  * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 // LZ4_RAW buffer API
30 // EB May 2015
31 // Mar 2016 Imported from the Compression project, with minor optimisations and
32 // early abort detection (Derek Kumar)
33 
34 #include <string.h>
35 #include "lz4.h"
36 
// Decode an LZ4 raw payload of SRC_SIZE bytes at SRC_BUFFER into DST_BUFFER
// (capacity DST_SIZE bytes). WORK is not used.
// Returns the number of bytes written to DST_BUFFER, or 0 if a decoder
// reported failure.
size_t
lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
    const uint8_t * __restrict src_buffer, size_t src_size,
    void * __restrict work __attribute__((unused)))
{
	// Cursors shared by both decoding passes; each pass advances them.
	uint8_t * out = dst_buffer;
	const uint8_t * in = src_buffer;

#if LZ4_ENABLE_ASSEMBLY_DECODE
	// Fast pass: only attempted when both buffers are larger than the safety
	// margin, and its limits are pulled in by that margin on both sides.
	if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) {
		uint8_t * const fast_out_limit = dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
		const uint8_t * const fast_in_limit = src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN;

		if (lz4_decode_asm(&out, dst_buffer, fast_out_limit, &in, fast_in_limit) != 0) {
			return 0; // FAIL
		}
	}
#endif
//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?

	// Safe pass: picks up wherever the cursors are and runs to the true
	// ends of both buffers.
	uint8_t * const out_limit = dst_buffer + dst_size;
	const uint8_t * const in_limit = src_buffer + src_size;

	if (lz4_decode(&out, dst_buffer, out_limit, &in, in_limit) != 0) {
		return 0; // FAIL
	}

	return (size_t)(out - dst_buffer); // bytes produced
}
// Debug switches: when LZ4DEBUG evaluates true, both error-reporting
// flags are defined.
#if LZ4DEBUG
#  define DEBUG_LZ4_DECODE_ERRORS (1)
#  define DEBUG_LZ4_ENCODE_ERRORS (1)
#endif

#if DEBUG_LZ4_ENCODE_ERRORS
// Intentionally empty: nothing is compiled under this flag here.
#endif
69 
70 #if !LZ4_ENABLE_ASSEMBLY_ENCODE
71 
// Number of bytes compared per step when extending a match:
// 32 on x86_64 targets, 8 everywhere else.
#if !defined(__x86_64__) && !defined(__x86_64h__)
#  define LZ4_MATCH_SEARCH_INIT_SIZE 8
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 8
#else
#  define LZ4_MATCH_SEARCH_INIT_SIZE 32
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 32
#endif
79 
80 // Return hash for 4-byte sequence X
81 static inline uint32_t
lz4_hash(uint32_t x)82 lz4_hash(uint32_t x)
83 {
84 	return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS);
85 }
86 
// Store 0xfff..fff at *PTR: two store8 calls of -1, at PTR and at PTR + 8.
static inline void
lz4_fill16(uint8_t * ptr)
{
	uint8_t * const low_half = ptr;
	uint8_t * const high_half = ptr + 8;

	store8(low_half, -1);
	store8(high_half, -1);
}
94 
// Return number of matching bytes 0..4 at positions A and B.
// The two words are XORed; a zero result means all four bytes agree,
// otherwise the count of trailing zero bits divided by 8 is returned.
// NOTE(review): that count equals the number of matching leading bytes only
// if load4 yields a little-endian word - confirm against load4's definition.
static inline size_t
lz4_nmatch4(const uint8_t * a, const uint8_t * b)
{
	const uint32_t diff = load4(a) ^ load4(b);

	if (diff == 0) {
		return 4;
	}
	return __builtin_ctzl(diff) >> 3;
}
102 
// Return number of matching bytes 0..8 at positions A and B.
// Same scheme as the 4-byte variant, on 64-bit words: XOR, then either 8
// (no difference) or trailing-zero-bit count divided by 8.
// NOTE(review): assumes load8 yields a little-endian word - confirm.
static inline size_t
lz4_nmatch8(const uint8_t * a, const uint8_t * b)
{
	const uint64_t diff = load8(a) ^ load8(b);

	if (diff == 0) {
		return 8;
	}
	return __builtin_ctzll(diff) >> 3;
}
110 
// Return number of matching bytes 0..16 at positions A and B.
// The second 8-byte half is only examined when the first half matches fully.
static inline size_t
lz4_nmatch16(const uint8_t * a, const uint8_t * b)
{
	const size_t first_half = lz4_nmatch8(a, b);

	if (first_half != 8) {
		return first_half;
	}
	return 8 + lz4_nmatch8(a + 8, b + 8);
}
118 
// Return number of matching bytes 0..32 at positions A and B.
// The second 16-byte half is only examined when the first half matches fully.
static inline size_t
lz4_nmatch32(const uint8_t * a, const uint8_t * b)
{
	const size_t first_half = lz4_nmatch16(a, b);

	if (first_half != 16) {
		return first_half;
	}
	return 16 + lz4_nmatch16(a + 16, b + 16);
}
126 
// Return number of matching bytes 0..64 at positions A and B.
// The second 32-byte half is only examined when the first half matches fully.
static inline size_t
lz4_nmatch64(const uint8_t * a, const uint8_t * b)
{
	const size_t first_half = lz4_nmatch32(a, b);

	if (first_half != 32) {
		return first_half;
	}
	return 32 + lz4_nmatch32(a + 32, b + 32);
}
134 
// Compile-time selection, return number of matching bytes 0..N at positions A and B.
// N must be one of 4, 8, 16, 32 or 64; any other value traps.
static inline size_t
lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
{
	if (N == 4) {
		return lz4_nmatch4(a, b);
	}
	if (N == 8) {
		return lz4_nmatch8(a, b);
	}
	if (N == 16) {
		return lz4_nmatch16(a, b);
	}
	if (N == 32) {
		return lz4_nmatch32(a, b);
	}
	if (N == 64) {
		return lz4_nmatch64(a, b);
	}
	__builtin_trap(); // FAIL: unsupported width
}
148 
// Store LENGTH in DST using the literal_length/match_length extension scheme:
// the value is written as a run of 0xff bytes followed by one byte < 0xff,
// and equals the sum of all those bytes.
//
// END is the limit of the output region. We are allowed to access a constant
// number of bytes above END, so the final 16-byte fill and the terminating
// byte may land slightly past it, but the 16-byte-per-iteration run for long
// lengths is never started at or beyond END.
//
// Return incremented DST pointer on success, and 0 (NULL) when the length
// does not fit before END.
static inline uint8_t *
lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L)
{
	// Each iteration emits 16 bytes of 0xff, accounting for 16 * 255 of L.
	// The loop stops while at most 16 more 0xff bytes are still needed, so
	// the tail below can be served by one final fill.
	while (L >= 17 * 255) {
		// Without this check a large L would keep writing arbitrarily
		// far past END before any caller had a chance to notice.
		if (dst >= end) {
			return NULL;
		}
		lz4_fill16(dst);
		dst += 16;
		L -= 16 * 255;
	}
	// Here L < 17 * 255, hence L / 255 <= 16: one fill provides every
	// remaining 0xff byte, and the terminator overwrites the next position.
	lz4_fill16(dst);
	//DRKTODO verify these modulos/divisions are optimally handled by clang
	dst += L / 255;
	*dst++ = L % 255;
	return dst;
}
167 
// Return X limited to at most MAX (i.e. the smaller of the two values).
static inline __attribute__((overloadable)) uint32_t
clamp(uint32_t x, uint32_t max)
{
	if (x > max) {
		return max;
	}
	return x;
}
174 
// Copy an L-byte literal from SRC to DST in fixed-size chunks and return
// DST + L. One copy16 is always issued, followed by copy32 steps until the
// write cursor reaches DST + L, so the chunked copies can touch bytes beyond
// L in both buffers; callers must leave room for that.
static inline uint8_t *
copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L)
{
	uint8_t * const literal_end = dst + L;

	// Unconditional first 16 bytes.
	copy16(dst, src);

	uint8_t *out = dst + 16;
	const uint8_t *in = src + 16;

	// Remaining bytes, 32 at a time.
	while (out < literal_end) {
		copy32(out, in);
		out += 32;
		in += 32;
	}

	return literal_end;
}
185 
// Emit one LZ4 sequence at DST: a command byte, optional extra literal-length
// bytes, L literal bytes copied from SRC, the 2-byte match distance D, and
// optional extra match-length bytes.
//   L   - literal length
//   M   - match length (must be >= 4; M - 4 is what gets encoded)
//   D   - match distance (must fit in two bytes)
//   end - output limit passed through to lz4_store_length and used to
//         reject literals that would not fit
// Returns the advanced DST pointer, or NULL when the sequence does not fit.
static uint8_t *
lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
    uint8_t * restrict dst,
    const uint8_t * const end,
    const uint8_t * restrict src)
{
	//  Debug-build check: the stored match length is M - 4, so M below 4
	//  cannot be represented.
	assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
	M -= 4;
	//  Debug-build check: only two bytes are available for the distance.
	assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");

	//  Command byte: high nibble is L capped at 15, low nibble is M capped at 15.
	const uint32_t literal_nibble = clamp(L, 15);
	const uint32_t match_nibble = clamp(M, 15);
	*dst++ = literal_nibble << 4 | match_nibble;

	//  A saturated literal nibble is followed by the excess length.
	if (L >= 15) {
		dst = lz4_store_length(dst, end, L - 15);
		if (dst == 0) {
			return NULL;
		}
		if (dst + L >= end) {
			return NULL;
		}
	}

	//  Literal bytes, then the match distance.
	dst = copy_literal(dst, src, L);
	store2(dst, D);
	dst += 2;

	//  Short matches are fully described by the command byte.
	if (M < 15) {
		return dst;
	}

	//  A saturated match nibble is followed by the excess length.
	dst = lz4_store_length(dst, end, M - 15);
	if (dst == 0) {
		return NULL;
	}
	return dst;
}
224 
225 /* #ifndef LZ4_EARLY_ABORT */
226 /* #define LZ4_EARLY_ABORT (1) */
227 /* #endif */
228 
#if LZ4_EARLY_ABORT
// Source position that must be reached before the early-abort check runs.
#define LZ4_EARLY_ABORT_EVAL (448)
#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)

// Nonzero enables the early-abort check in the encoder.
int lz4_do_early_abort = 1;
// Counts how many times the encoder has aborted early.
int lz4_early_aborts = 0;
#endif /* LZ4_EARLY_ABORT */
235 
236 void
lz4_encode_2gb(uint8_t ** dst_ptr,size_t dst_size,const uint8_t ** src_ptr,const uint8_t * src_begin,size_t src_size,lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],int skip_final_literals)237 lz4_encode_2gb(uint8_t ** dst_ptr,
238     size_t dst_size,
239     const uint8_t ** src_ptr,
240     const uint8_t * src_begin,
241     size_t src_size,
242     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
243     int skip_final_literals)
244 {
245 	uint8_t *dst = *dst_ptr;  // current output stream position
246 	uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
247 	const uint8_t *src = *src_ptr; // current input stream literal to encode
248 	const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
249 	const uint8_t *match_begin = 0; // first byte of matched sequence
250 	const uint8_t *match_end = 0; // first byte after matched sequence
251 #if LZ4_EARLY_ABORT
252 	uint8_t * const dst_begin = dst;
253 	uint32_t lz4_do_abort_eval = lz4_do_early_abort;
254 #endif
255 
256 	while (dst < end) {
257 		ptrdiff_t match_distance = 0;
258 		for (match_begin = src; match_begin < src_end; match_begin += 1) {
259 			const uint32_t pos = (uint32_t)(match_begin - src_begin);
260 			const uint32_t w0 = load4(match_begin);
261 			const uint32_t w1 = load4(match_begin + 1);
262 			const uint32_t w2 = load4(match_begin + 2);
263 			const uint32_t w3 = load4(match_begin + 3);
264 			const int i0 = lz4_hash(w0);
265 			const int i1 = lz4_hash(w1);
266 			const int i2 = lz4_hash(w2);
267 			const int i3 = lz4_hash(w3);
268 			const uint8_t *c0 = src_begin + hash_table[i0].offset;
269 			const uint8_t *c1 = src_begin + hash_table[i1].offset;
270 			const uint8_t *c2 = src_begin + hash_table[i2].offset;
271 			const uint8_t *c3 = src_begin + hash_table[i3].offset;
272 			const uint32_t m0 = hash_table[i0].word;
273 			const uint32_t m1 = hash_table[i1].word;
274 			const uint32_t m2 = hash_table[i2].word;
275 			const uint32_t m3 = hash_table[i3].word;
276 			hash_table[i0].offset = pos;
277 			hash_table[i0].word = w0;
278 			hash_table[i1].offset = pos + 1;
279 			hash_table[i1].word = w1;
280 
281 			hash_table[i2].offset = pos + 2;
282 			hash_table[i2].word = w2;
283 			hash_table[i3].offset = pos + 3;
284 			hash_table[i3].word = w3;
285 
286 			match_distance = (match_begin - c0);
287 			if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
288 				match_end = match_begin + 4;
289 				goto EXPAND_FORWARD;
290 			}
291 
292 			match_begin++;
293 			match_distance = (match_begin - c1);
294 			if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
295 				match_end = match_begin + 4;
296 				goto EXPAND_FORWARD;
297 			}
298 
299 			match_begin++;
300 			match_distance = (match_begin - c2);
301 			if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
302 				match_end = match_begin + 4;
303 				goto EXPAND_FORWARD;
304 			}
305 
306 			match_begin++;
307 			match_distance = (match_begin - c3);
308 			if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
309 				match_end = match_begin + 4;
310 				goto EXPAND_FORWARD;
311 			}
312 
313 #if LZ4_EARLY_ABORT
314 			//DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
315 			if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
316 				ptrdiff_t dstd = dst - dst_begin;
317 
318 				if (dstd == 0) {
319 					lz4_early_aborts++;
320 					return;
321 				}
322 
323 /*            if (dstd >= pos) { */
324 /*                    return; */
325 /*            } */
326 /*            ptrdiff_t cbytes = pos - dstd; */
327 /*            if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos)  { */
328 /*                    return; */
329 /*            } */
330 				lz4_do_abort_eval = 0;
331 			}
332 #endif
333 		}
334 
335 		if (skip_final_literals) {
336 			*src_ptr = src; *dst_ptr = dst; return;
337 		}                                                        // do not emit the final literal sequence
338 
339 		//  Emit a trailing literal that covers the remainder of the source buffer,
340 		//  if we can do so without exceeding the bounds of the destination buffer.
341 		size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
342 		if (src_remaining < 15) {
343 			*dst++ = (uint8_t)(src_remaining << 4);
344 			memcpy(dst, src, 16); dst += src_remaining;
345 		} else {
346 			*dst++ = 0xf0;
347 			dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
348 			if (dst == 0 || dst + src_remaining >= end) {
349 				return;
350 			}
351 			memcpy(dst, src, src_remaining); dst += src_remaining;
352 		}
353 		*dst_ptr = dst;
354 		*src_ptr = src + src_remaining;
355 		return;
356 
357 EXPAND_FORWARD:
358 
359 		// Expand match forward
360 		{
361 			const uint8_t * ref_end = match_end - match_distance;
362 			while (match_end < src_end) {
363 				size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
364 				if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) {
365 					match_end += n; break;
366 				}
367 				match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
368 				ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
369 			}
370 		}
371 
372 		// Expand match backward
373 		{
374 			// match_begin_min = max(src_begin + match_distance,literal)
375 			const uint8_t * match_begin_min = src_begin + match_distance;
376 			match_begin_min = (match_begin_min < src)?src:match_begin_min;
377 			const uint8_t * ref_begin = match_begin - match_distance;
378 
379 			while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) {
380 				match_begin -= 1; ref_begin -= 1;
381 			}
382 		}
383 
384 		// Emit match
385 		dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
386 		if (!dst) {
387 			return;
388 		}
389 
390 		// Update state
391 		src = match_end;
392 
393 		// Update return values to include the last fully encoded match
394 		*dst_ptr = dst;
395 		*src_ptr = src;
396 	}
397 }
398 
399 #endif
400 
401 size_t
lz4raw_encode_buffer(uint8_t * __restrict dst_buffer,size_t dst_size,const uint8_t * __restrict src_buffer,size_t src_size,lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])402 lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
403     const uint8_t * __restrict src_buffer, size_t src_size,
404     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
405 {
406 	//  Initialize hash table
407 	const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
408 
409 	const uint8_t * src = src_buffer;
410 	uint8_t * dst = dst_buffer;
411 
412 	// We need several blocks because our base function is limited to 2GB input
413 	const size_t BLOCK_SIZE = 0x7ffff000;
414 	while (src_size > 0) {
415 		//DRKTODO either implement pattern4 or figure out optimal unroll
416 		//DRKTODO: bizarrely, with plain O3 the compiler generates a single
417 		//DRKTODO: scalar STP per loop iteration with the stock loop
418 		//DRKTODO If hand unrolled, it switches to NEON store pairs
419 		// Reset hash table for each block
420 /* #if __STDC_HOSTED__ */
421 /*     memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
422 /* #else */
423 /*     for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
424 /* #endif */
425 
426 		for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES;) {
427 			hash_table[i++] = HASH_FILL;
428 			hash_table[i++] = HASH_FILL;
429 			hash_table[i++] = HASH_FILL;
430 			hash_table[i++] = HASH_FILL;
431 		}
432 
433 		// Bytes to encode in this block
434 		const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
435 
436 		// Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
437 		// Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
438 		uint8_t * dst_start = dst;
439 		const uint8_t * src_start = src;
440 		lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
441 
442 		// Check progress
443 		size_t dst_used = dst - dst_start;
444 		size_t src_used = src - src_start; // src_used <= src_to_encode
445 		if (src_to_encode == src_size && src_used < src_to_encode) {
446 			return 0;                                        // FAIL to encode last block
447 		}
448 		// Note that there is a potential problem here in case of non compressible data requiring more blocks.
449 		// We may end up here with src_used very small, or even 0, and will not be able to make progress during
450 		// compression. We FAIL unless the length of literals remaining at the end is small enough.
451 		if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) {
452 			return 0;                                                  // FAIL too many literals
453 		}
454 		// Update counters (SRC and DST already have been updated)
455 		src_size -= src_used;
456 		dst_size -= dst_used;
457 	}
458 
459 	return (size_t)(dst - dst_buffer); // bytes produced
460 }
461 
// Vector of four uint32_t lanes whose required alignment is relaxed to 1 byte.
typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4), __aligned__(1)));
463 
// Decode LZ4 sequences from [*src_ptr, src_end) into [*dst_ptr, dst_end).
//
//   dst_ptr   - in/out: output cursor
//   dst_begin - lowest address a match may refer back to
//   dst_end   - end of the output buffer
//   src_ptr   - in/out: input cursor
//   src_end   - end of the input buffer
//
// Returns 0 when the input was consumed or the output buffer became full;
// both cursors are then updated to where decoding stopped (a literal or
// match that does not fit is copied partially). Returns 1 on malformed or
// truncated input; the cursors are then left at the start of the sequence
// that failed.
int
lz4_decode(uint8_t ** dst_ptr,
    uint8_t * dst_begin,
    uint8_t * dst_end,
    const uint8_t ** src_ptr,
    const uint8_t * src_end)
{
	uint8_t * dst = *dst_ptr;
	const uint8_t * src = *src_ptr;

	//  Require dst_end > dst.
	if (dst_end <= dst) {
		goto OUT_FULL;
	}

	while (src < src_end) {
		// Keep last good position
		*src_ptr = src;
		*dst_ptr = dst;

		uint8_t cmd = *src++;                                        // 1 byte encoding literal+(match-4) length: LLLLMMMM
		uint32_t literalLength = (cmd >> 4) & 15; // 0..15
		uint32_t matchLength = 4 + (cmd & 15); // 4..19

		// extra bytes for literalLength: add bytes until one is below 255
		if (__improbable(literalLength == 15)) {
			uint8_t s;
			do {
				if (__improbable(src >= src_end)) {
#if DEBUG_LZ4_DECODE_ERRORS
					printf("Truncated SRC literal length\n");
#endif
					goto IN_FAIL;                   // unexpected end of input (1 byte needed)
				}
				s = *src++;
				literalLength += s;
			} while (__improbable(s == 255));
		}

		// copy literal
		if (__improbable(literalLength > (size_t)(src_end - src))) {
#if DEBUG_LZ4_DECODE_ERRORS
			printf("Truncated SRC literal\n");
#endif
			goto IN_FAIL;
		}
		if (__improbable(literalLength > (size_t)(dst_end - dst))) {
			//  literal will take us past the end of the destination buffer,
			//  so we can only copy part of it.
			literalLength = (uint32_t)(dst_end - dst);
			memcpy(dst, src, literalLength);
			dst += literalLength;
			goto OUT_FULL;
		}
		memcpy(dst, src, literalLength);
		src += literalLength;
		dst += literalLength;

		if (__improbable(src >= src_end)) {
			goto OUT_FULL;                                  // valid end of stream
		}
		if (__improbable(2 > (size_t)(src_end - src))) {
#if DEBUG_LZ4_DECODE_ERRORS
			printf("Truncated SRC distance\n");
#endif
			goto IN_FAIL;                                   // unexpected end of input (2 bytes needed)
		}

		// match distance: SRC has no alignment guarantee, so the two bytes
		// are fetched with memcpy rather than through a uint16_t pointer
		// (same value, no misaligned or type-punned access).
		uint16_t storedDistance;
		memcpy(&storedDistance, src, sizeof(storedDistance));
		uint64_t matchDistance = storedDistance;                     // 0x0000 <= matchDistance <= 0xffff
		src += 2;
		if (__improbable(matchDistance == 0)) {
#if DEBUG_LZ4_DECODE_ERRORS
			printf("Invalid match distance D = 0\n");
#endif
			goto IN_FAIL;                                            // 0x0000 invalid
		}
		// Validate the distance against the bytes produced so far before
		// forming the reference pointer, so that no pointer below
		// dst_begin is ever computed.
		if (__improbable((ptrdiff_t)matchDistance > dst - dst_begin)) {
#if DEBUG_LZ4_DECODE_ERRORS
			printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance, dst_begin, dst, dst_end);
#endif
			goto OUT_FAIL;                                           // out of range
		}
		uint8_t * ref = dst - matchDistance;

		// extra bytes for matchLength: add bytes until one is below 255
		if (__improbable(matchLength == 19)) {
			uint8_t s;
			do {
				if (__improbable(src >= src_end)) {
#if DEBUG_LZ4_DECODE_ERRORS
					printf("Truncated SRC match length\n");
#endif
					goto IN_FAIL;                                // unexpected end of input (1 byte needed)
				}
				s = *src++;
				matchLength += s;
			} while (__improbable(s == 255));
		}

		// copy match (may overlap): done byte by byte in ascending order so
		// that bytes written earlier in this copy can be read back by it.
		if (__improbable(matchLength > (size_t)(dst_end - dst))) {
			//  match will take us past the end of the destination buffer,
			//  so we can only copy part of it.
			matchLength = (uint32_t)(dst_end - dst);
			for (uint32_t i = 0; i < matchLength; ++i) {
				dst[i] = ref[i];
			}
			dst += matchLength;
			goto OUT_FULL;
		}
		for (uint64_t i = 0; i < matchLength; i++) {
			dst[i] = ref[i];
		}
		dst += matchLength;
	}

	//  We reached the end of the input buffer after a full instruction
OUT_FULL:
	//  Or we reached the end of the output buffer
	*dst_ptr = dst;
	*src_ptr = src;
	return 0;

	//  Error conditions
OUT_FAIL:
IN_FAIL:
	return 1; // FAIL
}
609