/* * Copyright (c) 2016-2016 Apple Inc. All rights reserved. * * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ * * This file contains Original Code and/or Modifications of Original Code * as defined in and that are subject to the Apple Public Source License * Version 2.0 (the 'License'). You may not use this file except in * compliance with the License. The rights granted to you under the License * may not be used to create, or enable the creation or redistribution of, * unlawful or unlicensed copies of an Apple operating system, or to * circumvent, violate, or enable the circumvention or violation of, any * terms of an Apple operating system software license agreement. * * Please obtain a copy of the License at * http://www.opensource.apple.com/apsl/ and read it before using this file. * * The Original Code and all software distributed under the License are * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. * Please see the License for the specific language governing rights and * limitations under the License. * * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ */ // LZ4_RAW buffer API // EB May 2015 // Mar 2016 Imported from the Compression project, with minor optimisations and // early abort detection (Derek Kumar) #include "lz4.h" #define memcpy __builtin_memcpy size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, const uint8_t * __restrict src_buffer, size_t src_size, void * __restrict work __attribute__((unused))) { const uint8_t * src = src_buffer; uint8_t * dst = dst_buffer; // Go fast if we can, keeping away from the end of buffers #if LZ4_ENABLE_ASSEMBLY_DECODE if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) { if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN)) { return 0; // FAIL } } #endif //DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers? // Finish safe if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size)) { return 0; // FAIL } return (size_t)(dst - dst_buffer); // bytes produced } // Debug flags #if LZ4DEBUG #define DEBUG_LZ4_ENCODE_ERRORS (1) #define DEBUG_LZ4_DECODE_ERRORS (1) #endif #if DEBUG_LZ4_ENCODE_ERRORS #endif #if !LZ4_ENABLE_ASSEMBLY_ENCODE #if defined(__x86_64__) || defined(__x86_64h__) # define LZ4_MATCH_SEARCH_INIT_SIZE 32 # define LZ4_MATCH_SEARCH_LOOP_SIZE 32 #else # define LZ4_MATCH_SEARCH_INIT_SIZE 8 # define LZ4_MATCH_SEARCH_LOOP_SIZE 8 #endif // Return hash for 4-byte sequence X static inline uint32_t lz4_hash(uint32_t x) { return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); } // Store 0xfff..fff at *PTR static inline void lz4_fill16(uint8_t * ptr) { store8(ptr, -1); store8(ptr + 8, -1); } // Return number of matching bytes 0..4 at positions A and B. static inline size_t lz4_nmatch4(const uint8_t * a, const uint8_t * b) { uint32_t x = load4(a) ^ load4(b); return (x == 0)?4:(__builtin_ctzl(x) >> 3); } // Return number of matching bytes 0..8 at positions A and B. static inline size_t lz4_nmatch8(const uint8_t * a, const uint8_t * b) { uint64_t x = load8(a) ^ load8(b); return (x == 0)?8:(__builtin_ctzll(x) >> 3); } // Return number of matching bytes 0..16 at positions A and B. static inline size_t lz4_nmatch16(const uint8_t * a, const uint8_t * b) { size_t n = lz4_nmatch8(a, b); return (n == 8)?(8 + lz4_nmatch8(a + 8, b + 8)):n; } // Return number of matching bytes 0..32 at positions A and B. static inline size_t lz4_nmatch32(const uint8_t * a, const uint8_t * b) { size_t n = lz4_nmatch16(a, b); return (n == 16)?(16 + lz4_nmatch16(a + 16, b + 16)):n; } // Return number of matching bytes 0..64 at positions A and B. static inline size_t lz4_nmatch64(const uint8_t * a, const uint8_t * b) { size_t n = lz4_nmatch32(a, b); return (n == 32)?(32 + lz4_nmatch32(a + 32, b + 32)):n; } // Compile-time selection, return number of matching bytes 0..N at positions A and B. static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b) { switch (N) { case 4: return lz4_nmatch4(a, b); case 8: return lz4_nmatch8(a, b); case 16: return lz4_nmatch16(a, b); case 32: return lz4_nmatch32(a, b); case 64: return lz4_nmatch64(a, b); } __builtin_trap(); // FAIL } // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff. // We are allowed to access a constant number of bytes above DST_END. // Return incremented DST pointer on success, and 0 on failure static inline uint8_t * lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L) { (void)end; while (L >= 17 * 255) { lz4_fill16(dst); dst += 16; L -= 16 * 255; } lz4_fill16(dst); //DRKTODO verify these modulos/divisions are optimally handled by clang dst += L / 255; *dst++ = L % 255; return dst; } static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; } static inline uint8_t * copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L) { uint8_t *end = dst + L; { copy16(dst, src); dst += 16; src += 16; } while (dst < end) { copy32(dst, src); dst += 32; src += 32; } return end; } static uint8_t * lz4_emit_match(uint32_t L, uint32_t M, uint32_t D, uint8_t * restrict dst, const uint8_t * const end, const uint8_t * restrict src) { // The LZ4 encoding scheme requires that M is at least 4, because // the actual value stored by the encoding is M - 4. Check this // requirement for debug builds. assert(M >= 4 && "LZ4 encoding requires that M is at least 4"); // Having checked that M >= 4, translate M by four. M -= 4; // Similarly, we must have D < 2**16, because we use only two bytes // to represent the value of D in the encoding. assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes."); // Construct the command byte by clamping both L and M to 0 ... 15 // and packing them into a single byte, and store it. *dst++ = clamp(L, 15) << 4 | clamp(M, 15); // If L is 15 or greater, we need to encode extra literal length bytes. if (L >= 15) { dst = lz4_store_length(dst, end, L - 15); if (dst == 0 || dst + L >= end) { return NULL; } } // Copy the literal itself from src to dst. dst = copy_literal(dst, src, L); // Store match distance. store2(dst, D); dst += 2; // If M is 15 or greater, we need to encode extra match length bytes. if (M >= 15) { dst = lz4_store_length(dst, end, M - 15); if (dst == 0) { return NULL; } } return dst; } /* #ifndef LZ4_EARLY_ABORT */ /* #define LZ4_EARLY_ABORT (1) */ /* #endif */ #if LZ4_EARLY_ABORT int lz4_do_early_abort = 1; int lz4_early_aborts = 0; #define LZ4_EARLY_ABORT_EVAL (448) #define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20) #endif /* LZ4_EARLY_ABORT */ void lz4_encode_2gb(uint8_t ** dst_ptr, size_t dst_size, const uint8_t ** src_ptr, const uint8_t * src_begin, size_t src_size, lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES], int skip_final_literals) { uint8_t *dst = *dst_ptr; // current output stream position uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN; const uint8_t *src = *src_ptr; // current input stream literal to encode const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN; const uint8_t *match_begin = 0; // first byte of matched sequence const uint8_t *match_end = 0; // first byte after matched sequence #if LZ4_EARLY_ABORT uint8_t * const dst_begin = dst; uint32_t lz4_do_abort_eval = lz4_do_early_abort; #endif while (dst < end) { ptrdiff_t match_distance = 0; for (match_begin = src; match_begin < src_end; match_begin += 1) { const uint32_t pos = (uint32_t)(match_begin - src_begin); const uint32_t w0 = load4(match_begin); const uint32_t w1 = load4(match_begin + 1); const uint32_t w2 = load4(match_begin + 2); const uint32_t w3 = load4(match_begin + 3); const int i0 = lz4_hash(w0); const int i1 = lz4_hash(w1); const int i2 = lz4_hash(w2); const int i3 = lz4_hash(w3); const uint8_t *c0 = src_begin + hash_table[i0].offset; const uint8_t *c1 = src_begin + hash_table[i1].offset; const uint8_t *c2 = src_begin + hash_table[i2].offset; const uint8_t *c3 = src_begin + hash_table[i3].offset; const uint32_t m0 = hash_table[i0].word; const uint32_t m1 = hash_table[i1].word; const uint32_t m2 = hash_table[i2].word; const uint32_t m3 = hash_table[i3].word; hash_table[i0].offset = pos; hash_table[i0].word = w0; hash_table[i1].offset = pos + 1; hash_table[i1].word = w1; hash_table[i2].offset = pos + 2; hash_table[i2].word = w2; hash_table[i3].offset = pos + 3; hash_table[i3].word = w3; match_distance = (match_begin - c0); if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) { match_end = match_begin + 4; goto EXPAND_FORWARD; } match_begin++; match_distance = (match_begin - c1); if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) { match_end = match_begin + 4; goto EXPAND_FORWARD; } match_begin++; match_distance = (match_begin - c2); if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) { match_end = match_begin + 4; goto EXPAND_FORWARD; } match_begin++; match_distance = (match_begin - c3); if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) { match_end = match_begin + 4; goto EXPAND_FORWARD; } #if LZ4_EARLY_ABORT //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) { ptrdiff_t dstd = dst - dst_begin; if (dstd == 0) { lz4_early_aborts++; return; } /* if (dstd >= pos) { */ /* return; */ /* } */ /* ptrdiff_t cbytes = pos - dstd; */ /* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */ /* return; */ /* } */ lz4_do_abort_eval = 0; } #endif } if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence // Emit a trailing literal that covers the remainder of the source buffer, // if we can do so without exceeding the bounds of the destination buffer. size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src; if (src_remaining < 15) { *dst++ = (uint8_t)(src_remaining << 4); memcpy(dst, src, 16); dst += src_remaining; } else { *dst++ = 0xf0; dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15)); if (dst == 0 || dst + src_remaining >= end) { return; } memcpy(dst, src, src_remaining); dst += src_remaining; } *dst_ptr = dst; *src_ptr = src + src_remaining; return; EXPAND_FORWARD: // Expand match forward { const uint8_t * ref_end = match_end - match_distance; while (match_end < src_end) { size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end); if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; } match_end += LZ4_MATCH_SEARCH_LOOP_SIZE; ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE; } } // Expand match backward { // match_begin_min = max(src_begin + match_distance,literal) const uint8_t * match_begin_min = src_begin + match_distance; match_begin_min = (match_begin_min < src)?src:match_begin_min; const uint8_t * ref_begin = match_begin - match_distance; while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) { match_begin -= 1; ref_begin -= 1; } } // Emit match dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src); if (!dst) { return; } // Update state src = match_end; // Update return values to include the last fully encoded match *dst_ptr = dst; *src_ptr = src; } } #endif size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, const uint8_t * __restrict src_buffer, size_t src_size, lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES]) { // Initialize hash table const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 }; const uint8_t * src = src_buffer; uint8_t * dst = dst_buffer; // We need several blocks because our base function is limited to 2GB input const size_t BLOCK_SIZE = 0x7ffff000; while (src_size > 0) { //DRKTODO either implement pattern4 or figure out optimal unroll //DRKTODO: bizarrely, with plain O3 the compiler generates a single //DRKTODO: scalar STP per loop iteration with the stock loop //DRKTODO If hand unrolled, it switches to NEON store pairs // Reset hash table for each block /* #if __STDC_HOSTED__ */ /* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */ /* #else */ /* for (int i=0;i BLOCK_SIZE ? BLOCK_SIZE : src_size; // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads. // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer uint8_t * dst_start = dst; const uint8_t * src_start = src; lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size); // Check progress size_t dst_used = dst - dst_start; size_t src_used = src - src_start; // src_used <= src_to_encode if (src_to_encode == src_size && src_used < src_to_encode) { return 0; // FAIL to encode last block } // Note that there is a potential problem here in case of non compressible data requiring more blocks. // We may end up here with src_used very small, or even 0, and will not be able to make progress during // compression. We FAIL unless the length of literals remaining at the end is small enough. if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) { return 0; // FAIL too many literals } // Update counters (SRC and DST already have been updated) src_size -= src_used; dst_size -= dst_used; } return (size_t)(dst - dst_buffer); // bytes produced } typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1))); int lz4_decode(uint8_t ** dst_ptr, uint8_t * dst_begin, uint8_t * dst_end, const uint8_t ** src_ptr, const uint8_t * src_end) { uint8_t * dst = *dst_ptr; const uint8_t * src = *src_ptr; // Require dst_end > dst. if (dst_end <= dst) { goto OUT_FULL; } while (src < src_end) { // Keep last good position *src_ptr = src; *dst_ptr = dst; uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM uint32_t literalLength = (cmd >> 4) & 15; // 0..15 uint32_t matchLength = 4 + (cmd & 15); // 4..19 // extra bytes for literalLength if (__improbable(literalLength == 15)) { uint8_t s; do { #if DEBUG_LZ4_DECODE_ERRORS if (__improbable(src >= src_end)) { printf("Truncated SRC literal length\n"); } #endif if (__improbable(src >= src_end)) { goto IN_FAIL; // unexpected end of input (1 byte needed) } s = *src++; literalLength += s; } while (__improbable(s == 255)); } // copy literal #if DEBUG_LZ4_DECODE_ERRORS if (__improbable(literalLength > (size_t)(src_end - src))) { printf("Truncated SRC literal\n"); } #endif if (__improbable(literalLength > (size_t)(src_end - src))) { goto IN_FAIL; } if (__improbable(literalLength > (size_t)(dst_end - dst))) { // literal will take us past the end of the destination buffer, // so we can only copy part of it. literalLength = (uint32_t)(dst_end - dst); memcpy(dst, src, literalLength); dst += literalLength; goto OUT_FULL; } memcpy(dst, src, literalLength); src += literalLength; dst += literalLength; if (__improbable(src >= src_end)) { goto OUT_FULL; // valid end of stream } #if DEBUG_LZ4_DECODE_ERRORS if (__improbable(2 > (size_t)(src_end - src))) { printf("Truncated SRC distance\n"); } #endif if (__improbable(2 > (size_t)(src_end - src))) { goto IN_FAIL; // unexpected end of input (2 bytes needed) } //DRKTODO: this causes an alignment increase warning (legitimate?) //DRKTODO: cast of char * to uint16_t* #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wcast-align" // match distance uint64_t matchDistance = *(const uint16_t *)src; // 0x0000 <= matchDistance <= 0xffff #pragma clang diagnostic pop src += 2; #if DEBUG_LZ4_DECODE_ERRORS if (matchDistance == 0) { printf("Invalid match distance D = 0\n"); } #endif if (__improbable(matchDistance == 0)) { goto IN_FAIL; // 0x0000 invalid } uint8_t * ref = dst - matchDistance; #if DEBUG_LZ4_DECODE_ERRORS if (__improbable(ref < dst_begin)) { printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance, dst_begin, dst, dst_end); } #endif if (__improbable(ref < dst_begin)) { goto OUT_FAIL; // out of range } // extra bytes for matchLength if (__improbable(matchLength == 19)) { uint8_t s; do { #if DEBUG_LZ4_DECODE_ERRORS if (__improbable(src >= src_end)) { printf("Truncated SRC match length\n"); } #endif if (__improbable(src >= src_end)) { goto IN_FAIL; // unexpected end of input (1 byte needed) } s = *src++; matchLength += s; } while (__improbable(s == 255)); } // copy match (may overlap) if (__improbable(matchLength > (size_t)(dst_end - dst))) { // match will take us past the end of the destination buffer, // so we can only copy part of it. matchLength = (uint32_t)(dst_end - dst); for (uint32_t i = 0; i < matchLength; ++i) { dst[i] = ref[i]; } dst += matchLength; goto OUT_FULL; } for (uint64_t i = 0; i < matchLength; i++) { dst[i] = ref[i]; } dst += matchLength; } // We reached the end of the input buffer after a full instruction OUT_FULL: // Or we reached the end of the output buffer *dst_ptr = dst; *src_ptr = src; return 0; // Error conditions OUT_FAIL: IN_FAIL: return 1; // FAIL }