From: elasota <1137273+elasota@users.noreply.github.com>
---
 dlls/cabinet/Makefile.in                 |    5 +-
 dlls/cabinet/fci.c                       |  253 +-
 dlls/cabinet/liblzx.h                    |  141 +
 dlls/cabinet/liblzx_bitops.h             |  156 +
 dlls/cabinet/liblzx_bt_matchfinder.h     |  446 +++
 dlls/cabinet/liblzx_compiler.h           |  214 ++
 dlls/cabinet/liblzx_compress_common.c    |  673 ++++
 dlls/cabinet/liblzx_compress_common.h    |   19 +
 dlls/cabinet/liblzx_config.h             |   29 +
 dlls/cabinet/liblzx_endianness.h         |  136 +
 dlls/cabinet/liblzx_error.h              |   11 +
 dlls/cabinet/liblzx_hc_matchfinder.h     |  432 +++
 dlls/cabinet/liblzx_lzx_common.c         |  325 ++
 dlls/cabinet/liblzx_lzx_common.h         |   29 +
 dlls/cabinet/liblzx_lzx_compress.c       | 3662 ++++++++++++++++++++++
 dlls/cabinet/liblzx_lzx_constants.h      |  108 +
 dlls/cabinet/liblzx_matchfinder_common.h |  131 +
 dlls/cabinet/liblzx_minmax.h             |  122 +
 dlls/cabinet/liblzx_types.h              |   33 +
 dlls/cabinet/liblzx_unaligned.h          |  134 +
 dlls/cabinet/liblzx_util.h               |   20 +
 dlls/cabinet/tests/extract.c             |  240 +-
 22 files changed, 7181 insertions(+), 138 deletions(-)
 create mode 100644 dlls/cabinet/liblzx.h
 create mode 100644 dlls/cabinet/liblzx_bitops.h
 create mode 100644 dlls/cabinet/liblzx_bt_matchfinder.h
 create mode 100644 dlls/cabinet/liblzx_compiler.h
 create mode 100644 dlls/cabinet/liblzx_compress_common.c
 create mode 100644 dlls/cabinet/liblzx_compress_common.h
 create mode 100644 dlls/cabinet/liblzx_config.h
 create mode 100644 dlls/cabinet/liblzx_endianness.h
 create mode 100644 dlls/cabinet/liblzx_error.h
 create mode 100644 dlls/cabinet/liblzx_hc_matchfinder.h
 create mode 100644 dlls/cabinet/liblzx_lzx_common.c
 create mode 100644 dlls/cabinet/liblzx_lzx_common.h
 create mode 100644 dlls/cabinet/liblzx_lzx_compress.c
 create mode 100644 dlls/cabinet/liblzx_lzx_constants.h
 create mode 100644 dlls/cabinet/liblzx_matchfinder_common.h
 create mode 100644 dlls/cabinet/liblzx_minmax.h
 create mode 100644 dlls/cabinet/liblzx_types.h
 create mode 100644 dlls/cabinet/liblzx_unaligned.h
 create mode 100644 dlls/cabinet/liblzx_util.h
diff --git a/dlls/cabinet/Makefile.in b/dlls/cabinet/Makefile.in index a72e5a31be6..40db872bbfd 100644 --- a/dlls/cabinet/Makefile.in +++ b/dlls/cabinet/Makefile.in @@ -7,4 +7,7 @@ SOURCES = \ cabinet.rc \ cabinet_main.c \ fci.c \ - fdi.c + fdi.c \ + liblzx_compress_common.c \ + liblzx_lzx_common.c \ + liblzx_lzx_compress.c diff --git a/dlls/cabinet/fci.c b/dlls/cabinet/fci.c index e62399db1ba..87a29623656 100644 --- a/dlls/cabinet/fci.c +++ b/dlls/cabinet/fci.c @@ -47,6 +47,8 @@ There is still some work to be done: #include "wine/list.h" #include "wine/debug.h"
+#include "liblzx.h" + WINE_DEFAULT_DEBUG_CHANNEL(cabinet);
#ifdef WORDS_BIGENDIAN @@ -165,11 +167,13 @@ typedef struct FCI_Int char szPrevDisk[CB_MAX_DISK_NAME]; /* disk name of previous cabinet */ unsigned char data_in[CAB_BLOCKMAX]; /* uncompressed data blocks */ unsigned char data_out[2 * CAB_BLOCKMAX]; /* compressed data blocks */ + BOOL have_data_out; cab_UWORD cdata_in; ULONG cCompressedBytesInFolder; cab_UWORD cFolders; cab_UWORD cFiles; - cab_ULONG cDataBlocks; + cab_ULONG cDataBlocksIn; + cab_ULONG cDataBlocksOut; cab_ULONG cbFileRemainder; /* uncompressed, yet to be written data */ /* of spanned file of a spanning folder of a spanning cabinet */ struct temp_file data; @@ -185,6 +189,9 @@ typedef struct FCI_Int cab_ULONG folders_data_size; /* total size of data contained in the current folders */ TCOMP compression; cab_UWORD (*compress)(struct FCI_Int *); + cab_UWORD (*flush)(struct FCI_Int *); + void (*compress_shutdown)(struct FCI_Int *); + struct liblzx_compressor *lzx_compressor; } FCI_Int;
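For orientation, the three new hooks split the per-codec work as sketched below (a hypothetical helper, condensed from add_data_blocks() further down; not part of the patch): compress() consumes data_in and reports how many bytes are now valid in data_out, flush() (left NULL for NONE and MSZIP) drains whatever a streaming codec still buffers at folder end, and compress_shutdown() releases per-codec state. At folder end, add_data_blocks() keeps calling flush() until it returns 0, since more than one chunk can still be buffered.

    /* Sketch only: the intended contract of the new FCI_Int hooks. */
    static cab_UWORD produce_block( FCI_Int *fci, BOOL is_last_block )
    {
        /* May return 0 when a streaming codec (LZX) buffered the whole
         * input without completing a chunk. */
        cab_UWORD out = fci->compress( fci );

        /* At folder end, drain data the codec still holds. */
        if (!out && is_last_block && fci->flush) out = fci->flush( fci );

        return out;  /* bytes now valid in fci->data_out */
    }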
#define FCI_INT_MAGIC 0xfcfcfc05 @@ -274,7 +281,7 @@ static struct file *add_file( FCI_Int *fci, const char *filename ) return NULL; } file->size = 0; - file->offset = fci->cDataBlocks * CAB_BLOCKMAX + fci->cdata_in; + file->offset = fci->cDataBlocksIn * CAB_BLOCKMAX + fci->cdata_in; file->folder = fci->cFolders; file->date = 0; file->time = 0; @@ -305,43 +312,82 @@ static void free_file( FCI_Int *fci, struct file *file ) fci->free( file ); }
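For concreteness (hypothetical numbers): with two full blocks already queued for the folder (cDataBlocksIn = 2) and 100 bytes pending in data_in, the next file added starts at uncompressed folder offset 2 * 32768 + 100 = 65636. The rename matters because this offset must be computed from input blocks; for LZX the count of emitted CFDATA blocks (cDataBlocksOut) can lag behind it.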
-/* create a new data block for the data in fci->data_in */ -static BOOL add_data_block( FCI_Int *fci, PFNFCISTATUS status_callback ) +/* creates new data blocks for the data in fci->data_in */ +static BOOL add_data_blocks( FCI_Int *fci, BOOL is_last_block, PFNFCISTATUS status_callback ) { int err; struct data_block *block; + cab_UWORD compressed_size = 0; + cab_UWORD uncompressed_size = fci->cdata_in;
- if (!fci->cdata_in) return TRUE; + if (!uncompressed_size) + { + if (fci->cDataBlocksIn == 0 || !is_last_block) return TRUE; + }
if (fci->data.handle == -1 && !create_temp_file( fci, &fci->data )) return FALSE;
- if (!(block = fci->alloc( sizeof(*block) ))) + if (uncompressed_size) { - set_error( fci, FCIERR_ALLOC_FAIL, ERROR_NOT_ENOUGH_MEMORY ); - return FALSE; + compressed_size = fci->compress( fci ); + + fci->cdata_in = 0; + fci->cDataBlocksIn++; } - block->uncompressed = fci->cdata_in; - block->compressed = fci->compress( fci );
- if (fci->write( fci->data.handle, fci->data_out, - block->compressed, &err, fci->pv ) != block->compressed) + if (compressed_size == 0 && is_last_block && fci->flush) { - set_error( fci, FCIERR_TEMP_FILE, err ); - fci->free( block ); - return FALSE; + compressed_size = fci->flush( fci ); }
- fci->cdata_in = 0; - fci->pending_data_size += sizeof(CFDATA) + fci->ccab.cbReserveCFData + block->compressed; - fci->cCompressedBytesInFolder += block->compressed; - fci->cDataBlocks++; - list_add_tail( &fci->blocks_list, &block->entry ); - - if (status_callback( statusFile, block->compressed, block->uncompressed, fci->pv ) == -1) + while (compressed_size > 0) { - set_error( fci, FCIERR_USER_ABORT, 0 ); - return FALSE; + if (!(block = fci->alloc( sizeof(*block) ))) + { + set_error( fci, FCIERR_ALLOC_FAIL, ERROR_NOT_ENOUGH_MEMORY ); + return FALSE; + } + + if (is_last_block && fci->cDataBlocksIn - 1 == fci->cDataBlocksOut) + { + block->uncompressed = uncompressed_size; + } + else + { + block->uncompressed = CAB_BLOCKMAX; + } + + block->compressed = compressed_size; + + if (fci->write( fci->data.handle, fci->data_out, + block->compressed, &err, fci->pv ) != block->compressed) + { + set_error( fci, FCIERR_TEMP_FILE, err ); + fci->free( block ); + return FALSE; + } + + fci->pending_data_size += sizeof(CFDATA) + fci->ccab.cbReserveCFData + block->compressed; + fci->cCompressedBytesInFolder += block->compressed; + fci->cDataBlocksOut++; + list_add_tail( &fci->blocks_list, &block->entry ); + + if (status_callback( statusFile, block->compressed, block->uncompressed, fci->pv ) == -1) + { + set_error( fci, FCIERR_USER_ABORT, 0 ); + return FALSE; + } + + if (is_last_block && fci->flush) + { + compressed_size = fci->flush( fci ); + } + else + { + compressed_size = 0; + } } + return TRUE; }
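The loop above replaces the old one-in/one-out assumption. A self-contained toy trace (an illustrative lag model only, not the patch's code) shows why input and output block counts must be tracked separately once a streaming codec sits in between:

    #include <stdio.h>

    #define CHUNK 32768u  /* CAB_BLOCKMAX == chunk_granularity here */

    /* Pretend the codec seals a chunk only once data beyond it arrives
     * (or at end of input), as a streaming compressor may. */
    int main(void)
    {
        unsigned pending = 0, blocks_in = 0, blocks_out = 0, i;
        const unsigned inputs[] = { CHUNK, CHUNK, 1000 }; /* partial tail */

        for (i = 0; i < 3; i++) {
            pending += inputs[i];
            blocks_in++;
            while (pending > CHUNK) {  /* a chunk is complete and sealed */
                pending -= CHUNK;
                blocks_out++;
            }
            printf("in=%u out=%u\n", blocks_in, blocks_out);
        }
        if (pending) blocks_out++;     /* folder flush seals the tail */
        printf("after flush: in=%u out=%u\n", blocks_in, blocks_out);
        return 0;
    }

Under this model the trace prints in=1 out=0, in=2 out=1, in=3 out=2, then after flush in=3 out=3 -- exactly the situation the is_last_block/flush path above exists to close out.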
@@ -377,7 +423,7 @@ static BOOL add_file_data( FCI_Int *fci, char *sourcefile, char *filename, BOOL } file->size += len; fci->cdata_in += len; - if (fci->cdata_in == CAB_BLOCKMAX && !add_data_block( fci, status_callback )) return FALSE; + if (fci->cdata_in == CAB_BLOCKMAX && !add_data_blocks( fci, FALSE, status_callback )) return FALSE; } fci->close( handle, &err, fci->pv ); return TRUE; @@ -824,7 +870,8 @@ static BOOL add_data_to_folder( FCI_Int *fci, struct folder *folder, cab_ULONG * } if (split_block) break; free_data_block( fci, block ); - fci->cDataBlocks--; + fci->cDataBlocksIn--; + fci->cDataBlocksOut--; }
if (list_empty( &fci->blocks_list )) return TRUE; @@ -905,6 +952,10 @@ static cab_UWORD compress_NONE( FCI_Int *fci ) return fci->cdata_in; }
+static void shutdown_NONE(FCI_Int *fci) +{ +} + static void *zalloc( void *opaque, unsigned int items, unsigned int size ) { FCI_Int *fci = opaque; @@ -938,9 +989,118 @@ static cab_UWORD compress_MSZIP( FCI_Int *fci ) fci->data_out[1] = 'K'; deflate( &stream, Z_FINISH ); deflateEnd( &stream ); + fci->have_data_out = TRUE; return stream.total_out + 2; }
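For reference, each MSZIP data block is the two-byte 'CK' signature followed by a DEFLATE stream (RFC 1951). A minimal zlib-based sketch of reading back one such block (hypothetical helper; real CAB decoding must also carry the 32 KiB history across blocks via inflateSetDictionary(), which is omitted here):

    #include <string.h>
    #include <zlib.h>

    static int mszip_inflate_block(const unsigned char *in, unsigned in_len,
                                   unsigned char *out, unsigned out_len)
    {
        z_stream zs;
        int ret;

        if (in_len < 2 || in[0] != 'C' || in[1] != 'K') return -1;

        memset(&zs, 0, sizeof(zs));
        if (inflateInit2(&zs, -MAX_WBITS) != Z_OK) return -1; /* raw deflate */

        zs.next_in   = (unsigned char *)in + 2;  /* skip the CK signature */
        zs.avail_in  = in_len - 2;
        zs.next_out  = out;
        zs.avail_out = out_len;

        ret = inflate(&zs, Z_FINISH);
        inflateEnd(&zs);
        return ret == Z_STREAM_END ? (int)zs.total_out : -1;
    }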
+static void shutdown_MSZIP( FCI_Int *fci )
+{
+}
+
+static void shutdown_LZX(FCI_Int *fci)
+{
+    liblzx_compress_destroy(fci->lzx_compressor);
+    fci->lzx_compressor = NULL;
+}
+
+static void *compress_LZX_alloc_callback(void *userdata, size_t size)
+{
+    FCI_Int *fci = (FCI_Int *)userdata;
+
+    return fci->alloc((ULONG)size);
+}
+
+static void compress_LZX_free_callback(void *userdata, void *ptr)
+{
+    FCI_Int *fci = (FCI_Int *)userdata;
+
+    fci->free(ptr);
+}
+
+static cab_UWORD compress_LZX(FCI_Int *fci)
+{
+    size_t in_digested = 0;
+    size_t compressed_size = 0;
+    const liblzx_output_chunk_t *out_chunk = NULL;
+
+    if (fci->cDataBlocksIn == 0) {
+        /* First block, restart compression */
+        int window_size_bits = LZXCompressionWindowFromTCOMP(fci->compression);
+        liblzx_compress_properties_t props;
+
+        if (fci->lzx_compressor)
+        {
+            liblzx_compress_destroy(fci->lzx_compressor);
+            fci->lzx_compressor = NULL;
+        }
+
+        memset(&props, 0, sizeof(props));
+        props.lzx_variant = LIBLZX_VARIANT_CAB_DELTA;
+        props.window_size = 1 << window_size_bits;
+        props.chunk_granularity = CAB_BLOCKMAX;
+        props.compression_level = 70;
+        props.e8_file_size = LIBLZX_CONST_DEFAULT_E8_FILE_SIZE;
+        props.alloc_func = compress_LZX_alloc_callback;
+        props.free_func = compress_LZX_free_callback;
+        props.userdata = fci;
+
+        fci->lzx_compressor = liblzx_compress_create(&props);
+
+        if (!fci->lzx_compressor)
+        {
+            set_error(fci, FCIERR_ALLOC_FAIL, ERROR_OUTOFMEMORY);
+            return 0;
+        }
+    }
+
+    while (in_digested < fci->cdata_in)
+    {
+        in_digested += liblzx_compress_add_input(fci->lzx_compressor, fci->data_in + in_digested, fci->cdata_in - in_digested);
+
+        if (out_chunk)
+        {
+            /* After producing an output chunk, all data should be digestible. */
+            assert(in_digested == fci->cdata_in);
+            break;
+        }
+
+        out_chunk = liblzx_compress_get_next_chunk(fci->lzx_compressor);
+
+        if (out_chunk)
+        {
+            compressed_size = out_chunk->size;
+            memcpy(fci->data_out, out_chunk->data, compressed_size);
+            liblzx_compress_release_next_chunk(fci->lzx_compressor);
+
+            fci->have_data_out = TRUE;
+        }
+    }
+
+    return compressed_size;
+}
+
+static cab_UWORD flush_LZX(FCI_Int *fci)
+{
+    const liblzx_output_chunk_t *out_chunk = NULL;
+    cab_UWORD compressed_size = 0;
+
+    liblzx_compress_end_input(fci->lzx_compressor);
+    out_chunk = liblzx_compress_get_next_chunk(fci->lzx_compressor);
+
+    if (out_chunk == NULL)
+    {
+        return 0;
+    }
+
+    compressed_size = out_chunk->size;
+    memcpy(fci->data_out, out_chunk->data, out_chunk->size);
+
+    liblzx_compress_release_next_chunk(fci->lzx_compressor);
+
+    fci->have_data_out = TRUE;
+
+    return compressed_size;
+}
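Taken together, compress_LZX() and flush_LZX() exercise the entire liblzx streaming interface declared in liblzx.h. As a standalone illustration of that call sequence (a sketch only: compress_buffer() and the malloc-backed callbacks are hypothetical, error handling is elided):

    #include <stdlib.h>
    #include <string.h>
    #include "liblzx.h"

    static void *lzx_alloc(void *userdata, size_t size) { return malloc(size); }
    static void  lzx_free(void *userdata, void *ptr)    { free(ptr); }

    static void compress_buffer(const unsigned char *data, size_t size)
    {
        liblzx_compress_properties_t props;
        liblzx_compressor_t *c;
        const liblzx_output_chunk_t *chunk;
        size_t done = 0;

        memset(&props, 0, sizeof(props));
        props.lzx_variant       = LIBLZX_VARIANT_CAB_DELTA;
        props.window_size       = 1 << 15;  /* smallest CAB LZX window */
        props.chunk_granularity = LIBLZX_CONST_DEFAULT_CHUNK_SIZE;
        props.compression_level = 70;
        props.e8_file_size      = LIBLZX_CONST_DEFAULT_E8_FILE_SIZE;
        props.alloc_func        = lzx_alloc;
        props.free_func         = lzx_free;
        props.userdata          = NULL;

        if (!(c = liblzx_compress_create(&props))) return;

        while (done < size)
        {
            /* A short count means a chunk is pending and must be drained. */
            done += liblzx_compress_add_input(c, data + done, size - done);
            while ((chunk = liblzx_compress_get_next_chunk(c)))
            {
                /* write chunk->data, chunk->size to the output here */
                liblzx_compress_release_next_chunk(c);
            }
        }

        liblzx_compress_end_input(c);  /* seal the stream */
        while ((chunk = liblzx_compress_get_next_chunk(c)))
        {
            /* write the final chunk(s) */
            liblzx_compress_release_next_chunk(c);
        }

        liblzx_compress_destroy(c);
    }

In fci.c the window exponent instead comes from the caller: FCIAddFile() receives it in the tcompMASK_LZX_WINDOW bits of typeCompress (validated against tcompLZX_WINDOW_LO/HI below), and compress_LZX() turns it into a byte count with 1 << LZXCompressionWindowFromTCOMP(...).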
/*********************************************************************** * FCICreate (CABINET.10) @@ -1046,6 +1206,7 @@ HFCI __cdecl FCICreate( p_fci_internal->pv = pv; p_fci_internal->data.handle = -1; p_fci_internal->compress = compress_NONE; + p_fci_internal->compress_shutdown = shutdown_NONE;
list_init( &p_fci_internal->folders_list ); list_init( &p_fci_internal->files_list ); @@ -1102,11 +1263,12 @@ static BOOL fci_flush_folder( FCI_Int *p_fci_internal, p_fci_internal->fSplitFolder=FALSE;
/* START of COPY */ - if (!add_data_block( p_fci_internal, pfnfcis )) return FALSE; + if (!add_data_blocks( p_fci_internal, TRUE, pfnfcis )) return FALSE;
/* reset to get the number of data blocks of this folder which are */ /* actually in this cabinet ( at least partially ) */ - p_fci_internal->cDataBlocks=0; + p_fci_internal->cDataBlocksIn = 0; + p_fci_internal->cDataBlocksOut = 0;
p_fci_internal->statusFolderTotal = get_header_size( p_fci_internal ) + sizeof(CFFOLDER) + p_fci_internal->ccab.cbReserveCFFolder + @@ -1211,7 +1373,8 @@ static BOOL fci_flush_folder( FCI_Int *p_fci_internal, if (!add_files_to_folder( p_fci_internal, folder, payload )) return FALSE;
/* reset CFFolder specific information */ - p_fci_internal->cDataBlocks=0; + p_fci_internal->cDataBlocksIn=0; + p_fci_internal->cDataBlocksOut=0; p_fci_internal->cCompressedBytesInFolder=0;
return TRUE; @@ -1409,19 +1572,41 @@ BOOL __cdecl FCIAddFile(
if (typeCompress != p_fci_internal->compression) { + if ((typeCompress & tcompMASK_TYPE) == tcompTYPE_LZX) { + TCOMP window_size_bits = (typeCompress & tcompMASK_LZX_WINDOW); + + if (window_size_bits < tcompLZX_WINDOW_LO || window_size_bits > tcompLZX_WINDOW_HI) { + set_error(p_fci_internal, FCIERR_BAD_COMPR_TYPE, ERROR_BAD_ARGUMENTS); + return FALSE; + } + } + if (!FCIFlushFolder( hfci, pfnfcignc, pfnfcis )) return FALSE; - switch (typeCompress) + + p_fci_internal->compress_shutdown(p_fci_internal); + + switch (typeCompress & tcompMASK_TYPE) { case tcompTYPE_MSZIP: - p_fci_internal->compression = tcompTYPE_MSZIP; - p_fci_internal->compress = compress_MSZIP; + p_fci_internal->compression = tcompTYPE_MSZIP; + p_fci_internal->compress = compress_MSZIP; + p_fci_internal->flush = NULL; + p_fci_internal->compress_shutdown = shutdown_MSZIP; + break; + case tcompTYPE_LZX: + p_fci_internal->compression = typeCompress; + p_fci_internal->compress = compress_LZX; + p_fci_internal->flush = flush_LZX; + p_fci_internal->compress_shutdown = shutdown_LZX; break; default: FIXME( "compression %x not supported, defaulting to none\n", typeCompress ); /* fall through */ case tcompTYPE_NONE: - p_fci_internal->compression = tcompTYPE_NONE; - p_fci_internal->compress = compress_NONE; + p_fci_internal->compression = tcompTYPE_NONE; + p_fci_internal->compress = compress_NONE; + p_fci_internal->flush = NULL; + p_fci_internal->compress_shutdown = shutdown_NONE; break; } } diff --git a/dlls/cabinet/liblzx.h b/dlls/cabinet/liblzx.h new file mode 100644 index 00000000000..504a3289bac --- /dev/null +++ b/dlls/cabinet/liblzx.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + +#pragma once + +#ifndef __LIBLZX_H__ +#define __LIBLZX_H__ + +#include <stddef.h> +#include <stdint.h> + +#include "liblzx_error.h" + +typedef struct liblzx_internal liblzx_internal_t; +typedef struct liblzx_compress_properties liblzx_compress_properties_t; +typedef struct liblzx_compressor liblzx_compressor_t; +typedef struct liblzx_output_chunk liblzx_output_chunk_t; + +typedef void *(*liblzx_alloc_func_t)(void *opaque, size_t size); +typedef void (*liblzx_free_func_t)(void *opaque, void *ptr); + +enum liblzx_variant { + /* LZX variant used by CAB files and LZX DELTA */ + LIBLZX_VARIANT_CAB_DELTA, + + /* LZX variant used by WIM */ + LIBLZX_VARIANT_WIM, +}; + +typedef enum liblzx_variant liblzx_variant_t; + +enum liblzx_constant { + LIBLZX_CONST_DEFAULT_CHUNK_SIZE = 32768, + LIBLZX_CONST_DEFAULT_E8_FILE_SIZE = 12 * 1024 * 1024, + LIBLZX_CONST_MAX_WINDOW_SIZE = 64 * 1024 * 1024, +}; + +struct liblzx_output_chunk { + const void *data; + size_t size; +}; + +struct liblzx_compress_properties { + /* LZX variant to use */ + liblzx_variant_t lzx_variant; + + /* Source file size for LZX DELTA. Ignored for WIM. 
+         * When using this, use liblzx_compress_add_input to add the source
+         * file's data before adding the new file's data. For compression
+         * only, set this to 0.
+         */
+        size_t delta_source_size;
+
+        /* Compression window size. */
+        uint32_t window_size;
+
+        /* Granularity of a chunk. Should generally be set to
+         * LIBLZX_CONST_DEFAULT_CHUNK_SIZE.
+         */
+        uint32_t chunk_granularity;
+
+        /* Compression level. Can be set arbitrarily high. */
+        uint16_t compression_level;
+
+        /* E8 file size parameter. For WIM, this is ignored. For other
+         * variants, this value is expected to be user-controllable and
+         * is sent outside of the LZX data stream.
+         */
+        uint32_t e8_file_size;
+
+        /* Memory allocation function. */
+        liblzx_alloc_func_t alloc_func;
+
+        /* Memory free function. */
+        liblzx_free_func_t free_func;
+
+        /* Userdata parameter to pass to the alloc and free functions. */
+        void *userdata;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Creates a compressor object and returns a pointer to it. */
+liblzx_compressor_t *
+liblzx_compress_create(const liblzx_compress_properties_t *props);
+
+/* Destroys a compressor object and releases all resources. */
+void
+liblzx_compress_destroy(liblzx_compressor_t *stream);
+
+/* Resets a compressor to its initial state. */
+void
+liblzx_compress_reset(liblzx_compressor_t *stream);
+
+/* Adds input data to the compression stream and returns the number of bytes
+ * digested. The return value will never exceed in_data_size. If this
+ * returns a value smaller than in_data_size, then a compressed chunk was
+ * produced and must be released with liblzx_compress_release_next_chunk
+ * before more data can be added.
+ */
+size_t
+liblzx_compress_add_input(liblzx_compressor_t *stream, const void *in_data,
+                          size_t in_data_size);
+
+/* Returns the next compressed chunk. This doesn't consume the chunk in the
+ * process, so repeated calls will keep returning the same chunk. If no chunk
+ * is available, returns NULL.
+ */
+const liblzx_output_chunk_t *
+liblzx_compress_get_next_chunk(const liblzx_compressor_t *stream);
+
+/* Releases the next compressed chunk, allowing compression to continue. */
+void
+liblzx_compress_release_next_chunk(liblzx_compressor_t *stream);
+
+/* Ends the compression stream. Any buffered input still held by the
+ * compressor becomes available as final output chunk(s).
+ */
+void
+liblzx_compress_end_input(liblzx_compressor_t *stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dlls/cabinet/liblzx_bitops.h b/dlls/cabinet/liblzx_bitops.h
new file mode 100644
index 00000000000..976a294fe54
--- /dev/null
+++ b/dlls/cabinet/liblzx_bitops.h
@@ -0,0 +1,156 @@
+/*
+ * bitops.h - inline functions for bit manipulation
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _LIBLZX_BITOPS_H
+#define _LIBLZX_BITOPS_H
+
+#include "liblzx_compiler.h"
+#include "liblzx_types.h"
+
+#if LIBLZX_IS_MSVC_COMPILER
+#include <intrin.h>
+#endif
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant bit) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static attrib_forceinline unsigned
+bsr32(uint32_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+        unsigned long result;
+        _BitScanReverse(&result, v);
+        return result;
+#else
+        return 31 - __builtin_clz(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsr64(uint64_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+# ifdef _M_AMD64
+        unsigned long result;
+        _BitScanReverse64(&result, v);
+        return result;
+# else
+        unsigned long index;
+        if (_BitScanReverse(&index, v >> 32))
+                return index + 32;
+
+        _BitScanReverse(&index, v & 0xffffffffu);
+
+        return index;
+# endif
+#else
+        return 63 - __builtin_clzll(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsrw(machine_word_t v)
+{
+        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+        if (WORDBITS == 32)
+                return bsr32((uint32_t)v);
+        else
+                return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant bit) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static attrib_forceinline unsigned
+bsf32(uint32_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+        unsigned long result;
+        _BitScanForward(&result, v);
+        return result;
+#else
+        return __builtin_ctz(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsf64(uint64_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+# ifdef _M_AMD64
+        unsigned long result;
+        _BitScanForward64(&result, v);
+        return result;
+# else
+        unsigned long index;
+        if (_BitScanForward(&index, v & 0xffffffffu))
+                return index;
+
+        if (_BitScanForward(&index, v >> 32))
+                index += 32;
+
+        return index;
+# endif
+#else
+        return __builtin_ctzll(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsfw(machine_word_t v)
+{
+        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+        if (WORDBITS == 32)
+                return bsf32(v);
+        else
+                return bsf64(v);
+}
+
+/* Return the log base 2 of 'n', rounded up to the nearest integer. */
+static attrib_forceinline unsigned
+ilog2_ceil(size_t n)
+{
+        if (n <= 1)
+                return 0;
+        return 1 + bsrw(n - 1);
+}
+
+/* Round 'n' up to the nearest power of 2 */
+static attrib_forceinline size_t
+roundup_pow_of_2(size_t n)
+{
+        return (size_t)1 << ilog2_ceil(n);
+}
+
+#endif /* _LIBLZX_BITOPS_H */
diff --git a/dlls/cabinet/liblzx_bt_matchfinder.h b/dlls/cabinet/liblzx_bt_matchfinder.h
new file mode 100644
index 00000000000..d601e110aca
--- /dev/null
+++ b/dlls/cabinet/liblzx_bt_matchfinder.h
@@ -0,0 +1,446 @@
+/*
+ * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib.
Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 4 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. 
+ * + * ---------------------------------------------------------------------------- + */ + + +#include <string.h> + +#include "liblzx_matchfinder_common.h" + +#define BT_MATCHFINDER_HASH3_ORDER 15 +#define BT_MATCHFINDER_HASH3_WAYS 2 +#define BT_MATCHFINDER_HASH4_ORDER 16 + +/* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */ +#undef TEMPLATED +#define TEMPLATED(name) CONCAT(name, MF_SUFFIX) + +#ifndef _LIBLZX_BT_MATCHFINDER_H +#define _LIBLZX_BT_MATCHFINDER_H + +/* Non-templated definitions */ + +/* Representation of a match found by the bt_matchfinder */ +struct lz_match { + + /* The number of bytes matched. */ + uint32_t length; + + /* The offset back from the current position that was matched. */ + uint32_t offset; +}; + +#endif /* _LIBLZX_BT_MATCHFINDER_H */ + +struct TEMPLATED(bt_matchfinder) { + + /* The hash table for finding length 2 matches, if enabled */ +#ifdef BT_MATCHFINDER_HASH2_ORDER + mf_pos_t hash2_tab[1UL << BT_MATCHFINDER_HASH2_ORDER]; +#endif + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; + + /* The hash table which contains the roots of the binary trees for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ + mf_pos_t child_tab[]; +}; + +static attrib_forceinline bool +TEMPLATED(matchfinder_is_valid_pos)(mf_pos_t pos, mf_pos_t min_pos) +{ + return ((pos + 1) & MF_INVALID_POS) > min_pos; +} + +static attrib_forceinline void +TEMPLATED(matchfinder_rebase)(mf_pos_t * mf_base, size_t count, + mf_pos_t cull_amount) +{ + /* The invalid value points to the last element of the buffer. */ + /* Since no match can start from that byte, it is always invalid. */ + while (count > 0) { + mf_pos_t pos = *mf_base; + + if (pos < cull_amount || pos == MF_INVALID_POS) { + *mf_base = MF_INVALID_POS; + } else { + *mf_base -= cull_amount; + } + + mf_base++; + count--; + } +} + +/* Return the number of bytes that must be allocated for a 'bt_matchfinder' that + * can work with buffers up to the specified size. */ +static attrib_forceinline size_t +TEMPLATED(bt_matchfinder_size)(size_t max_bufsize, bool streaming) +{ + const size_t streaming_bufsize_mul = streaming ? 4 : 2; + + const size_t base_size = + sizeof(struct TEMPLATED(bt_matchfinder)) + + (streaming_bufsize_mul * max_bufsize * sizeof(mf_pos_t)); + + return base_size; +} + +/* Prepare the matchfinder for a new input buffer. */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_init)(struct TEMPLATED(bt_matchfinder) *mf) +{ + memset(mf, 0xFF, sizeof(*mf)); +} + +static attrib_forceinline mf_pos_t * +TEMPLATED(bt_left_child)(struct TEMPLATED(bt_matchfinder) *mf, uint32_t node) +{ + return &mf->child_tab[(node << 1) + 0]; +} + +static attrib_forceinline mf_pos_t * +TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, uint32_t node) +{ + return &mf->child_tab[(node << 1) + 1]; +} + +/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() + * and bt_matchfinder_skip_byte(). There must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define BT_MATCHFINDER_REQUIRED_NBYTES 5 + +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. 
@record_matches should be a compile-time constant. */ +static attrib_forceinline struct lz_match * +TEMPLATED(bt_matchfinder_advance_one_byte)(struct TEMPLATED(bt_matchfinder) * const mf, + const uint8_t * const in_begin, + mf_pos_t in_min_pos, + const ptrdiff_t cur_pos, + const uint32_t max_find_len, + const uint32_t max_produce_len, + const uint32_t nice_len, + const uint32_t max_search_depth, + uint32_t * const next_hashes, + uint32_t * const best_len_ret, + struct lz_match *lz_matchptr, + const bool record_matches) +{ + const uint8_t *in_next = in_begin + cur_pos; + uint32_t depth_remaining = max_search_depth; + uint32_t next_hashseq; + uint32_t hash3; + uint32_t hash4; +#ifdef BT_MATCHFINDER_HASH2_ORDER + uint16_t seq2; + uint32_t hash2; +#endif + STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && + BT_MATCHFINDER_HASH3_WAYS <= 2); + uint32_t cur_node; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + uint32_t cur_node_2; +#endif + const uint8_t *matchptr; + mf_pos_t *pending_lt_ptr, *pending_gt_ptr; + uint32_t best_lt_len, best_gt_len; + uint32_t len; + uint32_t best_len = 3; + + next_hashseq = get_unaligned_le32(in_next + 1); + + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + +#ifdef BT_MATCHFINDER_HASH2_ORDER + seq2 = load_u16_unaligned(in_next); + hash2 = lz_hash(seq2, BT_MATCHFINDER_HASH2_ORDER); + cur_node = mf->hash2_tab[hash2]; + mf->hash2_tab[hash2] = cur_pos; + if (record_matches && + TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos) && + seq2 == load_u16_unaligned(&in_begin[cur_node])) + { + lz_matchptr->length = 2; + lz_matchptr->offset = in_next - &in_begin[cur_node]; + lz_matchptr++; + } +#endif + + cur_node = mf->hash3_tab[hash3][0]; + mf->hash3_tab[hash3][0] = cur_pos; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + cur_node_2 = mf->hash3_tab[hash3][1]; + mf->hash3_tab[hash3][1] = cur_node; +#endif + if (record_matches && + TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos)) { + uint32_t seq3 = load_u24_unaligned(in_next); + if (seq3 == load_u24_unaligned(&in_begin[cur_node]) && + likely(cur_node >= in_min_pos)) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_begin[cur_node]; + lz_matchptr++; + } + #if BT_MATCHFINDER_HASH3_WAYS >= 2 + else if (TEMPLATED(matchfinder_is_valid_pos)(cur_node_2, + in_min_pos) && + seq3 == load_u24_unaligned(&in_begin[cur_node_2])) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_begin[cur_node_2]; + lz_matchptr++; + } + #endif + } + + cur_node = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + pending_lt_ptr = TEMPLATED(bt_left_child)(mf, cur_pos); + pending_gt_ptr = TEMPLATED(bt_right_child)(mf, cur_pos); + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos)) { + *pending_lt_ptr = MF_INVALID_POS; + *pending_gt_ptr = MF_INVALID_POS; + *best_len_ret = best_len; + return lz_matchptr; + } + + best_lt_len = 0; + best_gt_len = 0; + len = 0; + + for (;;) { + matchptr = &in_begin[cur_node]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, max_find_len); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = min_u32(len, max_produce_len); + lz_matchptr->offset = + in_next - matchptr; + lz_matchptr++; + } + if (len >= nice_len) { + *pending_lt_ptr = + 
*TEMPLATED(bt_left_child)(mf, cur_node); + *pending_gt_ptr = + *TEMPLATED(bt_right_child)(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_node; + pending_lt_ptr = TEMPLATED(bt_right_child)(mf, cur_node); + cur_node = *pending_lt_ptr; + best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; + } else { + *pending_gt_ptr = cur_node; + pending_gt_ptr = TEMPLATED(bt_left_child)(mf, cur_node); + cur_node = *pending_gt_ptr; + best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node, + in_min_pos) || + !--depth_remaining) { + *pending_lt_ptr = MF_INVALID_POS; + *pending_gt_ptr = MF_INVALID_POS; + *best_len_ret = best_len; + + return lz_matchptr; + } + } +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_abs_pos + * Absolute position of in_begin in the file + * @cur_pos + * The current position in the input buffer relative to @in_begin (the + * position of the sequence being matched against). + * @max_len + * The maximum permissible match length at this position. Must be >= + * BT_MATCHFINDER_REQUIRED_NBYTES. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @best_len_ret + * If a match of length >= 4 was found, then the length of the longest such + * match is written here; otherwise 3 is written here. (Note: this is + * redundant with the 'struct lz_match' array, but this is easier for the + * compiler to optimize when inlined and the caller immediately does a + * check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and (non-strictly) + * increasing offset. The maximum number of matches that may be found is + * 'nice_len - 1', or one less if length 2 matches are disabled. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static attrib_forceinline struct lz_match * +TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) *mf, + const uint8_t *in_begin, + uint32_t in_min_pos, + ptrdiff_t cur_pos, + uint32_t max_find_len, + uint32_t max_produce_len, + uint32_t nice_len, + uint32_t max_search_depth, + uint32_t next_hashes[2], + uint32_t *best_len_ret, + struct lz_match *lz_matchptr) +{ + return TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + in_min_pos, + cur_pos, + max_find_len, + max_produce_len, + nice_len, + max_search_depth, + next_hashes, + best_len_ret, + lz_matchptr, + true); +} + +/* + * Advance the matchfinder, but don't record any matches. + * + * This is very similar to bt_matchfinder_get_matches() because both functions + * must do hashing and tree re-rooting. 
+ */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_skip_byte)(struct TEMPLATED(bt_matchfinder) *mf, + const uint8_t *in_begin, + uint32_t in_min_pos, + ptrdiff_t cur_pos, + uint32_t nice_len, + uint32_t max_search_depth, + uint32_t next_hashes[2]) +{ + uint32_t best_len; + TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + in_min_pos, + cur_pos, + nice_len, + nice_len, + nice_len, + max_search_depth, + next_hashes, + &best_len, + NULL, + false); +} + +/* + * Culls any matches that are lower than a specified offset and reduces any + * remaining offsets by the same amount. + */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_cull)(struct TEMPLATED(bt_matchfinder) * mf, + uint32_t cull_size, uint32_t window_size) +{ + size_t mf_size = TEMPLATED(bt_matchfinder_size)(window_size, true); + + const size_t mf_count = mf_size / sizeof(mf_pos_t); + + TEMPLATED(matchfinder_rebase)((mf_pos_t *)mf, mf_count, cull_size); +} diff --git a/dlls/cabinet/liblzx_compiler.h b/dlls/cabinet/liblzx_compiler.h new file mode 100644 index 00000000000..73e36c07a59 --- /dev/null +++ b/dlls/cabinet/liblzx_compiler.h @@ -0,0 +1,214 @@ +/* + * compiler.h + * + * Compiler-specific definitions. Currently, only GCC and clang are supported. + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_COMPILER_H +#define _LIBLZX_COMPILER_H + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#define LIBLZX_IS_MSVC_COMPILER 1 +#else +#define LIBLZX_IS_MSVC_COMPILER 0 +#endif + +#if LIBLZX_IS_MSVC_COMPILER +#include <stdint.h> +#include <stddef.h> + +#pragma warning(error:4013) +#endif + +#ifndef __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 1 +#endif + +#ifndef __ORDER_BIG_ENDIAN__ +#define __ORDER_BIG_ENDIAN__ 2 +#endif + + +/* Is the compiler GCC of the specified version or later? This always returns + * false for clang, since clang is "frozen" at GNUC 4.2. The __has_* + * feature-test macros should be used to detect clang functionality instead. */ +#define GCC_PREREQ(major, minor) \ + (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ + (__GNUC__ > major || \ + (__GNUC__ == major && __GNUC_MINOR__ >= minor))) + +/* Feature-test macros defined by recent versions of clang. 
*/ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_feature +# define __has_feature(feature) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* Declare that the annotated function should always be inlined. This might be + * desirable in highly tuned code, e.g. compression codecs. */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_forceinline __forceinline +#else +#define attrib_forceinline inline __attribute__((always_inline)) +#endif + +/* Declare that the annotated function should *not* be inlined. */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_noinline __declspec(noinline) +#else +#define attrib_noinline __attribute__((noinline)) +#endif + +/* Declare that the annotated function is unlikely to be executed */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_cold +#else +#define attrib_cold __attribute__((cold)) +#endif + +/* Declare that the annotated type or variable is aligned */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_aligned(alignment) __declspec(align(alignment)) +#else +#define attrib_aligned(alignment) __attribute__((aligned(alignment))) +#endif + +/* Functionally the same as 'attrib_noinline', but documents that the reason + * for not inlining is to prevent the annotated function from being inlined + * into a recursive function, thereby increasing its stack usage. */ +#define attrib_noinline_for_stack attrib_noinline + +/* Hint that the expression is usually true. */ +#if LIBLZX_IS_MSVC_COMPILER +#define likely(expr) (expr) +#else +#define likely(expr) __builtin_expect(!!(expr), 1) +#endif + +/* Hint that the expression is usually false. */ +#if LIBLZX_IS_MSVC_COMPILER +#define unlikely(expr) (expr) +#else +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#endif + +/* Prefetch into L1 cache for read. */ +#if LIBLZX_IS_MSVC_COMPILER +#define prefetchr(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#else +#define prefetchr(addr) __builtin_prefetch((addr), 0) +#endif + +/* Prefetch into L1 cache for write. */ +#if LIBLZX_IS_MSVC_COMPILER +#define prefetchw(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#else +#define prefetchw(addr) __builtin_prefetch((addr), 1) +#endif + +/* Hint that the annotated function takes a printf()-like format string and + * arguments. This is currently disabled on Windows because MinGW does not + * support this attribute on functions taking wide-character strings. */ +#ifdef _WIN32 +# define _format_attribute(type, format_str, format_start) +#else +# define _format_attribute(type, format_str, format_start) \ + __attribute__((format(type, format_str, format_start))) +#endif + +/* Endianness definitions. Either CPU_IS_BIG_ENDIAN() or CPU_IS_LITTLE_ENDIAN() + * evaluates to 1. The other evaluates to 0. Note that newer gcc supports + * __BYTE_ORDER__ for easily determining the endianness; older gcc doesn't. In + * the latter case we fall back to a configure-time check. */ +#ifdef __BYTE_ORDER__ +# define CPU_IS_BIG_ENDIAN() (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(HAVE_CONFIG_H) +# include "config.h" +# ifdef WORDS_BIGENDIAN +# define CPU_IS_BIG_ENDIAN() 1 +# else +# define CPU_IS_BIG_ENDIAN() 0 +# endif +#endif +#define CPU_IS_LITTLE_ENDIAN() (!CPU_IS_BIG_ENDIAN()) + +/* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses + * can be performed efficiently on the target platform. 
*/ +#if defined(__x86_64__) || defined(__i386__) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* Swap the values of two variables, without multiple evaluation. */ +#ifndef swap +# define swap(a, b) do { typeof(a) _a = (a); (a) = (b); (b) = _a; } while(0) +#endif +#define SWAP(a, b) swap((a), (b)) + +/* Optional definitions for checking with 'sparse'. */ +#ifdef __CHECKER__ +# define _bitwise_attr __attribute__((bitwise)) +# define _force_attr __attribute__((force)) +#else +# define _bitwise_attr +# define _force_attr +#endif + +/* STATIC_ASSERT() - verify the truth of an expression at compilation time. */ +#ifdef __CHECKER__ +# define STATIC_ASSERT(expr) +# define STATIC_ASSERT_STMT(expr) +#elif __STDC_VERSION__ >= 201112L +# define STATIC_ASSERT(expr) _Static_assert((expr), "") +# define STATIC_ASSERT_STMT(expr) do {_Static_assert((expr), "");} while(0) +#else +# define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +# define STATIC_ASSERT_STMT(expr) STATIC_ASSERT(expr) +#endif + +/* STATIC_ASSERT_ZERO() - verify the truth of an expression at compilation time + * and also produce a result of value '0' to be used in constant expressions */ +#define STATIC_ASSERT_ZERO(expr) ((int)sizeof(char[-!(expr)])) + +#define CONCAT_IMPL(s1, s2) s1##s2 + +/* CONCAT() - concatenate two tokens at preprocessing time. */ +#define CONCAT(s1, s2) CONCAT_IMPL(s1, s2) + +#if LIBLZX_IS_MSVC_COMPILER +#define __builtin_constant_p(n) (0) + +typedef ptrdiff_t ssize_t; +#endif + +#endif /* _LIBLZX_COMPILER_H */ diff --git a/dlls/cabinet/liblzx_compress_common.c b/dlls/cabinet/liblzx_compress_common.c new file mode 100644 index 00000000000..be88df26b85 --- /dev/null +++ b/dlls/cabinet/liblzx_compress_common.c @@ -0,0 +1,673 @@ +/* + * compress_common.c + * + * Code for compression shared among multiple compression formats. + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <string.h> + +#include <assert.h> +#include "liblzx_compress_common.h" +#include "liblzx_minmax.h" +#include "liblzx_util.h" + +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than or equal to both of its children, so that the maxheap property is + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. + */ +static void +heapify_subtree(uint32_t A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + uint32_t v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(uint32_t A[], unsigned length) +{ + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't + * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + */ +static void +heap_sort(uint32_t A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + uint32_t tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) + +#define GET_NUM_COUNTERS(num_syms) (num_syms) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * + * @freqs[num_syms] + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const uint32_t freqs[], uint8_t lens[], uint32_t symout[]) +{ + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. 
High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[min_size(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + uint32_t freq = freqs[sym]; + + if (freq != 0) { + symout[counters[min_size(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build a Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. 
+ */ +static void +build_tree(uint32_t A[], unsigned sym_count) +{ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + uint32_t new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. + */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. + */ + } while (++e < last_idx); +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(uint32_t A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) +{ + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. 
Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ + len_counts[len]--; + len_counts[len + 1] += 2; + } +} + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(uint32_t A[], uint8_t lens[], const unsigned len_counts[], + unsigned max_codeword_len, unsigned num_syms) +{ + uint32_t next_codewords[MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. 
+ */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} + +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * + * @lens + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. + * + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. + * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) 
At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. + * + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. + * + * NOTE: in general, the same frequencies can be used to generate different + * length-limited canonical Huffman codes. One choice we have is during tree + * construction, when we must decide whether to prefer a leaf or non-leaf when + * there is a tie in frequency. Another choice we have is how to deal with + * codewords that would exceed @max_codeword_len bits in length. Both of these + * choices affect the resulting codeword lengths, which otherwise can be mapped + * uniquely onto the resulting canonical Huffman code. + * + * Normally, there is no problem with choosing one valid code over another, + * provided that they produce similar compression ratios. However, the LZMS + * compression format uses adaptive Huffman coding. It requires that both the + * decompressor and compressor build a canonical code equivalent to that which + * can be generated by using the classic Huffman tree construction algorithm and + * always processing leaves before non-leaves when there is a frequency tie. + * Therefore, we make sure to do this. This method also has the advantage of + * sometimes shortening the longest codeword that is generated. + * + * There also is the issue of how codewords longer than @max_codeword_len are + * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS + * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since + * the LZMS algorithm regularly halves all frequencies, the frequencies cannot + * become high enough for a length 16 codeword to be generated. 
Specifically, I + * think that if ties are broken in favor of non-leaves (as we do), the lowest + * total frequency that would give a length-16 codeword would be the sum of the + * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is + * 3570. And in LZMS we can't get a frequency that high based on the alphabet + * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is + * based on the following degenerate case (only the bottom of the tree shown): + * + * ... + * 17 + * / \ + * 10 7 + * / \ + * 6 4 + * / \ + * 3 3 + * / \ + * 2 1 + * / \ + * 1 1 + * + * Excluding the first leaves (those with value 1), each leaf value must be + * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would + * have taken precedence over that non-leaf and been combined with the leaf + * below, thereby decreasing the height compared to that shown. + * + * Interesting fact: if we were to instead prioritize non-leaves over leaves, + * then the worst case frequencies would be the Fibonacci sequence, plus an + * extra frequency of 1. In this hypothetical scenario, it would be slightly + * easier for longer codewords to be generated. + */ +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const uint32_t freqs[], uint8_t lens[], uint32_t codewords[]) +{ + uint32_t *A = codewords; + unsigned num_used_syms; + + assert(num_syms <= MAX_NUM_SYMS); + STATIC_ASSERT_STMT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + assert(max_codeword_len <= MAX_CODEWORD_LEN); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * Handle special cases where only 0 or 1 symbols were used (had nonzero + * frequency). + */ + + if (unlikely(num_used_syms == 0)) { + /* + * Code is empty. sort_symbols() already set all lengths to 0, + * so there is nothing more to do. + */ + return; + } + + if (unlikely(num_used_syms == 1)) { + /* + * Only one symbol was used, so we only need one codeword. But + * two codewords are needed to form the smallest complete + * Huffman code, which uses codewords 0 and 1. Therefore, we + * choose another symbol to which to assign a codeword. We use + * 0 (if the used symbol is not 0) or 1 (if the used symbol is + * 0). In either case, the lesser-valued symbol must be + * assigned codeword 0 so that the resulting code is canonical. + */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. 
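+	 * + * (Note: with 'num_used_syms' leaves, the stripped-down tree built by + * build_tree() has 'num_used_syms - 1' non-leaf entries, stored at + * A[0..num_used_syms - 2]; hence 'num_used_syms - 2' is passed as the + * root index below.)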
+ */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} diff --git a/dlls/cabinet/liblzx_compress_common.h b/dlls/cabinet/liblzx_compress_common.h new file mode 100644 index 00000000000..5eb73086fb7 --- /dev/null +++ b/dlls/cabinet/liblzx_compress_common.h @@ -0,0 +1,19 @@ +/* + * compress_common.h + * + * Header for compression code shared by multiple compression formats. + */ + +#ifndef _LIBLZX_COMPRESS_COMMON_H +#define _LIBLZX_COMPRESS_COMMON_H + +#include "liblzx_types.h" + +#define MAX_NUM_SYMS 799 /* LZMS_MAX_NUM_SYMS */ +#define MAX_CODEWORD_LEN 16 + +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const uint32_t freqs[], uint8_t lens[], uint32_t codewords[]); + +#endif /* _LIBLZX_COMPRESS_COMMON_H */ diff --git a/dlls/cabinet/liblzx_config.h b/dlls/cabinet/liblzx_config.h new file mode 100644 index 00000000000..2c070d93022 --- /dev/null +++ b/dlls/cabinet/liblzx_config.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + + #pragma once + +#ifndef __LIBLZX_CONFIG_H__ +#define __LIBLZX_CONFIG_H__ + +// Set to 1 to export as a DLL +#ifndef LIBLZX_DLL_EXPORT +#define LIBLZX_DLL_EXPORT 0 +#endif + +#endif diff --git a/dlls/cabinet/liblzx_endianness.h b/dlls/cabinet/liblzx_endianness.h new file mode 100644 index 00000000000..2ee8fceee69 --- /dev/null +++ b/dlls/cabinet/liblzx_endianness.h @@ -0,0 +1,136 @@ +/* + * endianness.h - macros and inline functions for endianness conversion + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_ENDIANNESS_H +#define _LIBLZX_ENDIANNESS_H + +#include "liblzx_compiler.h" +#include "liblzx_types.h" + +#if LIBLZX_IS_MSVC_COMPILER +#include <intrin.h> +#endif + +#ifdef HAVE_SYS_ENDIAN_H + /* Needed on NetBSD to stop system bswap macros from messing things up */ +# include <sys/endian.h> +# undef bswap16 +# undef bswap32 +# undef bswap64 +#endif + +/* Watch out for conflict with ntfs-3g/endians.h ... */ +#ifndef _NTFS_ENDIANS_H + +#define bswap16_const(n) \ + ((((uint16_t)(n) & 0x00FF) << 8) | \ + (((uint16_t)(n) & 0xFF00) >> 8)) + +#define bswap32_const(n) \ + ((((uint32_t)(n) & 0x000000FF) << 24) | \ + (((uint32_t)(n) & 0x0000FF00) << 8) | \ + (((uint32_t)(n) & 0x00FF0000) >> 8) | \ + (((uint32_t)(n) & 0xFF000000) >> 24)) + +#define bswap64_const(n) \ + ((((uint64_t)(n) & 0x00000000000000FF) << 56) | \ + (((uint64_t)(n) & 0x000000000000FF00) << 40) | \ + (((uint64_t)(n) & 0x0000000000FF0000) << 24) | \ + (((uint64_t)(n) & 0x00000000FF000000) << 8) | \ + (((uint64_t)(n) & 0x000000FF00000000) >> 8) | \ + (((uint64_t)(n) & 0x0000FF0000000000) >> 24) | \ + (((uint64_t)(n) & 0x00FF000000000000) >> 40) | \ + (((uint64_t)(n) & 0xFF00000000000000) >> 56)) + +static attrib_forceinline uint16_t do_bswap16(uint16_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_ushort(n); +#elif GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(n); +#else + return bswap16_const(n); +#endif +} + +static attrib_forceinline uint32_t do_bswap32(uint32_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_ulong(n); +#elif GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(n); +#else + return bswap32_const(n); +#endif +} + +static attrib_forceinline uint64_t do_bswap64(uint64_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_uint64(n); +#elif GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(n); +#else + return bswap64_const(n); +#endif +} + +#define bswap16(n) (__builtin_constant_p(n) ? bswap16_const(n) : do_bswap16(n)) +#define bswap32(n) (__builtin_constant_p(n) ? bswap32_const(n) : do_bswap32(n)) +#define bswap64(n) (__builtin_constant_p(n) ? 
bswap64_const(n) : do_bswap64(n)) + +#if CPU_IS_BIG_ENDIAN() +# define cpu_to_le16(n) ((_force_attr le16_t)bswap16(n)) +# define cpu_to_le32(n) ((_force_attr le32_t)bswap32(n)) +# define cpu_to_le64(n) ((_force_attr le64_t)bswap64(n)) +# define le16_to_cpu(n) bswap16((_force_attr uint16_t)(le16_t)(n)) +# define le32_to_cpu(n) bswap32((_force_attr uint32_t)(le32_t)(n)) +# define le64_to_cpu(n) bswap64((_force_attr uint64_t)(le64_t)(n)) +# define cpu_to_be16(n) ((_force_attr be16_t)(uint16_t)(n)) +# define cpu_to_be32(n) ((_force_attr be32_t)(uint32_t)(n)) +# define cpu_to_be64(n) ((_force_attr be64_t)(uint64_t)(n)) +# define be16_to_cpu(n) ((_force_attr uint16_t)(be16_t)(n)) +# define be32_to_cpu(n) ((_force_attr uint32_t)(be32_t)(n)) +# define be64_to_cpu(n) ((_force_attr uint64_t)(be64_t)(n)) +#else +# define cpu_to_le16(n) ((_force_attr le16_t)(uint16_t)(n)) +# define cpu_to_le32(n) ((_force_attr le32_t)(uint32_t)(n)) +# define cpu_to_le64(n) ((_force_attr le64_t)(uint64_t)(n)) +# define le16_to_cpu(n) ((_force_attr uint16_t)(le16_t)(n)) +# define le32_to_cpu(n) ((_force_attr uint32_t)(le32_t)(n)) +# define le64_to_cpu(n) ((_force_attr uint64_t)(le64_t)(n)) +# define cpu_to_be16(n) ((_force_attr be16_t)bswap16(n)) +# define cpu_to_be32(n) ((_force_attr be32_t)bswap32(n)) +# define cpu_to_be64(n) ((_force_attr be64_t)bswap64(n)) +# define be16_to_cpu(n) bswap16((_force_attr uint16_t)(be16_t)(n)) +# define be32_to_cpu(n) bswap32((_force_attr uint32_t)(be32_t)(n)) +# define be64_to_cpu(n) bswap64((_force_attr uint64_t)(be64_t)(n)) +#endif + +#endif /* _NTFS_ENDIANS_H */ +#endif /* _LIBLZX_ENDIANNESS_H */ diff --git a/dlls/cabinet/liblzx_error.h b/dlls/cabinet/liblzx_error.h new file mode 100644 index 00000000000..cd984b61094 --- /dev/null +++ b/dlls/cabinet/liblzx_error.h @@ -0,0 +1,11 @@ +#ifndef __LIBLZX_ERROR_H__ +#define __LIBLZX_ERROR_H__ + +enum liblzx_error { + LIBLZX_ERR_NONE = 0, + + LIBLZX_ERR_NOMEM = -1, + LIBLZX_ERR_INVALID_PARAM = -2, +}; + +#endif /* __LIBLZX_ERROR_H__ */ diff --git a/dlls/cabinet/liblzx_hc_matchfinder.h b/dlls/cabinet/liblzx_hc_matchfinder.h new file mode 100644 index 00000000000..31d659ab1fa --- /dev/null +++ b/dlls/cabinet/liblzx_hc_matchfinder.h @@ -0,0 +1,432 @@ +/* + * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. + * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * --------------------------------------------------------------------------- + * + * Notes on usage + * + * Before including this header, you must define 'mf_pos_t' to an integer type + * that can represent all possible positions. This can be a 16-bit or 32-bit + * unsigned integer. When possible, the former should be used due to the + * reduced cache pressure. This header can be included multiple times in a + * single .c file with different 'mf_pos_t' definitions; however, you must + * define a different MF_SUFFIX each time to generate different names for the + * matchfinder structure and functions. + * + * The number of bytes that must be allocated for a given 'struct + * hc_matchfinder' must be gotten by calling hc_matchfinder_size(). + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * + * The longest_match() and skip_bytes() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. 
These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task, and in + * combination they are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. + * + * ---------------------------------------------------------------------------- + */ + +#include <string.h> + +#include "liblzx_matchfinder_common.h" + +#define HC_MATCHFINDER_HASH3_ORDER 15 +#define HC_MATCHFINDER_HASH4_ORDER 16 + +/* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */ +#undef TEMPLATED +#define TEMPLATED(name) CONCAT(name, MF_SUFFIX) + +struct TEMPLATED(hc_matchfinder) { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ + mf_pos_t next_tab[]; +}; + +/* Return the number of bytes that must be allocated for a 'hc_matchfinder' that + * can work with buffers up to the specified size. */ +static attrib_forceinline size_t +TEMPLATED(hc_matchfinder_size)(size_t max_bufsize, bool streaming) +{ + const size_t streaming_mul = streaming ? 2 : 1; + + return sizeof(struct TEMPLATED(hc_matchfinder)) + + (max_bufsize * streaming_mul * sizeof(mf_pos_t)); +} + +/* Prepare the matchfinder for a new input buffer. */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) * mf, + size_t max_bufsize, bool streaming) +{ + memset(mf, 0xFF, TEMPLATED(hc_matchfinder_size)(max_bufsize, streaming)); +} + +/* The minimum lookahead, in bytes, required by hc_matchfinder_longest_match() + * and hc_matchfinder_skip_bytes(): there must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define HC_MATCHFINDER_REQUIRED_NBYTES 5 + +/* + * Find the longest match longer than 'best_len' bytes. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_min_pos + * The minimum position (relative to @in_begin) that is still considered + * valid for matches; list nodes referring to earlier positions are + * treated as invalid. + * @in_next + * Pointer to the next position in the input buffer, i.e. the sequence + * being matched against.
+ * @best_len + * Require a match longer than this length. + * @max_find_len + * The maximum match length that may be found at this position, i.e. the + * number of bytes that may safely be compared. + * @max_produce_len + * The maximum match length that may be returned; the result is capped to + * this value. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_find_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hash codes for + * the sequence beginning at @in_next + 1. + * @offset_ret + * If a match is found, its offset is returned in this location. + * + * Return the length of the match found (capped at @max_produce_len), or + * 'best_len' if no match longer than 'best_len' was found. + */ +static attrib_forceinline uint32_t +TEMPLATED(hc_matchfinder_longest_match)(struct TEMPLATED(hc_matchfinder) * const mf, + const uint8_t * const in_begin, + uint32_t in_min_pos, + const uint8_t * const in_next, + uint32_t best_len, + const uint32_t max_find_len, + const uint32_t max_produce_len, + const uint32_t nice_len, + const uint32_t max_search_depth, + uint32_t * const next_hashes, + uint32_t * const offset_ret) +{ + uint32_t depth_remaining = max_search_depth; + const uint8_t *best_matchptr = in_next; + mf_pos_t cur_node3, cur_node4; + uint32_t hash3, hash4; + uint32_t next_hashseq; + uint32_t seq4; + const uint8_t *matchptr; + uint32_t len; + uint32_t cur_pos = in_next - in_begin; + + /* Can we read 4 bytes from 'in_next + 1'? */ + if (unlikely(max_find_len < HC_MATCHFINDER_REQUIRED_NBYTES)) + goto out; + + /* Get the precomputed hash codes. */ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; + + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_hashseq = get_unaligned_le32(in_next + 1); + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node3, in_min_pos)) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_begin[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, in_min_pos)) + goto out; + + for (;;) { + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_begin[cur_node4]; + + if (load_u32_unaligned(matchptr) == seq4) + break; + + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } + + /* Found a match of length >= 4. Extend it to its full length.
*/ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 4, max_find_len); + if (best_len >= nice_len) + goto out; + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } else { + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + best_len >= nice_len) + goto out; + } + + /* Check for matches of length >= 5. */ + + for (;;) { + for (;;) { + matchptr = &in_begin[cur_node4]; + + /* Already found a length 4 match. Try for a longer + * match; start by checking either the last 4 bytes and + * the first 4 bytes, or the last byte. (The last byte, + * the one which would extend the match length by 1, is + * the most important.) */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } + + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif + len = lz_extend(in_next, matchptr, len, max_find_len); + if (len > best_len) { + /* This is the new longest match. */ + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + best_len = min_u32(best_len, max_produce_len); + if (best_len < 2) + best_len = 2; + + return best_len; +} + +/* + * Advance the matchfinder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_next + * Pointer to the next position in the input buffer. + * @in_end + * Pointer to the end of the input buffer. + * @count + * The number of bytes to advance. Must be > 0. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. 
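+ * + * (Usage sketch, not part of the original documentation: after choosing a + * match of length 'len' at some position, a greedy or lazy compressor + * typically calls this function with count = len - 1, so that the positions + * covered by the match are still inserted into the hash chains without any + * time being spent searching them.)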
+ */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_skip_bytes)(struct TEMPLATED(hc_matchfinder) * const mf, + const uint8_t * const in_begin, + const uint8_t *in_next, + const uint8_t * const in_end, + const uint32_t count, + uint32_t * const next_hashes) +{ + uint32_t cur_pos; + uint32_t hash3, hash4; + uint32_t next_hashseq; + uint32_t remaining = count; + + if (unlikely(count + HC_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next)) + return; + + cur_pos = in_next - in_begin; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + next_hashseq = get_unaligned_le32(++in_next); + hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; +} + +/* + * Culls any matches that are lower than a specified offset and reduces any + * remaining offsets by the same amount. + */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_cull)(struct TEMPLATED(hc_matchfinder) * mf, + uint32_t cull_size, uint32_t window_size) +{ + const size_t mf_count = + TEMPLATED(hc_matchfinder_size)(window_size, true) / + sizeof(mf_pos_t); + + TEMPLATED(matchfinder_rebase)((mf_pos_t *)mf, mf_count, cull_size); +} diff --git a/dlls/cabinet/liblzx_lzx_common.c b/dlls/cabinet/liblzx_lzx_common.c new file mode 100644 index 00000000000..d8619e0965f --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_common.c @@ -0,0 +1,325 @@ +/* + * lzx_common.c - Common code for LZX compression and decompression. + */ + +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2016 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "liblzx_minmax.h" + +#include <string.h> + +#ifdef __SSE2__ +# include <emmintrin.h> +#endif + +#ifdef __AVX2__ +# include <immintrin.h> +#endif + +#include "liblzx_bitops.h" +#include "liblzx_endianness.h" +#include "liblzx_lzx_common.h" +#include "liblzx_unaligned.h" +#include "liblzx_util.h" + +/* Mapping: offset slot => first match offset that uses that offset slot. + * The offset slots for repeat offsets map to "fake" offsets < 1. 
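+ * + * For example (illustrative): a match offset of 25 uses offset slot 9, since + * lzx_offset_slot_base[9] == 22 and lzx_offset_slot_base[10] == 30; + * lzx_extra_offset_bits[9] == 3, so the remainder 25 - 22 = 3 is coded in 3 + * extra bits following the slot.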
*/ +const int32_t lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { + -2 , -1 , 0 , 1 , 2 , /* 0 --- 4 */ + 4 , 6 , 10 , 14 , 22 , /* 5 --- 9 */ + 30 , 46 , 62 , 94 , 126 , /* 10 --- 14 */ + 190 , 254 , 382 , 510 , 766 , /* 15 --- 19 */ + 1022 , 1534 , 2046 , 3070 , 4094 , /* 20 --- 24 */ + 6142 , 8190 , 12286 , 16382 , 24574 , /* 25 --- 29 */ + 32766 , 49150 , 65534 , 98302 , 131070 , /* 30 --- 34 */ + 196606 , 262142 , 393214 , 524286 , 655358 , /* 35 --- 39 */ + 786430 , 917502 , 1048574, 1179646, 1310718, /* 40 --- 44 */ + 1441790, 1572862, 1703934, 1835006, 1966078, /* 45 --- 49 */ + 2097150 /* extra */ +}; + +/* Mapping: offset slot => how many extra bits must be read and added to the + * corresponding offset slot base to decode the match offset. */ +const uint8_t lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { + 0 , 0 , 0 , 0 , 1 , + 1 , 2 , 2 , 3 , 3 , + 4 , 4 , 5 , 5 , 6 , + 6 , 7 , 7 , 8 , 8 , + 9 , 9 , 10, 10, 11, + 11, 12, 12, 13, 13, + 14, 14, 15, 15, 16, + 16, 17, 17, 17, 17, + 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, +}; + +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ +unsigned +lzx_get_window_order(size_t max_bufsize) +{ + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) + return 0; + + return max_uint(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER); +} + +/* Given a valid LZX window order, return the number of symbols that will exist + * in the main Huffman code. */ +unsigned +lzx_get_num_main_syms(unsigned window_order) +{ + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1. */ + uint32_t window_size = (uint32_t)1 << window_order; + uint32_t max_offset = window_size - LZX_MIN_MATCH_LEN - 1; + unsigned num_offset_slots = 30; + while (max_offset >= (uint32_t)lzx_offset_slot_base[num_offset_slots]) + num_offset_slots++; + + return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS); +} + +static void +do_translate_target(void *target, int32_t input_pos, int32_t e8_file_size) +{ + int32_t abs_offset, rel_offset; + + rel_offset = get_unaligned_le32(target); + if (rel_offset >= -input_pos && rel_offset < e8_file_size) { + if (rel_offset < e8_file_size - input_pos) { + /* "good translation" */ + abs_offset = rel_offset + input_pos; + } else { + /* "compensating translation" */ + abs_offset = rel_offset - e8_file_size; + } + put_unaligned_le32(abs_offset, target); + } +} + +static void +undo_translate_target(void *target, int32_t input_pos, int32_t e8_file_size) +{ + int32_t abs_offset, rel_offset; + + abs_offset = get_unaligned_le32(target); + if (abs_offset >= 0) { + if (abs_offset < e8_file_size) { + /* "good translation" */ + rel_offset = abs_offset - input_pos; + put_unaligned_le32(rel_offset, target); + } + } else { + if (abs_offset >= -input_pos) { + /* "compensating translation" */ + rel_offset = abs_offset + e8_file_size; + put_unaligned_le32(rel_offset, target); + } + } +} + +/* + * Do or undo the 'E8' preprocessing used in LZX. Before compression, the + * uncompressed data is preprocessed by changing the targets of x86 CALL + * instructions from relative offsets to absolute offsets. 
After decompression, + * the translation is undone by changing the targets of x86 CALL instructions + * from absolute offsets to relative offsets. + * + * Note that despite its intent, E8 preprocessing can be done on any data even + * if it is not actually x86 machine code. In fact, E8 preprocessing appears to + * always be used in LZX-compressed resources in WIM files; there is no bit to + * indicate whether it is used or not, unlike in the LZX compressed format as + * used in cabinet files, where a bit is reserved for that purpose. + * + * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data, + * which really means the 5-byte call instruction cannot start in the last 10 + * bytes of the uncompressed data. This is one of the errors in the LZX + * documentation. + * + * E8 preprocessing does not appear to be disabled after the 32768th chunk of a + * WIM resource, which apparently is another difference from the LZX compression + * used in cabinet files. + * + * E8 processing is supposed to take the file size as a parameter, as it is used + * in calculating the translated jump targets. But in WIM files, this file size + * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000). + */ +static void +lzx_e8_filter(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size, + void (*process_target)(void *, int32_t, int32_t)) +{ + +#if !defined(__SSE2__) && !defined(__AVX2__) + uint8_t *tail; + uint8_t *p; + + if (size <= LZX_E8_FILTER_TAIL_SIZE) + return; + + tail = &data[size - LZX_E8_FILTER_TAIL_SIZE]; + p = data; + while (p < tail) { + if (*p != 0xE8) { + p++; + continue; + } + + (*process_target)(p + 1, (int32_t)(p - data + chunk_offset), + e8_file_size); + p += 5; + } +#else + /* SSE2 or AVX-2 optimized version for x86_64 */ + + uint8_t *p = data; + uint64_t valid_mask = ~0; + + if (size <= LZX_E8_FILTER_TAIL_SIZE) + return; +#ifdef __AVX2__ +# define ALIGNMENT_REQUIRED 32 +#else +# define ALIGNMENT_REQUIRED 16 +#endif + + /* Process one byte at a time until the pointer is properly aligned. */ + while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { + if (p >= data + size - LZX_E8_FILTER_TAIL_SIZE) + return; + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data + chunk_offset, + e8_file_size); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (uint64_t)1 << 63; + } + + if (data + size - p >= 64) { + + /* Vectorized processing */ + + /* Note: we use a "trap" E8 byte to eliminate the need to check + * for end-of-buffer in the inner loop. This byte is carefully + * positioned so that it will never be changed by a previous + * translation before it is detected. */ + + uint8_t *trap = p + ((data + size - p) & ~31) - 32 + 4; + uint8_t saved_byte = *trap; + *trap = 0xE8; + + for (;;) { + uint32_t e8_mask; + uint8_t *orig_p = p; + #ifdef __AVX2__ + const __m256i e8_bytes = _mm256_set1_epi8(0xE8); + for (;;) { + __m256i bytes = *(const __m256i *)p; + __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); + e8_mask = _mm256_movemask_epi8(cmpresult); + if (e8_mask) + break; + p += 32; + } + #else + const __m128i e8_bytes = _mm_set1_epi8(0xE8); + for (;;) { + /* Read the next 32 bytes of data and test them + * for E8 bytes. 
__m128i bytes1 = *(const __m128i *)p; + __m128i bytes2 = *(const __m128i *)(p + 16); + __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); + __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); + uint32_t mask1 = _mm_movemask_epi8(cmpresult1); + uint32_t mask2 = _mm_movemask_epi8(cmpresult2); + /* The masks have a bit set for each E8 byte. + * We stay in this fast inner loop as long as + * there are no E8 bytes. */ + if (mask1 | mask2) { + e8_mask = mask1 | (mask2 << 16); + break; + } + p += 32; + } + #endif + + /* Did we pass over data with no E8 bytes? */ + if (p != orig_p) + valid_mask = ~0; + + /* Are we nearing end-of-buffer? */ + if (p == trap - 4) + break; + + /* Process the E8 bytes. However, the AND with + * 'valid_mask' ensures we never process an E8 byte that + * was itself part of a translation target. */ + while ((e8_mask &= valid_mask)) { + unsigned bit = bsf32(e8_mask); + (*process_target)(p + bit + 1, + p + bit - data + chunk_offset, + e8_file_size); + valid_mask &= ~((uint64_t)0x1F << bit); + } + + valid_mask >>= 32; + valid_mask |= 0xFFFFFFFF00000000; + p += 32; + } + + *trap = saved_byte; + } + + /* Approaching the end of the buffer; process one byte at a time. */ + while (p < data + size - LZX_E8_FILTER_TAIL_SIZE) { + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data + chunk_offset, + e8_file_size); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (uint64_t)1 << 63; + } +#endif /* __SSE2__ || __AVX2__ */ +} + +void +lzx_preprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size) +{ + lzx_e8_filter(data, size, chunk_offset, e8_file_size, + do_translate_target); +} + +void +lzx_postprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size) +{ + lzx_e8_filter(data, size, chunk_offset, e8_file_size, + undo_translate_target); +} diff --git a/dlls/cabinet/liblzx_lzx_common.h b/dlls/cabinet/liblzx_lzx_common.h new file mode 100644 index 00000000000..d6fd20b095c --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_common.h @@ -0,0 +1,29 @@ +/* + * lzx_common.h + * + * Declarations shared between LZX compression and decompression. + */ + +#ifndef _LZX_COMMON_H +#define _LZX_COMMON_H + +#include "liblzx_lzx_constants.h" +#include "liblzx_types.h" + +extern const int32_t lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1]; + +extern const uint8_t lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS]; + +unsigned +lzx_get_window_order(size_t max_bufsize); + +unsigned +lzx_get_num_main_syms(unsigned window_order); + +void +lzx_preprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size); + +void +lzx_postprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size); + +#endif /* _LZX_COMMON_H */ diff --git a/dlls/cabinet/liblzx_lzx_compress.c b/dlls/cabinet/liblzx_lzx_compress.c new file mode 100644 index 00000000000..f26629d5fdb --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_compress.c @@ -0,0 +1,3662 @@ +/* + * lzx_compress.c + * + * A compressor for the LZX compression format, as used in WIM archives. + */ + +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version.
+ * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + + +/* + * This file contains a compressor for the LZX ("Lempel-Ziv eXtended") + * compression format, as used in the WIM (Windows IMaging) file format. + * + * Two different LZX-compatible algorithms are implemented: "near-optimal" and + * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a + * better compression ratio. The "near-optimal" algorithm is used at the + * default compression level. + * + * This file may need some slight modifications to be used outside of the WIM + * format. In particular, in other situations the LZX block header might be + * slightly different, and sliding window support might be required. + * + * LZX is a compression format derived from DEFLATE, the format used by zlib and + * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain + * details are quite similar, such as the method for storing Huffman codes. + * However, the main differences are: + * + * - LZX preprocesses the data to attempt to make x86 machine code slightly more + * compressible before attempting to compress it further. + * + * - LZX uses a "main" alphabet which combines literals and matches, with the + * match symbols containing a "length header" (giving all or part of the match + * length) and an "offset slot" (giving, roughly speaking, the order of + * magnitude of the match offset). + * + * - LZX does not have static Huffman blocks (that is, the kind with preset + * Huffman codes); however it does have two types of dynamic Huffman blocks + * ("verbatim" and "aligned"). + * + * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be + * useful, but generally only if the compressor is smart about choosing them. + * + * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue + * of match offsets. This is very useful for certain types of files, such as + * binary files that have repeating records. + */ + +/******************************************************************************/ +/* General parameters */ +/*----------------------------------------------------------------------------*/ + +/* + * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It + * uses the slower algorithm at levels > MAX_FAST_LEVEL. + */ +#define MAX_FAST_LEVEL 34 + +/* + * The compressor-side limits on the codeword lengths (in bits) for each Huffman + * code. To make outputting bits slightly faster, some of these limits are + * lower than the limits defined by the LZX format. This does not significantly + * affect the compression ratio. + */ +#define MAIN_CODEWORD_LIMIT 16 +#define LENGTH_CODEWORD_LIMIT 12 +#define ALIGNED_CODEWORD_LIMIT 7 +#define PRE_CODEWORD_LIMIT 7 + +/******************************************************************************/ +/* Block splitting parameters */ +/*----------------------------------------------------------------------------*/ + +/* + * The compressor always outputs blocks of at least this size in bytes, except + * for the last block which may need to be smaller. 
*/ +#define MIN_BLOCK_SIZE 6500 + +/* + * The compressor attempts to end a block when it reaches this size in bytes. + * The final size might be slightly larger due to matches extending beyond the + * end of the block. Specifically: + * + * - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN + * bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'. + * + * - The lazy compressor may choose a sequence of literals starting at position + * 'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better + * matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The + * length of the literal sequence is approximately limited by the "nice match + * length" parameter. + */ +#define SOFT_MAX_BLOCK_SIZE 100000 + +/* + * The number of observed items (matches and literals) that represents + * sufficient data for the compressor to decide whether the current block should + * be ended or not. + */ +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 400 + + +/******************************************************************************/ +/* Parameters for slower algorithm */ +/*----------------------------------------------------------------------------*/ + +/* + * The log base 2 of the number of entries in the hash table for finding length + * 2 matches. This could be as high as 16, but using a smaller hash table + * speeds up compression due to reduced cache pressure. + */ +#define BT_MATCHFINDER_HASH2_ORDER 12 + +/* + * The number of lz_match structures in the match cache, excluding the extra + * "overflow" entries. This value should be high enough so that nearly all the + * time, all matches found in a given block can fit in the match cache. + * However, fallback behavior (immediately terminating the block) on cache + * overflow is still required. + */ +#define CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5) + +/* + * An upper bound on the number of matches that can ever be saved in the match + * cache for a single position. Since each match we save for a single position + * has a distinct length, we can use the number of possible match lengths in LZX + * as this bound. This bound is guaranteed to be valid in all cases, although + * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be + * reached. + */ +#define MAX_MATCHES_PER_POS LZX_NUM_LENS + +/* + * A scaling factor that makes it possible to consider fractional bit costs. A + * single bit has a cost of BIT_COST. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. Ultimately, each token in LZX requires a whole number of bits to + * output. + */ +#define BIT_COST_BITS 6 +#define BIT_COST (1 << BIT_COST_BITS) + +/* + * Should the compressor take into account the costs of aligned offset symbols + * instead of assuming that all are equally likely? + */ +#define CONSIDER_ALIGNED_COSTS 1 + +/* + * Should the minimum-cost path search algorithm consider "gap" matches, where + * a normal match is followed by a literal, then by a match with the same + * offset? This is one specific, somewhat common situation in which the true + * minimum cost path is often different from the path found by looking only one + * edge ahead.
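+ * + * As an illustrative example: in data like "abcdefgh...abcXefgh", the second + * occurrence is naturally coded as a match for "abc", the literal 'X', and + * then a match for "efgh" at the *same* offset, which can be coded cheaply as + * a repeat offset match. A search that looks only one edge ahead tends to + * miss this cheaper encoding.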
*/ +#define CONSIDER_GAP_MATCHES 1 + +/******************************************************************************/ +/* Includes */ +/*----------------------------------------------------------------------------*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "liblzx_compress_common.h" +#include "liblzx_error.h" +#include "liblzx_lzx_common.h" +#include "liblzx_minmax.h" +#include "liblzx_unaligned.h" +#include "liblzx_util.h" +#include "liblzx.h" + +#include <assert.h> + +#include <malloc.h> + +/* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including + * bt_matchfinder.h. */ + +/* Matchfinders with 16-bit positions */ +#define mf_pos_t uint16_t +#define MF_SUFFIX _16 +#define MF_INVALID_POS (0xFFFFu) +#include "liblzx_bt_matchfinder.h" +#include "liblzx_hc_matchfinder.h" + +/* Matchfinders with 32-bit positions */ +#undef mf_pos_t +#undef MF_SUFFIX +#undef MF_INVALID_POS +#define mf_pos_t uint32_t +#define MF_SUFFIX _32 +#define MF_INVALID_POS (0xFFFFFFFFu) +#include "liblzx_bt_matchfinder.h" +#include "liblzx_hc_matchfinder.h" + +#undef mf_pos_t +#undef MF_SUFFIX +#undef MF_INVALID_POS + +/******************************************************************************/ +/* Compressor structure */ +/*----------------------------------------------------------------------------*/ + +/* Codewords for the Huffman codes */ +struct lzx_codewords { + uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + uint32_t len[LZX_LENCODE_NUM_SYMBOLS]; + uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* + * Codeword lengths, in bits, for the Huffman codes. + * + * A codeword length of 0 means the corresponding codeword has zero frequency. + * + * The main and length codes each have one extra entry for use as a sentinel. + * See lzx_write_compressed_code(). + */ +struct lzx_lens { + uint8_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1]; + uint8_t len[LZX_LENCODE_NUM_SYMBOLS + 1]; + uint8_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* Codewords and lengths for the Huffman codes */ +struct lzx_codes { + struct lzx_codewords codewords; + struct lzx_lens lens; +}; + +/* Symbol frequency counters for the Huffman-encoded alphabets */ +struct lzx_freqs { + uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + uint32_t len[LZX_LENCODE_NUM_SYMBOLS]; + uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* Block split statistics. See the "Block splitting algorithm" section later in + * this file for details. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ + NUM_MATCH_OBSERVATION_TYPES) +struct lzx_block_split_stats { + uint32_t new_observations[NUM_OBSERVATION_TYPES]; + uint32_t observations[NUM_OBSERVATION_TYPES]; + uint32_t num_new_observations; + uint32_t num_observations; +}; + +/* + * Represents a run of literals followed by a match or end-of-block. This + * structure is needed to temporarily store items chosen by the compressor, + * since items cannot be written until all items for the block have been chosen + * and the block's Huffman codes have been computed. + */ +struct attrib_aligned(8) lzx_sequence { + + /* + * Bits 9..31: the number of literals in this run. This may be 0 and + * can be at most about SOFT_MAX_BLOCK_SIZE. The literals are not + * stored explicitly in this structure; instead, they are read directly + * from the uncompressed data.
+ * + * Bits 0..8: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. This can be at most LZX_MAX_MATCH_LEN. + */ + uint32_t litrunlen_and_matchlen; +#define SEQ_MATCHLEN_BITS 9 +#define SEQ_MATCHLEN_MASK (((uint32_t)1 << SEQ_MATCHLEN_BITS) - 1) + + /* + * If 'matchlen' doesn't indicate end-of-block, then this contains: + * + * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent + * offset code, depending on the offset slot encoded in the main symbol. + * + * Bits 0..9: the main symbol. + */ + uint32_t adjusted_offset_and_mainsym; +#define SEQ_MAINSYM_BITS 10 +#define SEQ_MAINSYM_MASK (((uint32_t)1 << SEQ_MAINSYM_BITS) - 1) +}; + +/* + * This structure represents a byte position in the input buffer and a node in + * the graph of possible match/literal choices. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + */ +struct attrib_aligned(8) lzx_optimum_node { + + /* The cost, in bits, of the lowest-cost path that has been found to + * reach this position. This can change as progressively lower cost + * paths are found to reach this position. */ + uint32_t cost; + + /* + * The best arrival to this node, i.e. the match or literal that was + * used to arrive to this position at the given 'cost'. This can change + * as progressively lower cost paths are found to reach this position. + * + * For non-gap matches, this variable is divided into two bitfields + * whose meanings depend on the item type: + * + * Literals: + * Low bits are 0, high bits are the literal. + * + * Explicit offset matches: + * Low bits are the match length, high bits are the offset plus + * LZX_OFFSET_ADJUSTMENT. + * + * Repeat offset matches: + * Low bits are the match length, high bits are the queue index. + * + * For gap matches, identified by OPTIMUM_GAP_MATCH set, special + * behavior applies --- see the code. + */ + uint32_t item; +#define OPTIMUM_OFFSET_SHIFT SEQ_MATCHLEN_BITS +#define OPTIMUM_LEN_MASK SEQ_MATCHLEN_MASK +#if CONSIDER_GAP_MATCHES +# define OPTIMUM_GAP_MATCH 0x80000000 +#endif + +}; + +/* The cost model for near-optimal parsing */ +struct lzx_costs { + + /* + * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a + * length 'len' match which has an offset belonging to 'offset_slot'. + * The cost includes the main symbol, the length symbol if required, and + * the extra offset bits if any, excluding any entropy-coded bits + * (aligned offset bits). It does *not* include the cost of the aligned + * offset symbol which may be required. 
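+ * + * (Illustrative example: for a length 3 match in offset slot 4, + * 'match_cost[4][3 - LZX_MIN_MATCH_LEN]' is the cost of the corresponding + * main symbol plus BIT_COST for the single extra offset bit, since + * lzx_extra_offset_bits[4] == 1 and costs are expressed in BIT_COST units; + * no length symbol is needed for so short a match.)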
+ */
+	uint16_t match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
+
+	/* Cost of each symbol in the main code */
+	uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
+
+	/* Cost of each symbol in the length code */
+	uint32_t len[LZX_LENCODE_NUM_SYMBOLS];
+
+#if CONSIDER_ALIGNED_COSTS
+	/* Cost of each symbol in the aligned offset code */
+	uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
+#endif
+};
+
+struct lzx_output_bitstream;
+
+/* The main LZX compressor structure */
+struct liblzx_compressor {
+
+	/* The LZX variant to use */
+	enum liblzx_variant variant;
+
+	/* Output chunk */
+	liblzx_output_chunk_t out_chunk;
+
+	/* True if a flush was requested */
+	bool flushing;
+
+	/* Memory allocation function */
+	liblzx_alloc_func_t alloc_func;
+
+	/* Memory free function */
+	liblzx_free_func_t free_func;
+
+	/* Memory allocation userdata */
+	void *alloc_userdata;
+
+	/* True if the compressor is outputting the first block */
+	bool first_block;
+
+	/* E8 preprocessor file size */
+	uint32_t e8_file_size;
+
+	/* E8 preprocessor chunk offset */
+	uint32_t e8_chunk_offset;
+
+	/* The buffer for preprocessed input data, if not using destructive
+	 * compression */
+	void *in_buffer;
+
+	/* The buffer for output data */
+	void *out_buffer;
+
+	/* Capacity of in_buffer */
+	uint32_t in_buffer_capacity;
+
+	/* Capacity of out_buffer */
+	uint32_t out_buffer_capacity;
+
+	/* Number of prefix bytes currently in in_buffer */
+	uint32_t in_prefix_size;
+
+	/* Number of bytes currently in in_buffer */
+	uint32_t in_used;
+
+	/* Maximum size of a chunk */
+	uint32_t chunk_size;
+
+	/* Pointer to the reset() implementation chosen at allocation time */
+	void (*reset)(struct liblzx_compressor *);
+
+	/* Pointer to the compress() implementation chosen at allocation time */
+	void (*impl)(struct liblzx_compressor *, const uint8_t *, size_t, size_t,
+		     struct lzx_output_bitstream *);
+
+	/* Pointer to the cull() implementation chosen at allocation time */
+	void (*cull)(struct liblzx_compressor *, size_t);
+
+	/* The window size. */
+	uint32_t window_size;
+
+	/* The log base 2 of the window size for match offset encoding purposes.
+	 * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
+	unsigned window_order;
+
+	/* The number of symbols in the main alphabet. This depends on the
+	 * window order, since the window order determines the maximum possible
+	 * match offset. */
+	unsigned num_main_syms;
+
+	/* The "nice" match length: if a match of this length is found, then it
+	 * is chosen immediately without further consideration. */
+	unsigned nice_match_length;
+
+	/* The maximum search depth: at most this many potential matches are
+	 * considered at each position. */
+	unsigned max_search_depth;
+
+	/* The number of optimization passes per block */
+	unsigned num_optim_passes;
+
+	/* The symbol frequency counters for the current block */
+	struct lzx_freqs freqs;
+
+	/* Block split statistics for the current block */
+	struct lzx_block_split_stats split_stats;
+
+	/* The Huffman codes for the current and previous blocks. The one with
+	 * index 'codes_index' is for the current block, and the other one is
+	 * for the previous block. */
+	struct lzx_codes codes[2];
+	unsigned codes_index;
+
+	/* The matches and literals that the compressor has chosen for the
+	 * current block. The required length of this array is limited by the
+	 * maximum number of matches that can ever be chosen for a single block,
+	 * plus one for the special entry at the end.
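+	 * (Worst-case reasoning: every match consumes at least
+	 * LZX_MIN_MATCH_LEN input bytes, so no more than
+	 * DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) sequences can be
+	 * chosen, plus the sentinel entry.)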
*/ + struct lzx_sequence chosen_sequences[ + DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1]; + + /* Least-recently-used match queue */ + uint32_t lru_queue[LZX_NUM_RECENT_OFFSETS]; + + /* Next hashes */ + uint32_t next_hashes[2]; + + /* Tables for mapping adjusted offsets to offset slots */ + uint8_t offset_slot_tab_1[32768]; /* offset slots [0, 29] */ + uint8_t offset_slot_tab_2[128]; /* offset slots [30, 49] */ + + union { + /* Data for lzx_compress_lazy() */ + struct { + /* Hash chains matchfinder (MUST BE LAST!!!) */ + union { + struct hc_matchfinder_16 hc_mf_16; + struct hc_matchfinder_32 hc_mf_32; + }; + }; + + /* Data for lzx_compress_near_optimal() */ + struct { + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if the + * compressor finds a match of length LZX_MAX_MATCH_LEN + * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a + * block of size 'SOFT_MAX_BLOCK_SIZE - 1 + + * LZX_MAX_MATCH_LEN'. Add one for the end-of-block + * node. + */ + struct lzx_optimum_node optimum_nodes[ + SOFT_MAX_BLOCK_SIZE - 1 + + LZX_MAX_MATCH_LEN + 1]; + + /* The cost model for the current optimization pass */ + struct lzx_costs costs; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a special 'struct lz_match' whose 'length' field + * contains the number of matches that were found at + * that position; this is followed by the matches + * themselves, if any, sorted by strictly increasing + * length. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at &match_cache[CACHE_LENGTH + * - 1], we write the match count header, then write + * MAX_MATCHES_PER_POS matches, then skip searching for + * matches at 'LZX_MAX_MATCH_LEN - 1' positions and + * write the match count header for each. + */ + struct lz_match match_cache[CACHE_LENGTH + + MAX_MATCHES_PER_POS + + LZX_MAX_MATCH_LEN - 1]; + + /* Binary trees matchfinder (MUST BE LAST!!!) */ + union { + struct bt_matchfinder_16 bt_mf_16; + struct bt_matchfinder_32 bt_mf_32; + }; + }; + }; +}; + +/******************************************************************************/ +/* Matchfinder utilities */ +/*----------------------------------------------------------------------------*/ + +/* + * Will a matchfinder using 16-bit positions be sufficient for compressing + * buffers of up to the specified size? The limit could be 65536 bytes, but we + * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case. + * This requires that the limit be no more than the length of offset_slot_tab_1 + * (currently 32768). + */ +static attrib_forceinline bool +lzx_is_16_bit(size_t max_bufsize) +{ + STATIC_ASSERT(ARRAY_LEN(((struct liblzx_compressor *)0)->offset_slot_tab_1) == 32768); + return max_bufsize <= 32768; +} + +/* + * Return the offset slot for the specified adjusted match offset. 
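+ *
+ * (Lookup example, for illustration: with the table split used here, a small
+ * adjusted offset such as 100 resolves via offset_slot_tab_1[100], while a
+ * large one such as 0x40000 resolves via offset_slot_tab_2[0x40000 >> 14].)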
+ */ +static attrib_forceinline unsigned +lzx_get_offset_slot(struct liblzx_compressor *c, uint32_t adjusted_offset, + bool is_16_bit) +{ + if (__builtin_constant_p(adjusted_offset) && + adjusted_offset < LZX_NUM_RECENT_OFFSETS) + return adjusted_offset; + if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1)) + return c->offset_slot_tab_1[adjusted_offset]; + + assert((adjusted_offset >> 14) < ARRAY_LEN(c->offset_slot_tab_2)); + + return c->offset_slot_tab_2[adjusted_offset >> 14]; +} + +/* + * For a match that has the specified length and adjusted offset, tally its main + * symbol, and if needed its length symbol; then return its main symbol. + */ +static attrib_forceinline unsigned +lzx_tally_main_and_lensyms(struct liblzx_compressor *c, unsigned length, + uint32_t adjusted_offset, bool is_16_bit) +{ + unsigned mainsym; + + if (length >= LZX_MIN_SECONDARY_LEN) { + /* Length symbol needed */ + c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++; + mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS; + } else { + /* No length symbol needed */ + mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN; + } + + mainsym += LZX_NUM_LEN_HEADERS * + lzx_get_offset_slot(c, adjusted_offset, is_16_bit); + c->freqs.main[mainsym]++; + return mainsym; +} + +/* + * The following macros call either the 16-bit or the 32-bit version of a + * matchfinder function based on the value of 'is_16_bit', which will be known + * at compilation time. + */ + +#define CALL_HC_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__)); + +#define CALL_BT_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__)); + +/******************************************************************************/ +/* Output bitstream */ +/*----------------------------------------------------------------------------*/ + +/* + * The LZX bitstream is encoded as a sequence of little endian 16-bit coding + * units. Bits are ordered from most significant to least significant within + * each coding unit. + */ + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct lzx_output_bitstream { + + /* Bits that haven't yet been written to the output buffer */ + machine_word_t bitbuf; + + /* Number of bits currently held in @bitbuf */ + machine_word_t bitcount; + + /* Pointer to the start of the output buffer */ + uint8_t *start; + + /* Pointer to the position in the output buffer at which the next coding + * unit should be written */ + uint8_t *next; + + /* Pointer to just past the end of the output buffer, rounded down by + * one byte if needed to make 'end - start' a multiple of 2 */ + uint8_t *end; +}; + +/* Can the specified number of bits always be added to 'bitbuf' after all + * pending 16-bit coding units have been flushed? */ +#define CAN_BUFFER(n) ((n) <= WORDBITS - 15) + +/* Initialize the output bitstream to write to the specified buffer. */ +static void +lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next = buffer; + os->end = (uint8_t *)buffer + (size & ~1); +} + +/* + * Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. 
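+ *
+ * (For illustration: lzx_add_bits(os, 0x5, 3) shifts 'bitbuf' left by 3 bits
+ * and ORs in binary 101, and 'bitcount' grows by 3.)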
+ */ +static attrib_forceinline void +lzx_add_bits(struct lzx_output_bitstream *os, uint32_t bits, unsigned num_bits) +{ + os->bitbuf = (os->bitbuf << num_bits) | bits; + os->bitcount += num_bits; +} + +/* + * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits' + * specifies the maximum number of bits that may have been added since the last + * flush. + */ +static attrib_forceinline void +lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) +{ + /* Masking the number of bits to shift is only needed to avoid undefined + * behavior; we don't actually care about the results of bad shifts. On + * x86, the explicit masking generates no extra code. */ + const uint32_t shift_mask = WORDBITS - 1; + + if (os->end - os->next < 6) + return; + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) & + shift_mask), os->next + 0); + if (max_num_bits > 16) + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) & + shift_mask), os->next + 2); + if (max_num_bits > 32) + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) & + shift_mask), os->next + 4); + os->next += (os->bitcount >> 4) << 1; + os->bitcount &= 15; +} + +/* Add at most 16 bits to the bitbuffer and flush it. */ +static attrib_forceinline void +lzx_write_bits(struct lzx_output_bitstream *os, uint32_t bits, unsigned num_bits) +{ + lzx_add_bits(os, bits, num_bits); + lzx_flush_bits(os, 16); +} + +/* + * Flush the last coding unit to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static size_t +lzx_flush_output(struct lzx_output_bitstream *os) +{ + if (os->end - os->next < 6) + return 0; + + if (os->bitcount != 0) { + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); + os->next += 2; + } + + return os->next - os->start; +} + +/******************************************************************************/ +/* Preparing Huffman codes */ +/*----------------------------------------------------------------------------*/ + +/* + * Build the Huffman codes. This takes as input the frequency tables for each + * code and produces as output a set of tables that map symbols to codewords and + * codeword lengths. + */ +static void +lzx_build_huffman_codes(struct liblzx_compressor *c) +{ + const struct lzx_freqs *freqs = &c->freqs; + struct lzx_codes *codes = &c->codes[c->codes_index]; + + STATIC_ASSERT_STMT(MAIN_CODEWORD_LIMIT >= 9 && + MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN); + make_canonical_huffman_code(c->num_main_syms, + MAIN_CODEWORD_LIMIT, + freqs->main, + codes->lens.main, + codes->codewords.main); + + STATIC_ASSERT_STMT(LENGTH_CODEWORD_LIMIT >= 8 && + LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN); + make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS, + LENGTH_CODEWORD_LIMIT, + freqs->len, + codes->lens.len, + codes->codewords.len); + + STATIC_ASSERT_STMT( + ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS && + ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN); + make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS, + ALIGNED_CODEWORD_LIMIT, + freqs->aligned, + codes->lens.aligned, + codes->codewords.aligned); +} + +/* Reset the symbol frequencies for the current block. 
*/ +static void +lzx_reset_symbol_frequencies(struct liblzx_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + +static unsigned +lzx_compute_precode_items(const uint8_t * restrict lens, + const uint8_t * restrict prev_lens, + uint32_t * restrict precode_freqs, + unsigned * restrict precode_items) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + int delta; + uint8_t len; + + itemptr = precode_items; + run_start = 0; + + while (!((len = lens[run_start]) & 0x80)) { + + /* len = the length being repeated */ + + /* Find the next run of codeword lengths. */ + + run_end = run_start + 1; + + /* Fast case for a single length. */ + if (likely(len != lens[run_end])) { + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[delta]++; + *itemptr++ = delta; + run_start++; + continue; + } + + /* Extend the run. */ + do { + run_end++; + } while (len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 20 to 51 zeroes at a time. */ + while ((run_end - run_start) >= 20) { + extra_bits = + min_uint((run_end - run_start) - 20, 0x1F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 20 + extra_bits; + } + + /* Symbol 17: RLE 4 to 19 zeroes at a time. */ + if ((run_end - run_start) >= 4) { + extra_bits = + min_uint((run_end - run_start) - 4, 0xF); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 4 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 19: RLE 4 to 5 of any length at a time. */ + while ((run_end - run_start) >= 4) { + extra_bits = (run_end - run_start) > 4; + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[19]++; + precode_freqs[delta]++; + *itemptr++ = 19 | (extra_bits << 5) | (delta << 6); + run_start += 4 + extra_bits; + } + } + + /* Output any remaining lengths without RLE. */ + while (run_start != run_end) { + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[delta]++; + *itemptr++ = delta; + run_start++; + } + } + + return itemptr - precode_items; +} + +/******************************************************************************/ +/* Outputting compressed data */ +/*----------------------------------------------------------------------------*/ + +/* + * Output a Huffman code in the compressed form used in LZX. + * + * The Huffman code is represented in the output as a logical series of codeword + * lengths from which the Huffman code, which must be in canonical form, can be + * reconstructed. + * + * The codeword lengths are themselves compressed using a separate Huffman code, + * the "precode", which contains a symbol for each possible codeword length in + * the larger code as well as several special symbols to represent repeated + * codeword lengths (a form of run-length encoding). The precode is itself + * constructed in canonical form, and its codeword lengths are represented + * literally in 20 4-bit fields that immediately precede the compressed codeword + * lengths of the larger code. + * + * Furthermore, the codeword lengths of the larger code are actually represented + * as deltas from the codeword lengths of the corresponding code in the previous + * block. + * + * @os: + * Bitstream to which to write the compressed Huffman code. + * @lens: + * The codeword lengths, indexed by symbol, in the Huffman code. 
+ * @prev_lens: + * The codeword lengths, indexed by symbol, in the corresponding Huffman + * code in the previous block, or all zeroes if this is the first block. + * @num_lens: + * The number of symbols in the Huffman code. + */ +static void +lzx_write_compressed_code(struct lzx_output_bitstream *os, + const uint8_t * restrict lens, + const uint8_t * restrict prev_lens, + unsigned num_lens) +{ + uint32_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS]; + uint8_t precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + uint32_t precode_codewords[LZX_PRECODE_NUM_SYMBOLS]; + unsigned *precode_items = (unsigned *)alloca(sizeof(unsigned) * num_lens); + unsigned num_precode_items; + unsigned precode_item; + unsigned precode_sym; + unsigned i; + uint8_t saved = lens[num_lens]; + *(uint8_t *)(lens + num_lens) = 0x80; + + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) + precode_freqs[i] = 0; + + /* Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. */ + num_precode_items = lzx_compute_precode_items(lens, + prev_lens, + precode_freqs, + precode_items); + + /* Build the precode. */ + STATIC_ASSERT_STMT(PRE_CODEWORD_LIMIT >= 5 && + PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN); + make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT, + precode_freqs, precode_lens, + precode_codewords); + + /* Output the lengths of the codewords in the precode. */ + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) + lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE); + + /* Output the encoded lengths of the codewords in the larger code. */ + for (i = 0; i < num_precode_items; i++) { + precode_item = precode_items[i]; + precode_sym = precode_item & 0x1F; + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + if (precode_sym >= 17) { + if (precode_sym == 17) { + lzx_add_bits(os, precode_item >> 5, 4); + } else if (precode_sym == 18) { + lzx_add_bits(os, precode_item >> 5, 5); + } else { + lzx_add_bits(os, (precode_item >> 5) & 1, 1); + precode_sym = precode_item >> 6; + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + } + } + STATIC_ASSERT_STMT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1)); + lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1); + } + + *(uint8_t *)(lens + num_lens) = saved; +} + +/* + * Write all matches and literal bytes (which were precomputed) in an LZX + * compressed block to the output bitstream in the final compressed + * representation. + * + * @os + * The output bitstream. + * @block_type + * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or + * LZX_BLOCKTYPE_VERBATIM). + * @block_data + * The uncompressed data of the block. + * @sequences + * The matches and literals to output, given as a series of sequences. + * @codes + * The main, length, and aligned offset Huffman codes for the block. + */ +static void +lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, + const uint8_t *block_data, const struct lzx_sequence sequences[], + const struct lzx_codes *codes) +{ + const struct lzx_sequence *seq = sequences; + unsigned min_aligned_offset_slot; + + if (block_type == LZX_BLOCKTYPE_ALIGNED) + min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT; + else + min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS; + + for (;;) { + /* Output the next sequence. 
*/
+
+		uint32_t litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
+		unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
+		STATIC_ASSERT((uint32_t)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
+			      SOFT_MAX_BLOCK_SIZE);
+		uint32_t adjusted_offset;
+		unsigned main_symbol;
+		unsigned offset_slot;
+		unsigned num_extra_bits;
+		uint32_t extra_bits;
+
+		/* Output the literal run of the sequence. */
+
+		if (litrunlen) { /* Is the literal run nonempty? */
+
+			/* Verify optimization is enabled on 64-bit */
+			STATIC_ASSERT(WORDBITS < 64 ||
+				      CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
+
+			if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
+
+				/* 64-bit: write 3 literals at a time. */
+				while (litrunlen >= 3) {
+					unsigned lit0 = block_data[0];
+					unsigned lit1 = block_data[1];
+					unsigned lit2 = block_data[2];
+					lzx_add_bits(os, codes->codewords.main[lit0],
+						     codes->lens.main[lit0]);
+					lzx_add_bits(os, codes->codewords.main[lit1],
+						     codes->lens.main[lit1]);
+					lzx_add_bits(os, codes->codewords.main[lit2],
+						     codes->lens.main[lit2]);
+					lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
+					block_data += 3;
+					litrunlen -= 3;
+				}
+				if (litrunlen--) {
+					unsigned lit = *block_data++;
+					lzx_add_bits(os, codes->codewords.main[lit],
+						     codes->lens.main[lit]);
+					if (litrunlen--) {
+						unsigned lit = *block_data++;
+						lzx_add_bits(os, codes->codewords.main[lit],
+							     codes->lens.main[lit]);
+						lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
+					} else {
+						lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
+					}
+				}
+			} else {
+				/* 32-bit: write 1 literal at a time. */
+				do {
+					unsigned lit = *block_data++;
+					lzx_add_bits(os, codes->codewords.main[lit],
+						     codes->lens.main[lit]);
+					lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
+				} while (--litrunlen);
+			}
+		}
+
+		/* Was this the last literal run? */
+		if (matchlen == 0)
+			return;
+
+		/* Nope; output the match. */
+
+		block_data += matchlen;
+
+		adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
+		main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
+
+		offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
+		num_extra_bits = lzx_extra_offset_bits[offset_slot];
+		extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
+						LZX_OFFSET_ADJUSTMENT);
+
+	#define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + \
+				LENGTH_CODEWORD_LIMIT + \
+				LZX_MAX_NUM_EXTRA_BITS - \
+				LZX_NUM_ALIGNED_OFFSET_BITS + \
+				ALIGNED_CODEWORD_LIMIT)
+
+		/* Verify optimization is enabled on 64-bit */
+		STATIC_ASSERT_STMT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
+
+		/* Output the main symbol for the match. */
+
+		lzx_add_bits(os, codes->codewords.main[main_symbol],
+			     codes->lens.main[main_symbol]);
+		if (!CAN_BUFFER(MAX_MATCH_BITS))
+			lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
+
+		/* If needed, output the length symbol for the match. */
+
+		if (matchlen >= LZX_MIN_SECONDARY_LEN) {
+			lzx_add_bits(os, codes->codewords.len[matchlen -
+							      LZX_MIN_SECONDARY_LEN],
+				     codes->lens.len[matchlen -
+						     LZX_MIN_SECONDARY_LEN]);
+			if (!CAN_BUFFER(MAX_MATCH_BITS))
+				lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
+		}
+
+		/* Output the extra offset bits for the match. In aligned
+		 * offset blocks, the lowest 3 bits of the adjusted offset are
+		 * Huffman-encoded using the aligned offset code, provided that
+		 * at least 3 extra offset bits are required. All other
+		 * extra offset bits are output verbatim.
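+		 * (For illustration: an offset slot with 4 extra bits sends
+		 * the top 1 bit verbatim and the low 3 bits through the
+		 * aligned offset code.)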
*/ + + if (offset_slot >= min_aligned_offset_slot) { + + lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS, + num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS - + LZX_NUM_ALIGNED_OFFSET_BITS); + + lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT); + } else { + STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS)); + + lzx_add_bits(os, extra_bits, num_extra_bits); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS); + } + + if (CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, MAX_MATCH_BITS); + + /* Advance to the next sequence. */ + seq++; + } +} + +static void +lzx_write_header(uint32_t e8_file_size, struct lzx_output_bitstream *os) +{ + if (e8_file_size == 0) { + lzx_write_bits(os, 0, 1); + } else { + lzx_write_bits(os, 1, 1); + lzx_write_bits(os, (e8_file_size >> 16) & 0xffffu, 16); + lzx_write_bits(os, e8_file_size & 0xffffu, 16); + } +} + +static void +lzx_write_compressed_block(const uint8_t *block_begin, + int block_type, + uint32_t block_size, + enum liblzx_variant variant, + unsigned window_order, + unsigned num_main_syms, + const struct lzx_sequence sequences[], + const struct lzx_codes * codes, + const struct lzx_lens * prev_lens, + struct lzx_output_bitstream * os) +{ + /* The first three bits indicate the type of block and are one of the + * LZX_BLOCKTYPE_* constants. */ + lzx_write_bits(os, block_type, 3); + + /* + * Output the block size. + * + * The original LZX format encoded the block size in 24 bits. However, + * the LZX format used in WIM archives uses 1 bit to specify whether the + * block has the default size of 32768 bytes, then optionally 16 bits to + * specify a non-default size. This works fine for Microsoft's WIM + * software (WIMGAPI), which never compresses more than 32768 bytes at a + * time with LZX. However, as an extension, our LZX compressor supports + * compressing up to 2097152 bytes, with a corresponding increase in + * window size. It is possible for blocks in these larger buffers to + * exceed 65535 bytes; such blocks cannot have their size represented in + * 16 bits. + * + * The chosen solution was to use 24 bits for the block size when + * possibly required --- specifically, when the compressor has been + * allocated to be capable of compressing more than 32768 bytes at once + * (which also causes the number of main symbols to be increased). + */ + if (variant == LIBLZX_VARIANT_WIM) { + if (block_size == LZX_DEFAULT_BLOCK_SIZE) { + lzx_write_bits(os, 1, 1); + } else { + lzx_write_bits(os, 0, 1); + + if (window_order >= 16) + lzx_write_bits(os, block_size >> 16, 8); + + lzx_write_bits(os, block_size & 0xFFFF, 16); + } + } else { + lzx_write_bits(os, block_size >> 16, 8); + lzx_write_bits(os, block_size & 0xFFFF, 16); + } + + /* If it's an aligned offset block, output the aligned offset code. */ + if (block_type == LZX_BLOCKTYPE_ALIGNED) { + for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + lzx_write_bits(os, codes->lens.aligned[i], + LZX_ALIGNEDCODE_ELEMENT_SIZE); + } + } + + /* Output the main code (two parts). 
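+	 * (The first part covers the LZX_NUM_CHARS literal symbols; the second
+	 * covers the remaining match header symbols, as in the two calls
+	 * below.)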
*/ + lzx_write_compressed_code(os, codes->lens.main, + prev_lens->main, + LZX_NUM_CHARS); + lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS, + prev_lens->main + LZX_NUM_CHARS, + num_main_syms - LZX_NUM_CHARS); + + /* Output the length code. */ + lzx_write_compressed_code(os, codes->lens.len, + prev_lens->len, + LZX_LENCODE_NUM_SYMBOLS); + + /* Output the compressed matches and literals. */ + lzx_write_sequences(os, block_type, block_begin, sequences, codes); +} + +/* + * Given the frequencies of symbols in an LZX-compressed block and the + * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or + * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively, + * will take fewer bits to output. + */ +static int +lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs, + const struct lzx_codes * codes) +{ + uint32_t verbatim_cost = 0; + uint32_t aligned_cost = 0; + + /* A verbatim block requires 3 bits in each place that an aligned offset + * symbol would be used in an aligned offset block. */ + for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i]; + aligned_cost += codes->lens.aligned[i] * freqs->aligned[i]; + } + + /* Account for the cost of sending the codeword lengths of the aligned + * offset code. */ + aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * + LZX_ALIGNEDCODE_NUM_SYMBOLS; + + if (aligned_cost < verbatim_cost) + return LZX_BLOCKTYPE_ALIGNED; + else + return LZX_BLOCKTYPE_VERBATIM; +} + +/* + * Flush an LZX block: + * + * 1. Build the Huffman codes. + * 2. Decide whether to output the block as VERBATIM or ALIGNED. + * 3. Write the block. + * 4. Swap the indices of the current and previous Huffman codes. + * + * Note: we never output UNCOMPRESSED blocks. This probably should be + * implemented sometime, but it doesn't make much difference. + */ +static void +lzx_flush_block(struct liblzx_compressor *c, struct lzx_output_bitstream *os, + const uint8_t *block_begin, uint32_t block_size, uint32_t seq_idx) +{ + int block_type; + + lzx_build_huffman_codes(c); + + block_type = lzx_choose_verbatim_or_aligned(&c->freqs, + &c->codes[c->codes_index]); + + if (c->variant != LIBLZX_VARIANT_WIM) { + if (c->first_block) { + lzx_write_header(c->e8_file_size, os); + c->first_block = false; + } + } + + lzx_write_compressed_block(block_begin, + block_type, + block_size, + c->variant, + c->window_order, + c->num_main_syms, + &c->chosen_sequences[seq_idx], + &c->codes[c->codes_index], + &c->codes[c->codes_index ^ 1].lens, + os); + c->codes_index ^= 1; +} + +/******************************************************************************/ +/* Block splitting algorithm */ +/*----------------------------------------------------------------------------*/ + +/* + * The problem of block splitting is to decide when it is worthwhile to start a + * new block with new entropy codes. There is a theoretically optimal solution: + * recursively consider every possible block split, considering the exact cost + * of each block, and choose the minimum cost approach. But this is far too + * slow. Instead, as an approximation, we can count symbols and after every N + * symbols, compare the expected distribution of symbols based on the previous + * data with the actual distribution. If they differ "by enough", then start a + * new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". 
For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not. The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough.
+ *
+ * Finally, since this is only an approximation, it is not strictly necessary
+ * to consider the exact symbols that will be used. With "near-optimal
+ * parsing", for example, the actual symbols that will be used are unknown
+ * until after the block boundary is chosen and the block has been optimized.
+ * Since the final choices cannot be used, we can use preliminary "greedy"
+ * choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+}
+
+/* Literal observation. Heuristic: use the top 2 bits and the low bit of the
+ * literal, for 8 possible literal observation types. */
+static attrib_forceinline void
+lzx_observe_literal(struct lzx_block_split_stats *stats, uint8_t lit)
+{
+	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+	stats->num_new_observations++;
+}
+
+/* Match observation. Heuristic: use one observation type for "short match" and
+ * one observation type for "long match". */
+static attrib_forceinline void
+lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
+{
+	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
+	stats->num_new_observations++;
+}
+
+static bool
+lzx_should_end_block(struct lzx_block_split_stats *stats)
+{
+	if (stats->num_observations > 0) {
+
+		/* Note: to avoid slow divisions, we do not divide by
+		 * 'num_observations', but rather do all math with the numbers
+		 * multiplied by 'num_observations'. */
+		uint32_t total_delta = 0;
+		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+			uint32_t expected = stats->observations[i] *
+					    stats->num_new_observations;
+			uint32_t actual = stats->new_observations[i] *
+					  stats->num_observations;
+			uint32_t delta = (actual > expected) ? actual - expected :
+							       expected - actual;
+			total_delta += delta;
+		}
+
+		/* Ready to end the block? */
+		if (total_delta >=
+		    stats->num_new_observations * 7 / 8 * stats->num_observations)
+			return true;
+	}
+
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->num_observations += stats->new_observations[i];
+		stats->observations[i] += stats->new_observations[i];
+		stats->new_observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	return false;
+}
+
+/******************************************************************************/
+/* Slower ("near-optimal") compression algorithm */
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Least-recently-used queue for match offsets.
+ *
+ * This is represented as a 64-bit integer for efficiency. There are three
+ * offsets of 21 bits each. Bit 64 is garbage.
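+ *
+ * (Packing example, for illustration: R0=1, R1=2, R2=3 would be stored as
+ * ((uint64_t)3 << 42) | ((uint64_t)2 << 21) | 1.)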
+ */ +struct attrib_aligned(8) lzx_lru_queue { + uint64_t R; +}; + +#define LZX_QUEUE_OFFSET_SHIFT 21 +#define LZX_QUEUE_OFFSET_MASK (((uint64_t)1 << LZX_QUEUE_OFFSET_SHIFT) - 1) + +#define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT) +#define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT) +#define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT) + +#define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT) +#define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT) +#define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT) + +static attrib_forceinline uint64_t +lzx_lru_queue_R0(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline uint64_t +lzx_lru_queue_R1(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline uint64_t +lzx_lru_queue_R2(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline void +lzx_lru_queue_save(uint32_t * restrict out_queue, + const struct lzx_lru_queue * restrict in_queue) +{ + struct lzx_lru_queue queue = *in_queue; + out_queue[0] = lzx_lru_queue_R0(queue); + out_queue[1] = lzx_lru_queue_R1(queue); + out_queue[2] = lzx_lru_queue_R2(queue); +} + +static attrib_forceinline void +lzx_lru_queue_load(struct lzx_lru_queue *restrict out_queue, + const uint32_t *restrict in_queue) +{ + uint64_t r = 0; + r |= (uint64_t)(in_queue[0]) << LZX_QUEUE_R0_SHIFT; + r |= (uint64_t)(in_queue[1]) << LZX_QUEUE_R1_SHIFT; + r |= (uint64_t)(in_queue[2]) << LZX_QUEUE_R2_SHIFT; + out_queue->R = r; +} + +/* Push a match offset onto the front (most recently used) end of the queue. */ +static attrib_forceinline struct lzx_lru_queue +lzx_lru_queue_push(struct lzx_lru_queue queue, uint32_t offset) +{ + return (struct lzx_lru_queue) { + .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset, + }; +} + +/* Swap a match offset to the front of the queue. */ +static attrib_forceinline struct lzx_lru_queue +lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx) +{ + unsigned shift = idx * 21; + const uint64_t mask = LZX_QUEUE_R0_MASK; + const uint64_t mask_high = mask << shift; + + return (struct lzx_lru_queue) { + (queue.R & ~(mask | mask_high)) | + ((queue.R & mask_high) >> shift) | + ((queue.R & mask) << shift) + }; +} + +static attrib_forceinline uint32_t +lzx_walk_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit, + bool record) +{ + struct lzx_sequence *seq = + &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1]; + uint32_t node_idx = block_size; + uint32_t litrun_end; /* if record=true: end of the current literal run */ + + if (record) { + /* The last sequence has matchlen 0 */ + seq->litrunlen_and_matchlen = 0; + litrun_end = node_idx; + } + + for (;;) { + uint32_t item; + unsigned matchlen; + uint32_t adjusted_offset; + unsigned mainsym; + + /* Tally literals until either a match or the beginning of the + * block is reached. Note: the item in the node at the + * beginning of the block (c->optimum_nodes[0]) has all bits + * set, causing this loop to end when it is reached. */ + for (;;) { + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } + + #if CONSIDER_GAP_MATCHES + if (item & OPTIMUM_GAP_MATCH) { + if (node_idx == 0) + break; + /* Tally/record the rep0 match after the gap. 
*/ + matchlen = item & OPTIMUM_LEN_MASK; + mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0, + is_16_bit); + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << + SEQ_MATCHLEN_BITS; + seq--; + seq->litrunlen_and_matchlen = matchlen; + seq->adjusted_offset_and_mainsym = mainsym; + litrun_end = node_idx - matchlen; + } + + /* Tally the literal in the gap. */ + c->freqs.main[(uint8_t)(item >> OPTIMUM_OFFSET_SHIFT)]++; + + /* Fall through and tally the match before the gap. + * (It was temporarily saved in the 'cost' field of the + * previous node, which was free to reuse.) */ + item = c->optimum_nodes[--node_idx].cost; + node_idx -= matchlen; + } + #else /* CONSIDER_GAP_MATCHES */ + if (node_idx == 0) + break; + #endif /* !CONSIDER_GAP_MATCHES */ + + /* Tally/record a match. */ + matchlen = item & OPTIMUM_LEN_MASK; + adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT; + mainsym = lzx_tally_main_and_lensyms(c, matchlen, + adjusted_offset, + is_16_bit); + if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + + LZX_OFFSET_ADJUSTMENT) + c->freqs.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]++; + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << SEQ_MATCHLEN_BITS; + seq--; + seq->litrunlen_and_matchlen = matchlen; + seq->adjusted_offset_and_mainsym = + (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym; + litrun_end = node_idx - matchlen; + } + node_idx -= matchlen; + } + + /* Record the literal run length for the first sequence. */ + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << SEQ_MATCHLEN_BITS; + } + + /* Return the index in chosen_sequences at which the sequences begin. */ + return seq - &c->chosen_sequences[0]; +} + +/* + * Given the minimum-cost path computed through the item graph for the current + * block, walk the path and count how many of each symbol in each Huffman-coded + * alphabet would be required to output the items (matches and literals) along + * the path. + * + * Note that the path will be walked backwards (from the end of the block to the + * beginning of the block), but this doesn't matter because this function only + * computes frequencies. + */ +static attrib_forceinline void +lzx_tally_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit) +{ + lzx_walk_item_list(c, block_size, is_16_bit, false); +} + +/* + * Like lzx_tally_item_list(), but this function also generates the list of + * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences, + * ready to be output to the bitstream after the Huffman codes are computed. + * The lzx_sequences will be written to decreasing memory addresses as the path + * is walked backwards, which means they will end up in the expected + * first-to-last order. The return value is the index in c->chosen_sequences at + * which the lzx_sequences begin. + */ +static attrib_forceinline uint32_t +lzx_record_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit) +{ + return lzx_walk_item_list(c, block_size, is_16_bit, true); +} + +/* + * Find an inexpensive path through the graph of possible match/literal choices + * for the current block. The nodes of the graph are + * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in + * the current block, plus one extra node for end-of-block. The edges of the + * graph are matches and literals. The goal is to find the minimum cost path + * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost + * model 'c->costs'. 
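+ * (Informally, each step relaxes
+ * cost[i + len] = min(cost[i + len], cost[i] + item_cost),
+ * over the items of length 'len' considered at position 'i', as described
+ * below.)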
+ * + * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and + * proceeding forwards one node at a time. At each node, a selection of matches + * (len >= 2), as well as the literal byte (len = 1), is considered. An item of + * length 'len' provides a new path to reach the node 'len' bytes later. If + * such a path is the lowest cost found so far to reach that later node, then + * that later node is updated with the new cost and the "arrival" which provided + * that cost. + * + * Note that although this algorithm is based on minimum cost path search, due + * to various simplifying assumptions the result is not guaranteed to be the + * true minimum cost, or "optimal", path over the graph of all valid LZX + * representations of this block. + * + * Also, note that because of the presence of the recent offsets queue (which is + * a type of adaptive state), the algorithm cannot work backwards and compute + * "cost to end" instead of "cost to beginning". Furthermore, the way the + * algorithm handles this adaptive state in the "minimum cost" parse is actually + * only an approximation. It's possible for the globally optimal, minimum cost + * path to contain a prefix, ending at a position, where that path prefix is + * *not* the minimum cost path to that position. This can happen if such a path + * prefix results in a different adaptive state which results in lower costs + * later. The algorithm does not solve this problem in general; it only looks + * one step ahead, with the exception of special consideration for "gap + * matches". + */ +static attrib_forceinline struct lzx_lru_queue +lzx_find_min_cost_path(struct liblzx_compressor * const restrict c, + const uint8_t * const restrict block_begin, + const uint32_t block_size, + const struct lzx_lru_queue initial_queue, + bool is_16_bit) +{ + struct lzx_optimum_node *cur_node = c->optimum_nodes; + struct lzx_optimum_node * const end_node = cur_node + block_size; + struct lz_match *cache_ptr = c->match_cache; + const uint8_t *in_next = block_begin; + const uint8_t * const block_end = block_begin + block_size; + + /* + * Instead of storing the match offset LRU queues in the + * 'lzx_optimum_node' structures, we save memory (and cache lines) by + * storing them in a smaller array. This works because the algorithm + * only requires a limited history of the adaptive state. Once a given + * state is more than LZX_MAX_MATCH_LEN bytes behind the current node + * (more if gap match consideration is enabled; we just round up to 512 + * so it's a power of 2), it is no longer needed. + * + * The QUEUE() macro finds the queue for the given node. This macro has + * been optimized by taking advantage of 'struct lzx_lru_queue' and + * 'struct lzx_optimum_node' both being 8 bytes in size and alignment. + */ + struct lzx_lru_queue queues[512]; + STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1); + STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0])); +#define QUEUE(node) \ + (*(struct lzx_lru_queue *)((char *)queues + \ + ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0]))))) + /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/ + +#if CONSIDER_GAP_MATCHES + uint32_t matches_before_gap[ARRAY_LEN(queues)]; +#define MATCH_BEFORE_GAP(node) \ + (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \ + ARRAY_LEN(matches_before_gap)]) +#endif + + /* + * Initially, the cost to reach each node is "infinity". 
+	 *
+	 * The first node actually should have cost 0, but "infinity"
+	 * (0xFFFFFFFF) works just as well because it immediately overflows.
+	 *
+	 * The following statement also intentionally sets the 'item' of the
+	 * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
+	 * use as a sentinel. See lzx_walk_item_list().
+	 */
+	memset(c->optimum_nodes, 0xFF,
+	       (block_size + 1) * sizeof(c->optimum_nodes[0]));
+
+	/* Initialize the recent offsets queue for the first node. */
+	QUEUE(cur_node) = initial_queue;
+
+	do { /* For each node in the block in position order... */
+
+		unsigned num_matches;
+		unsigned literal;
+		uint32_t cost;
+
+		/*
+		 * A selection of matches for the block was already saved in
+		 * memory so that we don't have to run the uncompressed data
+		 * through the matchfinder on every optimization pass. However,
+		 * we still search for repeat offset matches during each
+		 * optimization pass because we cannot predict the state of the
+		 * recent offsets queue. But as a heuristic, we don't bother
+		 * searching for repeat offset matches if the general-purpose
+		 * matchfinder failed to find any matches.
+		 *
+		 * Note that a match of length n at some offset implies there is
+		 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
+		 * that same offset. In other words, we don't necessarily need
+		 * to use the full length of a match. The key heuristic that
+		 * saves a significant amount of time is that for each
+		 * distinct length, we only consider the smallest offset for
+		 * which that length is available. This heuristic also applies
+		 * to repeat offsets, which we order specially: R0 < R1 < R2 <
+		 * any explicit offset. Of course, this heuristic may
+		 * produce suboptimal results because offset slots in LZX are
+		 * subject to entropy encoding, but in practice this is a useful
+		 * heuristic.
+		 */
+
+		num_matches = cache_ptr->length;
+		cache_ptr++;
+
+		if (num_matches) {
+			struct lz_match *end_matches = cache_ptr + num_matches;
+			unsigned next_len = LZX_MIN_MATCH_LEN;
+			unsigned max_len =
+				min_uint(block_end - in_next, LZX_MAX_MATCH_LEN);
+			const uint8_t *matchptr;
+
+			/* Consider rep0 matches. */
+			matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
+			if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
+				goto rep0_done;
+			STATIC_ASSERT_STMT(LZX_MIN_MATCH_LEN == 2);
+			do {
+				uint32_t cost = cur_node->cost +
+						c->costs.match_cost[0][
+							next_len - LZX_MIN_MATCH_LEN];
+				if (cost <= (cur_node + next_len)->cost) {
+					(cur_node + next_len)->cost = cost;
+					(cur_node + next_len)->item =
+						(0 << OPTIMUM_OFFSET_SHIFT) | next_len;
+				}
+				if (unlikely(++next_len > max_len)) {
+					cache_ptr = end_matches;
+					goto done_matches;
+				}
+			} while (in_next[next_len - 1] == matchptr[next_len - 1]);
+
+		rep0_done:
+
+			/* Consider rep1 matches.
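+			 * (next_len carries over from the rep0 search, so a
+			 * rep1 match is only costed where it matches at least
+			 * as far as the lengths already considered.)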
*/ + matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node)); + if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next)) + goto rep1_done; + if (matchptr[next_len - 1] != in_next[next_len - 1]) + goto rep1_done; + for (unsigned len = 2; len < next_len - 1; len++) + if (matchptr[len] != in_next[len]) + goto rep1_done; + do { + uint32_t cost = cur_node->cost + + c->costs.match_cost[1][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost <= (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (1 << OPTIMUM_OFFSET_SHIFT) | next_len; + } + if (unlikely(++next_len > max_len)) { + cache_ptr = end_matches; + goto done_matches; + } + } while (in_next[next_len - 1] == matchptr[next_len - 1]); + + rep1_done: + + /* Consider rep2 matches. */ + matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node)); + if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next)) + goto rep2_done; + if (matchptr[next_len - 1] != in_next[next_len - 1]) + goto rep2_done; + for (unsigned len = 2; len < next_len - 1; len++) + if (matchptr[len] != in_next[len]) + goto rep2_done; + do { + uint32_t cost = cur_node->cost + + c->costs.match_cost[2][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost <= (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (2 << OPTIMUM_OFFSET_SHIFT) | next_len; + } + if (unlikely(++next_len > max_len)) { + cache_ptr = end_matches; + goto done_matches; + } + } while (in_next[next_len - 1] == matchptr[next_len - 1]); + + rep2_done: + + while (next_len > cache_ptr->length) + if (++cache_ptr == end_matches) + goto done_matches; + + /* Consider explicit offset matches. */ + for (;;) { + uint32_t offset = cache_ptr->offset; + uint32_t adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT; + unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit); + uint32_t base_cost = cur_node->cost; + uint32_t cost; + + #if CONSIDER_ALIGNED_COSTS + if (offset >= LZX_MIN_ALIGNED_OFFSET) + base_cost += c->costs.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]; + #endif + do { + cost = base_cost + + c->costs.match_cost[offset_slot][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost < (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len; + } + } while (++next_len <= cache_ptr->length); + + if (++cache_ptr == end_matches) { + #if CONSIDER_GAP_MATCHES + /* Also consider the longest explicit + * offset match as a "gap match": match + * + lit + rep0. 
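+					 * (That is: the full-length explicit
+					 * match, then the first mismatching
+					 * byte coded as a literal, then a rep0
+					 * match at the same offset resuming
+					 * right after it.)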
*/ + int32_t remaining = (block_end - in_next) - (int32_t)next_len; + if (likely(remaining >= 2)) { + const uint8_t *strptr = in_next + next_len; + const uint8_t *matchptr = strptr - offset; + if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) { + STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250); + STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap)); + unsigned limit = min_uint(remaining, + min_uint(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2, + LZX_MAX_MATCH_LEN)); + unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit); + uint8_t lit = strptr[-1]; + unsigned total_len = next_len + rep0_len; + cost += c->costs.main[lit] + + c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN]; + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = + OPTIMUM_GAP_MATCH | + ((uint32_t)lit << OPTIMUM_OFFSET_SHIFT) | + rep0_len; + MATCH_BEFORE_GAP(cur_node + total_len) = + (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | + (next_len - 1); + } + } + } + #endif /* CONSIDER_GAP_MATCHES */ + break; + } + } + } + + done_matches: + + /* Consider coding a literal. + + * To avoid an extra branch, actually checking the preferability + * of coding the literal is integrated into the queue update + * code below. */ + literal = *in_next++; + cost = cur_node->cost + c->costs.main[literal]; + + /* Advance to the next position. */ + cur_node++; + + /* The lowest-cost path to the current position is now known. + * Finalize the recent offsets queue that results from taking + * this lowest-cost path. */ + + if (cost <= cur_node->cost) { + /* Literal: queue remains unchanged. */ + cur_node->cost = cost; + cur_node->item = (uint32_t)literal << OPTIMUM_OFFSET_SHIFT; + QUEUE(cur_node) = QUEUE(cur_node - 1); + } else { + /* Match: queue update is needed. */ + unsigned len = cur_node->item & OPTIMUM_LEN_MASK; + #if CONSIDER_GAP_MATCHES + int32_t adjusted_offset = (int32_t)cur_node->item >> OPTIMUM_OFFSET_SHIFT; + STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */ + #else + uint32_t adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + #endif + + if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) { + /* Explicit offset match: insert offset at front. */ + QUEUE(cur_node) = + lzx_lru_queue_push(QUEUE(cur_node - len), + adjusted_offset - LZX_OFFSET_ADJUSTMENT); + } + #if CONSIDER_GAP_MATCHES + else if (adjusted_offset < 0) { + /* "Gap match": Explicit offset match, then a + * literal, then rep0 match. Save the explicit + * offset match information in the cost field of + * the previous node, which isn't needed + * anymore. Then insert the offset at the front + * of the queue. */ + uint32_t match_before_gap = MATCH_BEFORE_GAP(cur_node); + (cur_node - 1)->cost = match_before_gap; + QUEUE(cur_node) = + lzx_lru_queue_push(QUEUE(cur_node - len - 1 - + (match_before_gap & OPTIMUM_LEN_MASK)), + (match_before_gap >> OPTIMUM_OFFSET_SHIFT) - + LZX_OFFSET_ADJUSTMENT); + } + #endif + else { + /* Repeat offset match: swap offset to front. */ + QUEUE(cur_node) = + lzx_lru_queue_swap(QUEUE(cur_node - len), + adjusted_offset); + } + } + } while (cur_node != end_node); + + /* Return the recent offsets queue at the end of the path. */ + return QUEUE(cur_node); +} + +/* + * Given the costs for the main and length codewords (c->costs.main and + * c->costs.len), initialize the match cost array (c->costs.match_cost) which + * directly provides the cost of every possible (length, offset slot) pair. 
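+ *
+ * (Informally: match_cost[slot][len - LZX_MIN_MATCH_LEN] combines the main
+ * symbol cost, the length symbol cost when len >= LZX_MIN_SECONDARY_LEN, and
+ * the verbatim extra offset bits times BIT_COST, minus the aligned bits when
+ * CONSIDER_ALIGNED_COSTS accounts for those separately.)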
+ */ +static void +lzx_compute_match_costs(struct liblzx_compressor *c) +{ + unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) / + LZX_NUM_LEN_HEADERS; + struct lzx_costs *costs = &c->costs; + unsigned main_symbol = LZX_NUM_CHARS; + + for (unsigned offset_slot = 0; offset_slot < num_offset_slots; + offset_slot++) + { + uint32_t extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST; + unsigned i; + + #if CONSIDER_ALIGNED_COSTS + if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT) + extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST; + #endif + + for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) { + costs->match_cost[offset_slot][i] = + costs->main[main_symbol++] + extra_cost; + } + + extra_cost += costs->main[main_symbol++]; + + for (; i < LZX_NUM_LENS; i++) { + costs->match_cost[offset_slot][i] = + costs->len[i - LZX_NUM_PRIMARY_LENS] + + extra_cost; + } + } +} + +typedef struct fixed32frac_s { + uint32_t value; +} fixed32frac; + +typedef struct fixed32_s { + uint64_t value; +} fixed32; + +/* + * Fast approximation for log2f(x). This is not as accurate as the standard C + * version. It does not need to be perfectly accurate because it is only used + * for estimating symbol costs, which is very approximate anyway. + */ +struct log2_fixed_table_pair +{ + uint32_t multiplier; + uint32_t log_add; +}; + +static const struct log2_fixed_table_pair log2_fixed_table_0[255] = { + { 0xff01fc08u, 0x16f2dcfu }, { 0xfe05ee36u, 0x2dceffeu }, + { 0xfd0bd0beu, 0x4494959u }, { 0xfc139debu, 0x5b43ca6u }, + { 0xfb1d5020u, 0x71dcca2u }, { 0xfa28e1d4u, 0x885fc02u }, + { 0xf9364d94u, 0x9eccd73u }, { 0xf8458e02u, 0xb52439au }, + { 0xf7569dd6u, 0xcb66115u }, { 0xf66977dau, 0xe19287au }, + { 0xf57e16edu, 0xf7a9c58u }, { 0xf4947602u, 0x10dabf37u }, + { 0xf3ac901fu, 0x12399395u }, { 0xf2c6605bu, 0x13971beeu }, + { 0xf1e1e1e2u, 0x14f35ab3u }, { 0xf0ff0ff1u, 0x164e524fu }, + { 0xf01de5d7u, 0x17a80527u }, { 0xef3e5ef4u, 0x19007598u }, + { 0xee6076bau, 0x1a57a5f9u }, { 0xed8428aau, 0x1bad989cu }, + { 0xeca97059u, 0x1d024fc9u }, { 0xebd04968u, 0x1e55cdc7u }, + { 0xeaf8af8bu, 0x1fa814d2u }, { 0xea229e85u, 0x20f92722u }, + { 0xe94e1228u, 0x224906e7u }, { 0xe87b0655u, 0x2397b64fu }, + { 0xe7a976fdu, 0x24e5377du }, { 0xe6d9601du, 0x26318c93u }, + { 0xe60abdc3u, 0x277cb7acu }, { 0xe53d8c0bu, 0x28c6bad9u }, + { 0xe471c71du, 0x2a0f982cu }, { 0xe3a76b2fu, 0x2b5751aeu }, + { 0xe2de7486u, 0x2c9de963u }, { 0xe216df74u, 0x2de36147u }, + { 0xe150a854u, 0x2f27bb59u }, { 0xe08bcb94u, 0x306af989u }, + { 0xdfc845a9u, 0x31ad1dc8u }, { 0xdf061318u, 0x32ee29feu }, + { 0xde45306fu, 0x342e2014u }, { 0xdd859a4au, 0x356d01e8u }, + { 0xdcc74d51u, 0x36aad154u }, { 0xdc0a4635u, 0x37e79032u }, + { 0xdb4e81b5u, 0x39234051u }, { 0xda93fc99u, 0x3a5de380u }, + { 0xd9dab3b6u, 0x3b977b86u }, { 0xd922a3e9u, 0x3cd00a2au }, + { 0xd86bca1bu, 0x3e07912bu }, { 0xd7b62341u, 0x3f3e1241u }, + { 0xd701ac57u, 0x40738f27u }, { 0xd64e6266u, 0x41a8098eu }, + { 0xd59c427fu, 0x42db8323u }, { 0xd4eb49bcu, 0x440dfd94u }, + { 0xd43b7544u, 0x453f7a82u }, { 0xd38cc244u, 0x466ffb93u }, + { 0xd2df2df3u, 0x479f8265u }, { 0xd232b593u, 0x48ce108eu }, + { 0xd187566cu, 0x49fba7a9u }, { 0xd0dd0dd1u, 0x4b284946u }, + { 0xd033d91du, 0x4c53f6f4u }, { 0xcf8bb5b4u, 0x4d7eb23bu }, + { 0xcee4a102u, 0x4ea87ca4u }, { 0xce3e987au, 0x4fd157b4u }, + { 0xcd99999au, 0x50f944e7u }, { 0xccf5a1e5u, 0x522045bcu }, + { 0xcc52aee8u, 0x53465baau }, { 0xcbb0be38u, 0x546b8825u }, + { 0xcb0fcd6fu, 0x558fcca0u }, { 0xca6fda31u, 0x56b32a89u }, + { 0xc9d0e229u, 0x57d5a34au }, { 0xc932e309u, 
0x58f7384au }, + { 0xc895da89u, 0x5a17eaf0u }, { 0xc7f9c66bu, 0x5b37bc99u }, + { 0xc75ea476u, 0x5c56aea2u }, { 0xc6c47277u, 0x5d74c26au }, + { 0xc62b2e44u, 0x5e91f945u }, { 0xc592d5b8u, 0x5fae5488u }, + { 0xc4fb66b5u, 0x60c9d584u }, { 0xc464df24u, 0x61e47d87u }, + { 0xc3cf3cf4u, 0x62fe4dddu }, { 0xc33a7e1au, 0x641747cdu }, + { 0xc2a6a091u, 0x652f6c9eu }, { 0xc213a25cu, 0x6646bd8fu }, + { 0xc1818182u, 0x675d3be2u }, { 0xc0f03c0fu, 0x6872e8d5u }, + { 0xc05fd018u, 0x6987c59fu }, { 0xbfd03bb5u, 0x6a9bd379u }, + { 0xbf417d06u, 0x6baf1395u }, { 0xbeb3922eu, 0x6cc18729u }, + { 0xbe267957u, 0x6dd32f61u }, { 0xbd9a30b1u, 0x6ee40d69u }, + { 0xbd0eb670u, 0x6ff4226du }, { 0xbc8408cdu, 0x71036f95u }, + { 0xbbfa2609u, 0x7211f601u }, { 0xbb710c66u, 0x731fb6d9u }, + { 0xbae8ba2fu, 0x742cb339u }, { 0xba612db0u, 0x7538ec41u }, + { 0xb9da653eu, 0x7644630au }, { 0xb9545f30u, 0x774f18adu }, + { 0xb8cf19e3u, 0x78590e41u }, { 0xb84a93b8u, 0x796244d9u }, + { 0xb7c6cb15u, 0x7a6abd86u }, { 0xb743be65u, 0x7b727958u }, + { 0xb6c16c17u, 0x7c79795bu }, { 0xb63fd29du, 0x7d7fbe9eu }, + { 0xb5bef071u, 0x7e854a23u }, { 0xb53ec40eu, 0x7f8a1cf4u }, + { 0xb4bf4bf5u, 0x808e3815u }, { 0xb44086aau, 0x81919c88u }, + { 0xb3c272b6u, 0x82944b4cu }, { 0xb3450ea6u, 0x83964560u }, + { 0xb2c8590bu, 0x84978bc0u }, { 0xb24c507au, 0x85981f63u }, + { 0xb1d0f38cu, 0x86980143u }, { 0xb15640ddu, 0x87973255u }, + { 0xb0dc370eu, 0x8895b38du }, { 0xb062d4c3u, 0x899385ddu }, + { 0xafea18a4u, 0x8a90aa35u }, { 0xaf72015du, 0x8b8d2181u }, + { 0xaefa8d9eu, 0x8c88ecadu }, { 0xae83bc18u, 0x8d840ca6u }, + { 0xae0d8b83u, 0x8e7e8252u }, { 0xad97fa99u, 0x8f784e96u }, + { 0xad230816u, 0x90717259u }, { 0xacaeb2bbu, 0x9169ee7eu }, + { 0xac3af94cu, 0x9261c3e6u }, { 0xabc7da92u, 0x9358f36cu }, + { 0xab555555u, 0x944f7df3u }, { 0xaae36865u, 0x95456453u }, + { 0xaa721292u, 0x963aa768u }, { 0xaa0152b0u, 0x972f4809u }, + { 0xa9912796u, 0x9823470fu }, { 0xa9219020u, 0x9916a54au }, + { 0xa8b28b29u, 0x9a096393u }, { 0xa8441792u, 0x9afb82bau }, + { 0xa7d6343fu, 0x9bed038du }, { 0xa768e015u, 0x9cdde6ddu }, + { 0xa6fc19fdu, 0x9dce2d77u }, { 0xa68fe0e4u, 0x9ebdd823u }, + { 0xa62433b8u, 0x9face7adu }, { 0xa5b91169u, 0xa09b5cdfu }, + { 0xa54e78edu, 0xa189387du }, { 0xa4e46939u, 0xa2767b4fu }, + { 0xa47ae148u, 0xa3632616u }, { 0xa411e014u, 0xa44f3999u }, + { 0xa3a9649eu, 0xa53ab692u }, { 0xa3416de5u, 0xa6259dc7u }, + { 0xa2d9faeeu, 0xa70feff2u }, { 0xa2730abfu, 0xa7f9add0u }, + { 0xa20c9c60u, 0xa8e2d81du }, { 0xa1a6aedcu, 0xa9cb6f95u }, + { 0xa1414141u, 0xaab374eeu }, { 0xa0dc529fu, 0xab9ae8dfu }, + { 0xa077e207u, 0xac81cc1fu }, { 0xa013ee8fu, 0xad681f62u }, + { 0x9fb0774du, 0xae4de35au }, { 0x9f4d7b5au, 0xaf3318bau }, + { 0x9eeaf9d1u, 0xb017c033u }, { 0x9e88f1d0u, 0xb0fbda74u }, + { 0x9e276276u, 0xb1df682bu }, { 0x9dc64ae5u, 0xb2c26a06u }, + { 0x9d65aa42u, 0xb3a4e0acu }, { 0x9d057fb2u, 0xb486cccbu }, + { 0x9ca5ca5du, 0xb5682f0cu }, { 0x9c46896du, 0xb6490816u }, + { 0x9be7bc0eu, 0xb7295893u }, { 0x9b896170u, 0xb8092121u }, + { 0x9b2b78c1u, 0xb8e8626cu }, { 0x9ace0134u, 0xb9c71d13u }, + { 0x9a70f9fdu, 0xbaa551b9u }, { 0x9a146253u, 0xbb8300fdu }, + { 0x99b8396cu, 0xbc602b82u }, { 0x995c7e82u, 0xbd3cd1e6u }, + { 0x990130d1u, 0xbe18f4c6u }, { 0x98a64f97u, 0xbef494bdu }, + { 0x984bda13u, 0xbfcfb267u }, { 0x97f1cf85u, 0xc0aa4e5fu }, + { 0x97982f30u, 0xc184693fu }, { 0x973ef859u, 0xc25e039eu }, + { 0x96e62a46u, 0xc3371e12u }, { 0x968dc43fu, 0xc40fb932u }, + { 0x9635c58du, 0xc4e7d594u }, { 0x95de2d7cu, 0xc5bf73cau }, + { 0x9586fb58u, 0xc696946au }, { 0x95302e70u, 0xc76d3803u }, + { 
0x94d9c615u, 0xc8435f25u }, { 0x9483c197u, 0xc9190a64u }, + { 0x942e204au, 0xc9ee3a4eu }, { 0x93d8e182u, 0xcac2ef71u }, + { 0x93840497u, 0xcb972a58u }, { 0x932f88e0u, 0xcc6aeb90u }, + { 0x92db6db7u, 0xcd3e33a3u }, { 0x9287b275u, 0xce110320u }, + { 0x92345678u, 0xcee35a8du }, { 0x91e1591eu, 0xcfb53a70u }, + { 0x918eb9c5u, 0xd086a355u }, { 0x913c77ceu, 0xd15795c2u }, + { 0x90ea929bu, 0xd228123cu }, { 0x90990990u, 0xd2f81946u }, + { 0x9047dc12u, 0xd3c7ab65u }, { 0x8ff70986u, 0xd496c91du }, + { 0x8fa69154u, 0xd56572f1u }, { 0x8f5672e4u, 0xd633a963u }, + { 0x8f06ada2u, 0xd7016cf0u }, { 0x8eb740f9u, 0xd7cebe18u }, + { 0x8e682c54u, 0xd89b9d5fu }, { 0x8e196f23u, 0xd9680b3du }, + { 0x8dcb08d4u, 0xda340833u }, { 0x8d7cf8d8u, 0xdaff94bcu }, + { 0x8d2f3ea0u, 0xdbcab157u }, { 0x8ce1d9a0u, 0xdc955e7au }, + { 0x8c94c94cu, 0xdd5f9ca0u }, { 0x8c480d19u, 0xde296c45u }, + { 0x8bfba47eu, 0xdef2cddeu }, { 0x8baf8ef2u, 0xdfbbc1e5u }, + { 0x8b63cbeeu, 0xe08448d1u }, { 0x8b185aedu, 0xe14c6316u }, + { 0x8acd3b69u, 0xe214112du }, { 0x8a826cdeu, 0xe2db5389u }, + { 0x8a37eecau, 0xe3a22a9fu }, { 0x89edc0acu, 0xe46896deu }, + { 0x89a3e202u, 0xe52e98bfu }, { 0x895a524eu, 0xe5f430b0u }, + { 0x89111111u, 0xe6b95f22u }, { 0x88c81dceu, 0xe77e2486u }, + { 0x887f7808u, 0xe842814eu }, { 0x88371f45u, 0xe90675e5u }, + { 0x87ef130au, 0xe9ca02bcu }, { 0x87a752dfu, 0xea8d283du }, + { 0x875fde4au, 0xeb4fe6dau }, { 0x8718b4d4u, 0xec123effu }, + { 0x86d1d608u, 0xecd43114u }, { 0x868b4170u, 0xed95bd86u }, + { 0x8644f698u, 0xee56e4beu }, { 0x85fef50du, 0xef17a726u }, + { 0x85b93c5bu, 0xefd8052bu }, { 0x8573cc12u, 0xf097ff30u }, + { 0x852ea3c2u, 0xf157959cu }, { 0x84e9c2f9u, 0xf216c8ddu }, + { 0x84a5294au, 0xf2d59955u }, { 0x8460d647u, 0xf3940768u }, + { 0x841cc982u, 0xf4521381u }, { 0x83d90290u, 0xf50fbdffu }, + { 0x83958106u, 0xf5cd0747u }, { 0x83524478u, 0xf689efc0u }, + { 0x830f4c7eu, 0xf74677c9u }, { 0x82cc98afu, 0xf8029fc5u }, + { 0x828a28a2u, 0xf8be6819u }, { 0x8247fbf2u, 0xf979d120u }, + { 0x82061236u, 0xfa34db42u }, { 0x81c46b0bu, 0xfaef86d9u }, + { 0x8183060cu, 0xfba9d445u }, { 0x8141e2d4u, 0xfc63c3e8u }, + { 0x81010101u, 0xfd1d561du }, { 0x80c06030u, 0xfdd68b44u }, + { 0x80800000u, 0xfe8f63b9u }, +}; + +static const uint32_t log2_fixed_table_1[255] = { + 0x17152u, 0x2e2a3u, 0x453f2u, 0x5c540u, 0x7368du, 0x8a7d8u, + 0xa1921u, 0xb8a69u, 0xcfbb0u, 0xe6cf6u, 0xfde39u, 0x114f7cu, + 0x12c0bdu, 0x1431fcu, 0x15a33au, 0x171477u, 0x1885b2u, 0x19f6ecu, + 0x1b6824u, 0x1cd95bu, 0x1e4a91u, 0x1fbbc5u, 0x212cf7u, 0x229e28u, + 0x240f58u, 0x258086u, 0x26f1b3u, 0x2862deu, 0x29d408u, 0x2b4530u, + 0x2cb657u, 0x2e277du, 0x2f98a1u, 0x3109c4u, 0x327ae5u, 0x33ec05u, + 0x355d23u, 0x36ce40u, 0x383f5bu, 0x39b077u, 0x3b218fu, 0x3c92a6u, + 0x3e03bcu, 0x3f74d0u, 0x40e5e3u, 0x4256f4u, 0x43c804u, 0x453912u, + 0x46aa1fu, 0x481b2bu, 0x498c36u, 0x4afd3fu, 0x4c6e46u, 0x4ddf4cu, + 0x4f5050u, 0x50c153u, 0x523254u, 0x53a355u, 0x551454u, 0x568551u, + 0x57f64cu, 0x596747u, 0x5ad83fu, 0x5c4938u, 0x5dba2eu, 0x5f2b22u, + 0x609c15u, 0x620d06u, 0x637df8u, 0x64eee6u, 0x665fd3u, 0x67d0bfu, + 0x6941abu, 0x6ab293u, 0x6c237bu, 0x6d9461u, 0x6f0546u, 0x707629u, + 0x71e70bu, 0x7357ecu, 0x74c8cbu, 0x7639a8u, 0x77aa84u, 0x791b5fu, + 0x7a8c38u, 0x7bfd10u, 0x7d6de7u, 0x7edebbu, 0x804f8eu, 0x81c061u, + 0x833131u, 0x84a202u, 0x8612cfu, 0x87839au, 0x88f466u, 0x8a652fu, + 0x8bd5f8u, 0x8d46beu, 0x8eb784u, 0x902847u, 0x91990au, 0x9309cau, + 0x947a89u, 0x95eb47u, 0x975c03u, 0x98ccbfu, 0x9a3d79u, 0x9bae31u, + 0x9d1ee8u, 0x9e8f9cu, 0xa00051u, 0xa17103u, 0xa2e1b4u, 0xa45263u, + 0xa5c312u, 
0xa733bfu, 0xa8a469u, 0xaa1513u, 0xab85bcu, 0xacf662u, + 0xae6708u, 0xafd7adu, 0xb1484eu, 0xb2b8f0u, 0xb42990u, 0xb59a2du, + 0xb70acbu, 0xb87b67u, 0xb9ec01u, 0xbb5c98u, 0xbccd30u, 0xbe3dc6u, + 0xbfae5au, 0xc11eedu, 0xc28f7fu, 0xc4000eu, 0xc5709cu, 0xc6e12au, + 0xc851b5u, 0xc9c240u, 0xcb32c9u, 0xcca350u, 0xce13d6u, 0xcf845bu, + 0xd0f4deu, 0xd2655fu, 0xd3d5dfu, 0xd5465eu, 0xd6b6dbu, 0xd82757u, + 0xd997d2u, 0xdb084au, 0xdc78c2u, 0xdde938u, 0xdf59acu, 0xe0ca1fu, + 0xe23a91u, 0xe3ab01u, 0xe51b70u, 0xe68bdfu, 0xe7fc4au, 0xe96cb5u, + 0xeadd1du, 0xec4d85u, 0xedbdecu, 0xef2e51u, 0xf09eb4u, 0xf20f17u, + 0xf37f77u, 0xf4efd6u, 0xf66033u, 0xf7d090u, 0xf940eau, 0xfab144u, + 0xfc219cu, 0xfd91f2u, 0xff0248u, 0x100729bu, 0x101e2eeu, 0x103533eu, + 0x104c38eu, 0x10633dbu, 0x107a428u, 0x1091472u, 0x10a84bdu, 0x10bf504u, + 0x10d654bu, 0x10ed591u, 0x11045d4u, 0x111b617u, 0x1132658u, 0x1149697u, + 0x11606d6u, 0x1177713u, 0x118e74du, 0x11a5787u, 0x11bc7c0u, 0x11d37f7u, + 0x11ea82du, 0x1201860u, 0x1218892u, 0x122f8c4u, 0x12468f4u, 0x125d922u, + 0x127494fu, 0x128b97bu, 0x12a29a5u, 0x12b99ceu, 0x12d09f5u, 0x12e7a1bu, + 0x12fea3fu, 0x1315a62u, 0x132ca83u, 0x1343aa3u, 0x135aac1u, 0x1371adeu, + 0x1388afau, 0x139fb15u, 0x13b6b2eu, 0x13cdb45u, 0x13e4b5bu, 0x13fbb6fu, + 0x1412b83u, 0x1429b94u, 0x1440ba4u, 0x1457bb4u, 0x146ebc1u, 0x1485bccu, + 0x149cbd8u, 0x14b3be0u, 0x14cabe8u, 0x14e1beeu, 0x14f8bf2u, 0x150fbf6u, + 0x1526bf9u, 0x153dbf9u, 0x1554bf8u, 0x156bbf5u, 0x1582bf2u, 0x1599becu, + 0x15b0be6u, 0x15c7bdeu, 0x15debd3u, 0x15f5bc9u, 0x160cbbdu, 0x1623bafu, + 0x163ab9fu, 0x1651b8eu, 0x1668b7du, 0x167fb69u, 0x1696b54u, 0x16adb3eu, + 0x16c4b26u, 0x16dbb0du, 0x16f2af3u, +}; + +static int32_t +log2_fixed_fast_normalized(const fixed32 *value, int multiplier_bits) +{ + int32_t base_pos = 0; + uint32_t mantissa = 0; + uint32_t mantissa_log2 = 0; + uint8_t mantissa_byte = 0; + int64_t final_log = 0; + + if (value->value == 0) + return 0; + + base_pos = (int)bsr64(value->value) - 32; + if (base_pos > 0) { + mantissa = (value->value >> base_pos) & 0xffffffffu; + } else { + mantissa = (value->value << -base_pos) & 0xffffffffu; + } + + /* Get the first byte */ + mantissa_byte = (mantissa >> 24) & 0xffu; + + if (mantissa_byte != 0) { + const struct log2_fixed_table_pair *pair = log2_fixed_table_0 + (mantissa_byte - 1); + mantissa = (uint32_t)((mantissa * (uint64_t)pair->multiplier) >> 32) + pair->multiplier; + + mantissa_log2 += pair->log_add; + } + + mantissa_byte = (mantissa >> 16) & 0xffu; + + if (mantissa_byte != 0) { + mantissa_log2 += log2_fixed_table_1[mantissa_byte - 1]; + } + + final_log = (int64_t)mantissa_log2 + ((int64_t)base_pos << 32); + final_log /= ((int64_t)1 << (32 - multiplier_bits)); + + return (int32_t)final_log; +} + +/* + * Return the estimated cost of a symbol which has been estimated to have the + * given probability. + */ +static uint32_t +lzx_cost_for_probability(const fixed32* prob) +{ + /* + * The basic formula is: + * + * entropy = -log2(probability) + * + * Use this to get the cost in fractional bits. Then multiply by our + * scaling factor of BIT_COST and convert to an integer. + * + * In addition, the minimum cost is BIT_COST (one bit) because the + * entropy coding method will be Huffman codes. + * + * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may + * be positive due to inaccuracy in our log2 approximation. Therefore, + * we cannot, in general, assume the computed cost is non-negative, and + * we should make sure negative costs get rounded up correctly. 
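+ *
+ * For example, with BIT_COST == 64: a symbol of probability 1/2 costs
+ * -log2(1/2) = 1 bit, i.e. 64; one of probability 1/32 costs 5 bits, i.e.
+ * 320; and a probability so close to 1 that the approximation overshoots
+ * is clamped to the one-bit minimum.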
+ */
+ int32_t cost = -log2_fixed_fast_normalized(prob, BIT_COST_BITS);
+ if (cost < (int32_t)BIT_COST)
+ cost = BIT_COST;
+ return (uint32_t)cost;
+}
+
+/*
+ * Mapping: number of used literals => heuristic probability of a literal times
+ * 6870. Generated by running this R command:
+ *
+ * cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
+ */
+static const uint8_t literal_scaled_probs[257] = {
+ 255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
+ 219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
+ 189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
+ 162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
+ 139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
+ 120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
+ 103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
+ 86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
+ 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
+ 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
+ 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
+ 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
+ 27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
+};
+
+/*
+ * Mapping: length symbol => default cost of that symbol. This is derived from
+ * sample data but has been slightly edited to add more bias towards the
+ * shortest lengths, which are the most common.
+ */
+static const uint16_t lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
+ 300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
+ 503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
+ 659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
+ 666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
+ 706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
+ 728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
+ 738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
+ 747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
+ 753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
+ 754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
+ 758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
+ 750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
+ 757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
+ 762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
+ 763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
+ 762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
+ 764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
+ 762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
+};
+
+static void
+fixed_rcp_approx(fixed32frac *result, uint32_t i)
+{
+ result->value = ((uint64_t)0x100000000ull) / i;
+}
+
+static void
+fixed_set(fixed32 *result, uint32_t i)
+{
+ result->value = ((uint64_t)i << 32);
+}
+
+static void
+fixed_set_fraction(fixed32frac *result, uint32_t num, uint32_t denom)
+{
+ result->value = (((uint64_t)num) << 32) / denom;
+}
+
+static void
+fixed_mul_uint_frac(fixed32 *result, uint32_t a, const fixed32frac *b)
+{
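+ /* A 32-bit integer times a 0.32 fixed-point fraction yields a
+ * 32.32 fixed-point value; the 64-bit product cannot overflow. */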
+ result->value = ((uint64_t)a) * (uint64_t)b->value;
+}
+
+static void
+fixed_mul_uint_frac_to_frac(fixed32frac *result, uint32_t a, const fixed32frac *b)
+{
+ fixed32 fixed;
+ fixed_mul_uint_frac(&fixed, a, b);
+ result->value = (uint32_t)fixed.value;
+}
+
+static void
+fixed_add_frac(fixed32 *result, const fixed32 *a, const fixed32frac *b)
+{
+ result->value = a->value + b->value;
+}
+
+static void
+fixed_sub(fixed32 *result, const fixed32 *a, const fixed32 *b)
+{
+ result->value = a->value - b->value;
+}
+
+static void
+fixed_max_frac(fixed32 *result, const fixed32 *a, const fixed32frac *b)
+{
+ result->value = max_u64(a->value, b->value);
+}
+
+static void
+fixed_div_uint(fixed32 *result, const fixed32 *a, uint32_t b)
+{
+ result->value = a->value / b;
+}
+
+/* Set default costs to bootstrap the iterative optimization algorithm. */
+static void
+lzx_set_default_costs(struct liblzx_compressor *c)
+{
+ unsigned i;
+ uint32_t num_literals = 0;
+ uint32_t num_used_literals = 0;
+ fixed32frac inv_num_matches;
+ fixed32frac half_inv_num_items;
+ fixed32frac half_inv_6870;
+ fixed32 prob_match;
+ fixed32frac frac_15_100;
+ uint32_t match_cost;
+ fixed32frac half_base_literal_prob;
+ fixed32 temp_fixed;
+
+ /* Guard against division by zero when the block contains no matches. */
+ fixed_rcp_approx(&inv_num_matches,
+ max_u32(c->freqs.main[LZX_NUM_CHARS], 1));
+ fixed_rcp_approx(&half_inv_6870, 6870 * 2);
+ fixed_set(&prob_match, 1);
+ fixed_set_fraction(&frac_15_100, 15, 100);
+
+ /* Some numbers here have been hardcoded to assume a bit cost of 64. */
+ STATIC_ASSERT_STMT(BIT_COST == 64);
+
+ /* Estimate the number of literals that will be used. 'num_literals' is
+ * the total number, whereas 'num_used_literals' is the number of
+ * distinct symbols. */
+ for (i = 0; i < LZX_NUM_CHARS; i++) {
+ num_literals += c->freqs.main[i];
+ num_used_literals += (c->freqs.main[i] != 0);
+ }
+
+ /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'. We
+ * don't attempt to estimate which ones will be used. */
+
+ fixed_rcp_approx(&half_inv_num_items,
+ (num_literals + c->freqs.main[LZX_NUM_CHARS]) * 2);
+ fixed_mul_uint_frac_to_frac(&half_base_literal_prob,
+ literal_scaled_probs[num_used_literals],
+ &half_inv_6870);
+
+ /* Literal costs. We use two different methods to compute the
+ * probability of each literal and mix together their results. */
+ for (i = 0; i < LZX_NUM_CHARS; i++) {
+ uint32_t freq = c->freqs.main[i];
+ if (freq != 0) {
+ fixed32 prob;
+ fixed_mul_uint_frac(&prob, freq, &half_inv_num_items);
+ fixed_add_frac(&prob, &prob, &half_base_literal_prob);
+
+ c->costs.main[i] = lzx_cost_for_probability(&prob);
+ fixed_sub(&prob_match, &prob_match, &prob);
+ } else {
+ c->costs.main[i] = 11 * BIT_COST;
+ }
+ }
+
+ /* Match header costs. We just assume that all match headers are
+ * equally probable, but we do take into account the relative cost of a
+ * match header vs. a literal depending on how common matches are
+ * expected to be vs. literals. */
+ fixed_max_frac(&prob_match, &prob_match, &frac_15_100);
+ fixed_div_uint(&temp_fixed, &prob_match,
+ (c->num_main_syms - LZX_NUM_CHARS));
+ match_cost = lzx_cost_for_probability(&temp_fixed);
+ for (; i < c->num_main_syms; i++)
+ c->costs.main[i] = match_cost;
+
+ /* Length symbol costs. These are just set to fixed values which
+ * reflect the fact that the smallest lengths are typically the most
+ * common, and therefore are typically the cheapest. */
+ for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+ c->costs.len[i] = lzx_default_len_costs[i];
+
+#if CONSIDER_ALIGNED_COSTS
+ /* Aligned offset symbol costs.
These are derived from the estimated + * probability of each aligned offset symbol. */ + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + /* We intentionally tallied the frequencies in the wrong slots, + * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the + * fixup here is faster: a constant 8 subtractions here vs. one + * addition for every match. */ + unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK; + if (c->freqs.aligned[j] != 0) { + fixed32 prob; + fixed_mul_uint_frac(&prob, c->freqs.aligned[j], + &inv_num_matches); + c->costs.aligned[i] = lzx_cost_for_probability(&prob); + } else { + c->costs.aligned[i] = + (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST; + } + } +#endif +} + +/* Update the current cost model to reflect the computed Huffman codes. */ +static void +lzx_set_costs_from_codes(struct liblzx_compressor *c) +{ + unsigned i; + const struct lzx_lens *lens = &c->codes[c->codes_index].lens; + + for (i = 0; i < c->num_main_syms; i++) { + c->costs.main[i] = (lens->main[i] ? lens->main[i] : + MAIN_CODEWORD_LIMIT) * BIT_COST; + } + + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) { + c->costs.len[i] = (lens->len[i] ? lens->len[i] : + LENGTH_CODEWORD_LIMIT) * BIT_COST; + } + +#if CONSIDER_ALIGNED_COSTS + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : + ALIGNED_CODEWORD_LIMIT) * BIT_COST; + } +#endif +} + +/* + * Choose a "near-optimal" literal/match sequence to use for the current block, + * then flush the block. Because the cost of each Huffman symbol is unknown + * until the Huffman codes have been built and the Huffman codes themselves + * depend on the symbol frequencies, this uses an iterative optimization + * algorithm to approximate an optimal solution. The first optimization pass + * for the block uses default costs; additional passes use costs derived from + * the Huffman codes computed in the previous pass. + */ +static attrib_forceinline struct lzx_lru_queue +lzx_optimize_and_flush_block(struct liblzx_compressor * const restrict c, + struct lzx_output_bitstream * const restrict os, + const uint8_t * const restrict block_begin, + const uint32_t block_size, + const struct lzx_lru_queue initial_queue, + bool is_16_bit) +{ + unsigned num_passes_remaining = c->num_optim_passes; + struct lzx_lru_queue new_queue; + uint32_t seq_idx; + + lzx_set_default_costs(c); + + for (;;) { + lzx_compute_match_costs(c); + new_queue = lzx_find_min_cost_path(c, block_begin, block_size, + initial_queue, is_16_bit); + + if (--num_passes_remaining == 0) + break; + + /* At least one optimization pass remains. Update the costs. */ + lzx_reset_symbol_frequencies(c); + lzx_tally_item_list(c, block_size, is_16_bit); + lzx_build_huffman_codes(c); + lzx_set_costs_from_codes(c); + } + + /* Done optimizing. Generate the sequence list and flush the block. */ + lzx_reset_symbol_frequencies(c); + seq_idx = lzx_record_item_list(c, block_size, is_16_bit); + lzx_flush_block(c, os, block_begin, block_size, seq_idx); + return new_queue; +} + +/* + * This is the "near-optimal" LZX compressor. + * + * For each block, it performs a relatively thorough graph search to find an + * inexpensive (in terms of compressed size) way to output the block. + * + * Note: there are actually many things this algorithm leaves on the table in + * terms of compression ratio. So although it may be "near-optimal", it is + * certainly not "optimal". 
The goal is not to produce the optimal compression
+ * ratio, which for LZX is probably impossible within any practical amount of
+ * time, but rather to produce a compression ratio significantly better than a
+ * simpler "greedy" or "lazy" parse while still being relatively fast.
+ */
+static attrib_forceinline void
+lzx_reset_near_optimal(struct liblzx_compressor *c, bool is_16_bit)
+{
+ /* Initialize the matchfinder. */
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
+}
+
+static void
+lzx_reset_near_optimal_16(struct liblzx_compressor *c)
+{
+ lzx_reset_near_optimal(c, true);
+}
+
+static void
+lzx_reset_near_optimal_32(struct liblzx_compressor *c)
+{
+ lzx_reset_near_optimal(c, false);
+}
+
+static attrib_forceinline void
+lzx_compress_near_optimal(struct liblzx_compressor * restrict c,
+ const uint8_t *restrict in_begin,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream * restrict os,
+ bool is_16_bit)
+{
+ uint32_t max_offset = c->window_size;
+ const uint8_t * in_next = in_begin;
+ const uint8_t * const in_chunk_end = in_begin + in_nchunk;
+ const uint8_t * const in_data_end = in_begin + in_ndata;
+ uint32_t max_find_len = LZX_MAX_MATCH_LEN;
+ uint32_t max_produce_len = LZX_MAX_MATCH_LEN;
+ uint32_t nice_len = min_u32(c->nice_match_length, max_find_len);
+ uint32_t next_hashes[2] = {0, 0};
+ struct lzx_lru_queue queue;
+
+ if (max_offset >= LZX_MAX_WINDOW_SIZE) {
+ /* Slightly shrink window to avoid offset values that are
+ * greater than 21 bits. */
+ max_offset = LZX_MAX_WINDOW_SIZE - 1 - LZX_OFFSET_ADJUSTMENT;
+ }
+
+ in_begin -= c->in_prefix_size;
+
+ if (c->variant == LIBLZX_VARIANT_WIM) {
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
+ }
+
+ /* Load the LRU queue. This must happen for the WIM variant too, so
+ * that 'queue' is always initialized before its first use below. */
+ lzx_lru_queue_load(&queue, c->lru_queue);
+
+ do {
+ /* Starting a new block */
+
+ const uint8_t * const in_block_begin = in_next;
+ const uint8_t * const in_max_block_end =
+ in_next + min_size(SOFT_MAX_BLOCK_SIZE, in_chunk_end - in_next);
+ struct lz_match *cache_ptr = c->match_cache;
+ const uint8_t *next_search_pos = in_next;
+ const uint8_t *next_observation = in_next;
+ const uint8_t *next_pause_point =
+ min_constptr(in_next + min_size(MIN_BLOCK_SIZE,
+ in_max_block_end - in_next),
+ in_max_block_end - min_size(LZX_MAX_MATCH_LEN - 1,
+ in_max_block_end - in_next));
+
+ lzx_init_block_split_stats(&c->split_stats);
+ lzx_reset_symbol_frequencies(c);
+
+ if (in_next >= next_pause_point)
+ goto pause;
+
+ /*
+ * Run the input buffer through the matchfinder, caching the
+ * matches, until we decide to end the block.
+ *
+ * For a tighter matchfinding loop, we compute a "pause point",
+ * which is the next position at which we may need to check
+ * whether to end the block or to decrease max_len. We then
+ * only do these extra checks upon reaching the pause point.
+ */
+ resume_matchfinding:
+ do {
+ size_t min_match_pos = in_next - in_begin;
+ min_match_pos -= min_size(min_match_pos, max_offset);
+
+ if (in_next >= next_search_pos &&
+ likely(nice_len >= LZX_MIN_MATCH_LEN)) {
+ /* Search for matches at this position.
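+ * Any matches found are written into the match cache
+ * starting at cache_ptr + 1; the number of matches is
+ * then stored at cache_ptr itself, so that
+ * lzx_find_min_cost_path() can replay the cached
+ * matches later without re-running the matchfinder.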
*/ + struct lz_match *lz_matchptr; + uint32_t best_len; + + lz_matchptr = CALL_BT_MF(is_16_bit, c, + bt_matchfinder_get_matches, + in_begin, + min_match_pos, + in_next - in_begin, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth, + next_hashes, + &best_len, + cache_ptr + 1); + cache_ptr->length = lz_matchptr - (cache_ptr + 1); + cache_ptr = lz_matchptr; + + /* Accumulate literal/match statistics for block + * splitting and for generating the initial cost + * model. */ + if (in_next >= next_observation) { + best_len = cache_ptr[-1].length; + if (best_len >= 3) { + /* Match (len >= 3) */ + + /* + * Note: for performance reasons this has + * been simplified significantly: + * + * - We wait until later to account for + * LZX_OFFSET_ADJUSTMENT. + * - We don't account for repeat offsets. + * - We don't account for different match headers. + */ + c->freqs.aligned[cache_ptr[-1].offset & + LZX_ALIGNED_OFFSET_BITMASK]++; + c->freqs.main[LZX_NUM_CHARS]++; + + lzx_observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + /* Literal */ + c->freqs.main[*in_next]++; + lzx_observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; + } + } + + /* + * If there was a very long match found, then + * don't cache any matches for the bytes covered + * by that match. This avoids degenerate + * behavior when compressing highly redundant + * data, where the number of matches can be very + * large. + * + * This heuristic doesn't actually hurt the + * compression ratio *too* much. If there's a + * long match, then the data must be highly + * compressible, so it doesn't matter as much + * what we do. + */ + if (best_len >= nice_len) + next_search_pos = in_next + best_len; + } else { + /* Don't search for matches at this position. */ + CALL_BT_MF(is_16_bit, c, + bt_matchfinder_skip_byte, + in_begin, + min_match_pos, + in_next - in_begin, + nice_len, + c->max_search_depth, + next_hashes); + cache_ptr->length = 0; + cache_ptr++; + } + } while (++in_next < next_pause_point && + likely(cache_ptr < &c->match_cache[CACHE_LENGTH])); + + pause: + + /* Adjust max_len and nice_len if we're nearing the end of the + * input buffer. In addition, if we are so close to the end of + * the input buffer that there cannot be any more matches, then + * just advance through the last few positions and record no + * matches. */ + if (unlikely(max_produce_len > in_data_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = min_u32(max_produce_len, nice_len); + if (max_find_len < BT_MATCHFINDER_REQUIRED_NBYTES) { + while (in_next != in_chunk_end) { + cache_ptr->length = 0; + cache_ptr++; + in_next++; + } + } + } + + /* End the block if the match cache may overflow. */ + if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH])) + goto end_block; + + /* End the block if the soft maximum size has been reached. */ + if (in_next >= in_max_block_end) + goto end_block; + + /* End the block if the block splitting algorithm thinks this is + * a good place to do so. */ + if (c->split_stats.num_new_observations >= + NUM_OBSERVATIONS_PER_BLOCK_CHECK && + in_max_block_end - in_next >= MIN_BLOCK_SIZE && + lzx_should_end_block(&c->split_stats)) + goto end_block; + + /* It's not time to end the block yet. Compute the next pause + * point and resume matchfinding. 
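+ * The pause point is either the position where enough new
+ * observations may have accumulated to justify another
+ * block-split check, or the start of the last
+ * LZX_MAX_MATCH_LEN - 1 bytes of the block, whichever comes
+ * first.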
*/
+ next_pause_point =
+ min_constptr(in_next + min_size(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
+ c->split_stats.num_new_observations,
+ in_max_block_end - in_next),
+ in_max_block_end - min_size(LZX_MAX_MATCH_LEN - 1,
+ in_max_block_end - in_next));
+ goto resume_matchfinding;
+
+ end_block:
+ /* We've decided on a block boundary and cached matches. Now
+ * choose a match/literal sequence and flush the block. */
+ queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
+ in_next - in_block_begin,
+ queue, is_16_bit);
+ } while (in_next != in_chunk_end);
+
+ /* Save the LRU queue. (The next hashes are recomputed from scratch on
+ * the next call.) */
+ lzx_lru_queue_save(c->lru_queue, &queue);
+}
+
+static void
+lzx_compress_near_optimal_16(struct liblzx_compressor *c, const uint8_t *in,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream *os)
+{
+ lzx_compress_near_optimal(c, in, in_nchunk, in_ndata, os, true);
+}
+
+static void
+lzx_compress_near_optimal_32(struct liblzx_compressor *c, const uint8_t *in,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream *os)
+{
+ lzx_compress_near_optimal(c, in, in_nchunk, in_ndata, os, false);
+}
+
+static attrib_forceinline void
+lzx_cull_near_optimal(struct liblzx_compressor *c, size_t nbytes, const bool is_16_bit)
+{
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_cull, nbytes, c->window_size);
+}
+
+static void
+lzx_cull_near_optimal_16(struct liblzx_compressor *c, size_t nbytes)
+{
+ lzx_cull_near_optimal(c, nbytes, true);
+}
+
+static void
+lzx_cull_near_optimal_32(struct liblzx_compressor *c, size_t nbytes)
+{
+ lzx_cull_near_optimal(c, nbytes, false);
+}
+
+/******************************************************************************/
+/* Faster ("lazy") compression algorithm */
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Called when the compressor chooses to use a literal. This tallies the
+ * Huffman symbol for the literal, increments the current literal run length,
+ * and "observes" the literal for the block split statistics.
+ */
+static attrib_forceinline void
+lzx_choose_literal(struct liblzx_compressor *c, unsigned literal, uint32_t *litrunlen_p)
+{
+ lzx_observe_literal(&c->split_stats, literal);
+ c->freqs.main[literal]++;
+ ++*litrunlen_p;
+}
+
+/*
+ * Called when the compressor chooses to use a match. This tallies the Huffman
+ * symbol(s) for a match, saves the match data and the length of the preceding
+ * literal run, updates the recent offsets queue, and "observes" the match for
+ * the block split statistics.
+ */
+static attrib_forceinline void
+lzx_choose_match(struct liblzx_compressor *c, unsigned length, uint32_t adjusted_offset,
+ uint32_t recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
+ uint32_t *litrunlen_p, struct lzx_sequence **next_seq_p)
+{
+ struct lzx_sequence *next_seq = *next_seq_p;
+ unsigned mainsym;
+
+ lzx_observe_match(&c->split_stats, length);
+
+ mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
+ is_16_bit);
+ next_seq->litrunlen_and_matchlen =
+ (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
+ next_seq->adjusted_offset_and_mainsym =
+ (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
+
+ /* Update the recent offsets queue. */
+ if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
+ /* Repeat offset match. */
+ uint32_t temp = recent_offsets[adjusted_offset];
+ recent_offsets[adjusted_offset] = recent_offsets[0];
+ recent_offsets[0] = temp;
+ } else {
+ /* Explicit offset match. */
+
+ /* Tally the aligned offset symbol if needed.
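+ * Only adjusted offsets of at least LZX_MIN_ALIGNED_OFFSET +
+ * LZX_OFFSET_ADJUSTMENT (i.e. 16) carry one; the symbol is the low
+ * LZX_NUM_ALIGNED_OFFSET_BITS bits of the adjusted offset, so e.g.
+ * an adjusted offset of 100 tallies freqs.aligned[100 & 7], i.e.
+ * freqs.aligned[4].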
*/
+ if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
+ c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
+
+ recent_offsets[2] = recent_offsets[1];
+ recent_offsets[1] = recent_offsets[0];
+ recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
+ }
+
+ /* Reset the literal run length and advance to the next sequence. */
+ *next_seq_p = next_seq + 1;
+ *litrunlen_p = 0;
+}
+
+/*
+ * Called when the compressor ends a block. This finishes the last
+ * lzx_sequence, which is just a literal run with no following match. This
+ * literal run might be empty.
+ */
+static attrib_forceinline void
+lzx_finish_sequence(struct lzx_sequence *last_seq, uint32_t litrunlen)
+{
+ last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
+}
+
+/*
+ * Find the longest repeat offset match at the current position. If a match
+ * is found, return its length and set *best_rep_idx_ret to the index of its
+ * offset in @recent_offsets. Otherwise, return 0.
+ *
+ * Don't bother with length 2 matches; consider matches of length >= 3 only.
+ * Also assume that max_len >= 3.
+ */
+static unsigned
+lzx_find_longest_repeat_offset_match(const uint8_t * const in_next,
+ const uint32_t recent_offsets[],
+ const unsigned max_len,
+ unsigned *best_rep_idx_ret)
+{
+ STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
+
+ const uint32_t seq3 = load_u24_unaligned(in_next);
+ const uint8_t *matchptr;
+ unsigned best_rep_len = 0;
+ unsigned best_rep_idx = 0;
+ unsigned rep_len;
+
+ /* Check for rep0 match (most recent offset) */
+ matchptr = in_next - recent_offsets[0];
+ if (load_u24_unaligned(matchptr) == seq3)
+ best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
+
+ /* Check for rep1 match (second most recent offset) */
+ matchptr = in_next - recent_offsets[1];
+ if (load_u24_unaligned(matchptr) == seq3) {
+ rep_len = lz_extend(in_next, matchptr, 3, max_len);
+ if (rep_len > best_rep_len) {
+ best_rep_len = rep_len;
+ best_rep_idx = 1;
+ }
+ }
+
+ /* Check for rep2 match (third most recent offset) */
+ matchptr = in_next - recent_offsets[2];
+ if (load_u24_unaligned(matchptr) == seq3) {
+ rep_len = lz_extend(in_next, matchptr, 3, max_len);
+ if (rep_len > best_rep_len) {
+ best_rep_len = rep_len;
+ best_rep_idx = 2;
+ }
+ }
+
+ *best_rep_idx_ret = best_rep_idx;
+ return best_rep_len;
+}
+
+/*
+ * Fast heuristic scoring for lazy parsing: how "good" is this match?
+ * This is mainly determined by the length: longer matches are better.
+ * However, we also give a bonus to close (small offset) matches and to repeat
+ * offset matches, since those require fewer bits to encode.
+ */
+
+static attrib_forceinline unsigned
+lzx_explicit_offset_match_score(unsigned len, uint32_t adjusted_offset)
+{
+ unsigned score = len;
+
+ if (adjusted_offset < 4096)
+ score++;
+ if (adjusted_offset < 256)
+ score++;
+
+ return score;
+}
+
+static attrib_forceinline unsigned
+lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
+{
+ return rep_len + 3;
+}
+
+/*
+ * This is the "lazy" LZX compressor. The basic idea is that before it chooses
+ * a match, it checks to see if there's a longer match at the next position. If
+ * yes, it chooses a literal and continues to the next position. If no, it
+ * chooses the match.
+ *
+ * Some additional heuristics are used as well. Repeat offset matches are
+ * considered favorably and sometimes are chosen immediately. In addition, long
+ * matches (at least "nice_len" bytes) are chosen immediately as well.
Finally, + * when we decide whether a match is "better" than another, we take the offset + * into consideration as well as the length. + */ +static attrib_forceinline void +lzx_reset_lazy(struct liblzx_compressor *c, bool is_16_bit) +{ + bool streaming = (c->variant != LIBLZX_VARIANT_WIM); + + /* Initialize the matchfinder. */ + CALL_HC_MF(is_16_bit, c, hc_matchfinder_init, c->window_size, + streaming); +} + +static void +lzx_reset_lazy_16(struct liblzx_compressor *c) +{ + lzx_reset_lazy(c, true); +} + +static void +lzx_reset_lazy_32(struct liblzx_compressor *c) +{ + lzx_reset_lazy(c, false); +} + +static attrib_forceinline void +lzx_compress_lazy(struct liblzx_compressor * restrict c, + const uint8_t * restrict in_begin, size_t in_nchunk, + size_t in_ndata, struct lzx_output_bitstream * restrict os, + bool is_16_bit) +{ + uint32_t max_offset = c->window_size; + const uint8_t * in_next = in_begin; + const uint8_t * const in_chunk_end = in_begin + in_nchunk; + const uint8_t *const in_data_end = in_begin + in_ndata; + unsigned max_find_len = LZX_MAX_MATCH_LEN; + unsigned max_produce_len = LZX_MAX_MATCH_LEN; + unsigned nice_len = min_uint(c->nice_match_length, max_find_len); + STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); + uint32_t recent_offsets[LZX_NUM_RECENT_OFFSETS]; + uint32_t next_hashes[2]; + + if (max_offset >= LZX_MAX_WINDOW_SIZE) { + /* Slightly shrink window to avoid offset values that are + * greater than 21 bits. */ + max_offset = LZX_MAX_WINDOW_SIZE - 1 - LZX_OFFSET_ADJUSTMENT; + } + + in_begin -= c->in_prefix_size; + + /* Load the LRU queue and next hashes. */ + { + int i; + for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + recent_offsets[i] = c->lru_queue[i]; + } + + next_hashes[0] = c->next_hashes[0]; + next_hashes[1] = c->next_hashes[1]; + } + + do { + /* Starting a new block */ + + const uint8_t * const in_block_begin = in_next; + const uint8_t * const in_max_block_end = + in_next + min_size(SOFT_MAX_BLOCK_SIZE, in_chunk_end - in_next); + struct lzx_sequence *next_seq = c->chosen_sequences; + uint32_t litrunlen = 0; + unsigned cur_len; + uint32_t cur_offset; + uint32_t cur_adjusted_offset; + unsigned cur_score; + unsigned next_len; + uint32_t next_offset; + uint32_t next_adjusted_offset; + unsigned next_score; + unsigned best_rep_len; + unsigned best_rep_idx; + unsigned rep_score; + unsigned skip_len; + + lzx_reset_symbol_frequencies(c); + lzx_init_block_split_stats(&c->split_stats); + + do { + /* Adjust max_len and nice_len if we're nearing the end + * of the input buffer. */ + if (unlikely(max_produce_len > + in_chunk_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = + min_uint(max_produce_len, nice_len); + } + + /* Find the longest match (subject to the + * max_search_depth cutoff parameter) with the current + * position. Don't bother with length 2 matches; only + * look for matches of length >= 3. */ + { + size_t min_match_pos = in_next - in_begin; + min_match_pos -= + min_size(min_match_pos, max_offset); + + cur_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, + in_begin, + min_match_pos, + in_next, + 2, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + } + + /* If there was no match found, or the only match found + * was a distant short match, then choose a literal. 
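+ * Here a "distant short" match means a length-3 match at a large
+ * offset (roughly 8192 or more) that is not one of the three recent
+ * offsets; such a match would likely cost more to encode than the
+ * three literals it replaces.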
*/ + if (cur_len < 3 || + (cur_len == 3 && + cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT && + cur_offset != recent_offsets[0] && + cur_offset != recent_offsets[1] && + cur_offset != recent_offsets[2])) + { + lzx_choose_literal(c, *in_next, &litrunlen); + in_next++; + continue; + } + + /* Heuristic: if this match has the most recent offset, + * then go ahead and choose it as a rep0 match. */ + if (cur_offset == recent_offsets[0]) { + in_next++; + skip_len = cur_len - 1; + cur_adjusted_offset = 0; + goto choose_cur_match; + } + + /* Compute the longest match's score as an explicit + * offset match. */ + cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT; + cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset); + + /* Find the longest repeat offset match at this + * position. If we find one and it's "better" than the + * explicit offset match we found, then go ahead and + * choose the repeat offset match immediately. */ + best_rep_len = lzx_find_longest_repeat_offset_match(in_next, + recent_offsets, + max_produce_len, + &best_rep_idx); + in_next++; + + if (best_rep_len != 0 && + (rep_score = lzx_repeat_offset_match_score(best_rep_len, + best_rep_idx)) >= cur_score) + { + cur_len = best_rep_len; + cur_adjusted_offset = best_rep_idx; + skip_len = best_rep_len - 1; + goto choose_cur_match; + } + + have_cur_match: + /* + * We have a match at the current position. If the + * match is very long, then choose it immediately. + * Otherwise, see if there's a better match at the next + * position. + */ + + if (cur_len >= nice_len) { + skip_len = cur_len - 1; + goto choose_cur_match; + } + + if (unlikely(max_produce_len > + in_chunk_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = + min_uint(max_produce_len, nice_len); + } + + { + size_t min_match_pos = in_next - in_begin; + min_match_pos -= + min_uint(min_match_pos, max_offset); + + next_len = CALL_HC_MF( + is_16_bit, c, + hc_matchfinder_longest_match, + in_begin, + min_match_pos, + in_next, + cur_len - 2, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth / 2, + next_hashes, + &next_offset); + } + + if (next_len <= cur_len - 2) { + /* No potentially better match was found. */ + in_next++; + skip_len = cur_len - 2; + goto choose_cur_match; + } + + next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT; + next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset); + + best_rep_len = lzx_find_longest_repeat_offset_match(in_next, + recent_offsets, + max_produce_len, + &best_rep_idx); + in_next++; + + if (best_rep_len != 0 && + (rep_score = lzx_repeat_offset_match_score(best_rep_len, + best_rep_idx)) >= next_score) + { + + if (rep_score > cur_score) { + /* The next match is better, and it's a + * repeat offset match. */ + lzx_choose_literal(c, *(in_next - 2), + &litrunlen); + cur_len = best_rep_len; + cur_adjusted_offset = best_rep_idx; + skip_len = cur_len - 1; + goto choose_cur_match; + } + } else { + if (next_score > cur_score) { + /* The next match is better, and it's an + * explicit offset match. */ + lzx_choose_literal(c, *(in_next - 2), + &litrunlen); + cur_len = next_len; + cur_adjusted_offset = next_adjusted_offset; + cur_score = next_score; + goto have_cur_match; + } + } + + /* The original match was better; choose it. */ + skip_len = cur_len - 2; + + choose_cur_match: + /* Choose a match and have the matchfinder skip over its + * remaining bytes. 
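+ * At this point in_next has already been advanced past the
+ * first one or two bytes of the match, so skip_len is
+ * cur_len - 1 or cur_len - 2 respectively.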
*/ + lzx_choose_match(c, cur_len, cur_adjusted_offset, + recent_offsets, is_16_bit, + &litrunlen, &next_seq); + + CALL_HC_MF(is_16_bit, c, + hc_matchfinder_skip_bytes, + in_begin, + in_next, + in_chunk_end, + skip_len, + next_hashes); + in_next += skip_len; + + /* Keep going until it's time to end the block. */ + } while (in_next < in_max_block_end && + !(c->split_stats.num_new_observations >= + NUM_OBSERVATIONS_PER_BLOCK_CHECK && + in_next - in_block_begin >= MIN_BLOCK_SIZE && + in_chunk_end - in_next >= MIN_BLOCK_SIZE && + lzx_should_end_block(&c->split_stats))); + + /* Flush the block. */ + lzx_finish_sequence(next_seq, litrunlen); + lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0); + + /* Keep going until we've reached the end of the input buffer. */ + } while (in_next != in_chunk_end); + + /* Save the LRU queue and next hashes */ + { + int i; + for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + c->lru_queue[i] = recent_offsets[i]; + } + c->next_hashes[0] = next_hashes[0]; + c->next_hashes[1] = next_hashes[1]; + } +} + +static void +lzx_compress_lazy_16(struct liblzx_compressor *c, const uint8_t *in, + size_t in_nchunk, size_t in_navail, + struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, in, in_nchunk, in_navail, os, true); +} + +static void +lzx_compress_lazy_32(struct liblzx_compressor *c, const uint8_t *in, + size_t in_nchunk, size_t in_navail, + struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, in, in_nchunk, in_navail, os, false); +} + +static void +lzx_cull_lazy_16(struct liblzx_compressor *c, size_t nbytes) +{ + CALL_HC_MF(true, c, hc_matchfinder_cull, nbytes, c->window_size); +} + +static void +lzx_cull_lazy_32(struct liblzx_compressor *c, size_t nbytes) +{ + CALL_HC_MF(false, c, hc_matchfinder_cull, nbytes, c->window_size); +} + +/******************************************************************************/ +/* Compressor operations */ +/*----------------------------------------------------------------------------*/ + +/* + * Generate tables for mapping match offsets (actually, "adjusted" match + * offsets) to offset slots. 
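+ *
+ * A lookup can then be sketched as follows (the exact helper used by the
+ * callers may differ, but the two tables are consulted like this):
+ *
+ *   if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
+ *           slot = c->offset_slot_tab_1[adjusted_offset];
+ *   else
+ *           slot = c->offset_slot_tab_2[adjusted_offset >> 14];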
+ */
+static void
+lzx_init_offset_slot_tabs(struct liblzx_compressor *c)
+{
+ uint32_t adjusted_offset = 0;
+ unsigned slot = 0;
+
+ /* slots [0, 29] */
+ for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
+ adjusted_offset++)
+ {
+ if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+ LZX_OFFSET_ADJUSTMENT)
+ slot++;
+ c->offset_slot_tab_1[adjusted_offset] = slot;
+ }
+
+ /* slots [30, 49] */
+ for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
+ adjusted_offset += (uint32_t)1 << 14)
+ {
+ if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+ LZX_OFFSET_ADJUSTMENT)
+ slot++;
+ c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
+ }
+}
+
+static size_t
+lzx_bt_max_search_depth(unsigned compression_level)
+{
+ return (24 * compression_level) / 50;
+}
+
+static size_t
+lzx_get_compressor_size(size_t window_size, unsigned compression_level,
+ bool streaming)
+{
+ if (compression_level <= MAX_FAST_LEVEL) {
+ if (lzx_is_16_bit(window_size))
+ return offsetof(struct liblzx_compressor, hc_mf_16) +
+ hc_matchfinder_size_16(window_size, streaming);
+ else
+ return offsetof(struct liblzx_compressor, hc_mf_32) +
+ hc_matchfinder_size_32(window_size, streaming);
+ } else {
+ if (lzx_is_16_bit(window_size))
+ return offsetof(struct liblzx_compressor, bt_mf_16) +
+ bt_matchfinder_size_16(window_size, streaming);
+ else
+ return offsetof(struct liblzx_compressor, bt_mf_32) +
+ bt_matchfinder_size_32(window_size, streaming);
+ }
+}
+
+/* Reset the compressor to its initial state. */
+static void
+lzx_reset(struct liblzx_compressor *c)
+{
+ /* Initially, the previous Huffman codeword lengths are all zeroes. */
+ c->codes_index = 0;
+ memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
+
+ /* Reset the E8 preprocessor offset */
+ c->e8_chunk_offset = 0;
+
+ /* Reset the streaming prefix */
+ c->in_prefix_size = 0;
+
+ /* Reset the LRU queue */
+ {
+ int i;
+ for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
+ c->lru_queue[i] = 1;
+ }
+ }
+
+ /* Reset next hashes */
+ c->next_hashes[0] = 0;
+ c->next_hashes[1] = 0;
+
+ c->reset(c);
+}
+
+/* Allocate an LZX compressor. */
+liblzx_compressor_t *
+liblzx_compress_create(const struct liblzx_compress_properties *props)
+{
+ unsigned window_order;
+ struct liblzx_compressor *c;
+ bool streaming = (props->lzx_variant != LIBLZX_VARIANT_WIM);
+
+ /* Validate the maximum buffer size and get the window order from it. */
+ window_order = lzx_get_window_order(props->window_size);
+ if (window_order == 0)
+ return NULL;
+
+ /* Allocate the compressor. */
+ c = props->alloc_func(props->userdata,
+ lzx_get_compressor_size(props->window_size, props->compression_level, streaming));
+ if (!c)
+ goto oom0;
+
+ c->alloc_func = props->alloc_func;
+ c->free_func = props->free_func;
+ c->alloc_userdata = props->userdata;
+ c->window_size = props->window_size;
+ c->window_order = window_order;
+ c->num_main_syms = lzx_get_num_main_syms(window_order);
+ c->variant = props->lzx_variant;
+ c->first_block = true;
+ c->out_chunk.data = NULL;
+ c->out_chunk.size = 0;
+ c->flushing = false;
+ c->e8_chunk_offset = 0;
+ c->e8_file_size = props->e8_file_size;
+ c->in_buffer_capacity = c->window_size;
+ c->in_prefix_size = 0;
+ c->in_used = 0;
+ c->chunk_size = props->chunk_granularity;
+
+ /* Allocate the buffer for preprocessed data if needed.
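+ * In streaming mode the capacity is doubled so that a full window of
+ * history can be kept in front of the incoming data, plus
+ * LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE bytes of
+ * matchfinding/filter lookahead, as allocated below.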
*/ + if (streaming) { + /* Pad out to include past blocks and extra + * matchfinding space */ + c->in_buffer_capacity *= 2; + c->in_buffer_capacity += + LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE; + } + + if (c->variant == LIBLZX_VARIANT_WIM) + c->e8_file_size = LZX_WIM_MAGIC_FILESIZE; + + c->in_buffer = + props->alloc_func(props->userdata, c->in_buffer_capacity); + + if (!c->in_buffer) + goto oom1; + + c->out_buffer_capacity = c->chunk_size; + if (c->variant != LIBLZX_VARIANT_WIM) + c->out_buffer_capacity += 6144; + + c->out_chunk.data = c->out_buffer = + props->alloc_func(props->userdata, c->out_buffer_capacity); + + if (!c->out_buffer) + goto oom2; + + if (props->compression_level <= MAX_FAST_LEVEL) { + + /* Fast compression: Use lazy parsing. */ + if (lzx_is_16_bit(props->window_size)) { + c->reset = lzx_reset_lazy_16; + c->impl = lzx_compress_lazy_16; + c->cull = lzx_cull_lazy_16; + } else { + c->reset = lzx_reset_lazy_32; + c->impl = lzx_compress_lazy_32; + c->cull = lzx_cull_lazy_32; + } + + /* Scale max_search_depth and nice_match_length with the + * compression level. */ + c->max_search_depth = (60 * props->compression_level) / 20; + c->nice_match_length = (80 * props->compression_level) / 20; + + /* lzx_compress_lazy() needs max_search_depth >= 2 because it + * halves the max_search_depth when attempting a lazy match, and + * max_search_depth must be at least 1. */ + c->max_search_depth = max_uint(c->max_search_depth, 2); + } else { + + /* Normal / high compression: Use near-optimal parsing. */ + if (lzx_is_16_bit(c->window_size)) { + c->reset = lzx_reset_near_optimal_16; + c->impl = lzx_compress_near_optimal_16; + c->cull = lzx_cull_near_optimal_16; + } else { + c->reset = lzx_reset_near_optimal_32; + c->impl = lzx_compress_near_optimal_32; + c->cull = lzx_cull_near_optimal_32; + } + + /* Scale max_search_depth and nice_match_length with the + * compression level. */ + c->max_search_depth = lzx_bt_max_search_depth(props->compression_level); + c->nice_match_length = (48 * props->compression_level) / 50; + + /* Also scale num_optim_passes with the compression level. But + * the more passes there are, the less they help --- so don't + * add them linearly. */ + c->num_optim_passes = 1; + c->num_optim_passes += (props->compression_level >= 45); + c->num_optim_passes += (props->compression_level >= 70); + c->num_optim_passes += (props->compression_level >= 100); + c->num_optim_passes += (props->compression_level >= 150); + c->num_optim_passes += (props->compression_level >= 200); + c->num_optim_passes += (props->compression_level >= 300); + + /* max_search_depth must be at least 1. */ + c->max_search_depth = max_uint(c->max_search_depth, 1); + } + + /* Prepare the offset => offset slot mapping. */ + lzx_init_offset_slot_tabs(c); + + lzx_reset(c); + + return c; + +oom2: + props->free_func(props->userdata, c->in_buffer); +oom1: + props->free_func(props->userdata, c); +oom0: + return NULL; +} + +/* Compress a buffer of data. */ +static size_t +lzx_compress_chunk(struct liblzx_compressor *c) +{ + struct lzx_output_bitstream os; + size_t result; + bool e8_preprocess_enabled = (c->e8_chunk_offset < 0x40000000); + bool next_e8_preprocess_enabled = + (c->e8_chunk_offset + c->chunk_size < 0x40000000); + uint32_t chunk_size = min_u32(c->chunk_size, c->in_used); + uint32_t next_chunk_preprocess_size = 0; + + uint8_t *in = (uint8_t *)c->in_buffer + c->in_prefix_size; + + /* Preprocess the input data. 
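+ * E8 preprocessing treats each 0xE8 byte as an x86 CALL opcode and
+ * rewrites its 32-bit little-endian operand from a relative
+ * displacement to an absolute target, roughly: if rel >= -pos and
+ * rel < e8_file_size, store rel + pos. Repeated calls to the same
+ * target then produce identical byte patterns that LZ matching can
+ * exploit; lzx_postprocess() applies the inverse transform.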
*/ + if (e8_preprocess_enabled) { + lzx_preprocess(in, chunk_size, c->e8_chunk_offset, + c->e8_file_size); + } + + if (c->in_used > c->chunk_size && next_e8_preprocess_enabled) { + next_chunk_preprocess_size = + min_u32(LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE, + c->in_used - c->chunk_size); + } + + /* Preprocess enough of the next block input data for the + matchfinder */ + if (next_chunk_preprocess_size > 0) { + lzx_preprocess(in + c->chunk_size, next_chunk_preprocess_size, + c->e8_chunk_offset + c->chunk_size, + c->e8_file_size); + } + + /* Initialize the output bitstream. */ + lzx_init_output(&os, c->out_buffer, c->out_buffer_capacity); + + /* Call the compression level-specific compress() function. */ + (*c->impl)(c, in, chunk_size, c->in_used, &os); + + /* Undo next block preprocessing */ + if (next_chunk_preprocess_size > 0) { + lzx_postprocess(in + c->chunk_size, next_chunk_preprocess_size, + c->e8_chunk_offset + c->chunk_size, + c->e8_file_size); + } + + /* Flush the output bitstream. */ + result = lzx_flush_output(&os); + + /* Update the E8 chunk offset. */ + c->e8_chunk_offset += (uint32_t)chunk_size; + + /* Update the prefix and used amounts. */ + c->in_prefix_size += (uint32_t)chunk_size; + c->in_used -= chunk_size; + + if (c->in_prefix_size >= c->window_size * 2) { + uint32_t cull_amount = (c->in_prefix_size - c->window_size); + + in = (uint8_t *)c->in_buffer + c->in_prefix_size; + + memmove(c->in_buffer, in - c->window_size, + c->in_used + c->window_size); + c->in_prefix_size = c->window_size; + + (*c->cull)(c, cull_amount); + } + + /* Return the number of compressed bytes, or 0 if the input did not + * compress to less than its original size. */ + return result; +} + +void +liblzx_compress_destroy(liblzx_compressor_t *c) +{ + c->free_func(c->alloc_userdata, c->out_buffer); + c->free_func(c->alloc_userdata, c->in_buffer); + c->free_func(c->alloc_userdata, c); +} + +size_t +liblzx_compress_add_input(liblzx_compressor_t *c, const void *in_data, + size_t in_data_size) +{ + uint32_t max_used = 0; + size_t fill_amount = 0; + + if (c->out_chunk.size > 0 || c->flushing) + return 0; + + max_used = min_uint(c->in_buffer_capacity - c->in_prefix_size, + c->chunk_size + LZX_MAX_MATCH_LEN + + LZX_E8_FILTER_TAIL_SIZE); + fill_amount = min_size(in_data_size, max_used - c->in_used); + + memcpy(((uint8_t *)c->in_buffer) + c->in_prefix_size + c->in_used, in_data, + fill_amount); + + c->in_used += fill_amount; + + if (c->in_used == max_used) { + c->out_chunk.size = lzx_compress_chunk(c); + } + + return fill_amount; +} + +const liblzx_output_chunk_t * +liblzx_compress_get_next_chunk(const liblzx_compressor_t *c) +{ + if (c->out_chunk.size > 0) + return &c->out_chunk; + else + return NULL; +} + +void +liblzx_compress_release_next_chunk(liblzx_compressor_t *c) +{ + c->out_chunk.size = 0; + if (c->flushing && c->in_used > 0) { + c->out_chunk.size = lzx_compress_chunk(c); + } +} + +void +liblzx_compress_end_input(liblzx_compressor_t *c) +{ + if (!c->flushing) { + c->flushing = true; + if (c->in_used > 0 && c->out_chunk.size == 0) { + c->out_chunk.size = lzx_compress_chunk(c); + } + } +} diff --git a/dlls/cabinet/liblzx_lzx_constants.h b/dlls/cabinet/liblzx_lzx_constants.h new file mode 100644 index 00000000000..f11ce407873 --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_constants.h @@ -0,0 +1,108 @@ +/* + * lzx_constants.h + * + * Constants for the LZX compression format. + */ + +#ifndef _LZX_CONSTANTS_H +#define _LZX_CONSTANTS_H + +/* Number of literal byte values. 
*/ +#define LZX_NUM_CHARS 256 + +/* The smallest and largest allowed match lengths. */ +#define LZX_MIN_MATCH_LEN 2 +#define LZX_MAX_MATCH_LEN 257 + +/* Number of distinct match lengths that can be represented. */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required. */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) + +/* The first length which requires a length symbol. */ +#define LZX_MIN_SECONDARY_LEN (LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) + +/* Valid values of the 3-bit block type field. */ +#define LZX_BLOCKTYPE_VERBATIM 1 +#define LZX_BLOCKTYPE_ALIGNED 2 +#define LZX_BLOCKTYPE_UNCOMPRESSED 3 + +/* 'LZX_MIN_WINDOW_SIZE' and 'LZX_MAX_WINDOW_SIZE' are the minimum and maximum + * sizes of the sliding window. */ +#define LZX_MIN_WINDOW_ORDER 15 +#define LZX_MAX_WINDOW_ORDER 21 +#define LZX_MIN_WINDOW_SIZE (1UL << LZX_MIN_WINDOW_ORDER) /* 32768 */ +#define LZX_MAX_WINDOW_SIZE (1UL << LZX_MAX_WINDOW_ORDER) /* 2097152 */ + +/* Maximum number of offset slots. (The actual number of offset slots depends + * on the window size.) */ +#define LZX_MAX_OFFSET_SLOTS 50 + +/* Maximum number of symbols in the main code. (The actual number of symbols in + * the main code depends on the window size.) */ +#define LZX_MAINCODE_MAX_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) + +/* Number of symbols in the length code. */ +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) + +/* Number of symbols in the pre-code. */ +#define LZX_PRECODE_NUM_SYMBOLS 20 + +/* Number of bits in which each pre-code codeword length is represented. */ +#define LZX_PRECODE_ELEMENT_SIZE 4 + +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks. */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + +/* Number of symbols in the aligned offset code. */ +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks. */ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) + +/* Number of bits in which each aligned offset codeword length is represented. */ +#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 + +/* The first offset slot which requires an aligned offset symbol in aligned + * offset blocks. */ +#define LZX_MIN_ALIGNED_OFFSET_SLOT 8 + +/* The offset slot base for LZX_MIN_ALIGNED_OFFSET_SLOT. */ +#define LZX_MIN_ALIGNED_OFFSET 14 + +/* The maximum number of extra offset bits in verbatim blocks. (One would need + * to subtract LZX_NUM_ALIGNED_OFFSET_BITS to get the number of extra offset + * bits in *aligned* blocks.) */ +#define LZX_MAX_NUM_EXTRA_BITS 17 + +/* Maximum lengths (in bits) for length-limited Huffman code construction. */ +#define LZX_MAX_MAIN_CODEWORD_LEN 16 +#define LZX_MAX_LEN_CODEWORD_LEN 16 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) + +/* For LZX-compressed blocks in WIM resources, this value is always used as the + * filesize parameter for the call instruction (0xe8 byte) preprocessing, even + * though the blocks themselves are not this size, and the size of the actual + * file resource in the WIM file is very likely to be something entirely + * different as well. 
*/ +#define LZX_WIM_MAGIC_FILESIZE 12000000 + +/* Assumed LZX block size when the encoded block size begins with a 0 bit. + * This is probably WIM-specific. */ +#define LZX_DEFAULT_BLOCK_SIZE 32768 + +#define LZX_E8_FILTER_TAIL_SIZE 10 + +/* Number of offsets in the recent (or "repeat") offsets queue. */ +#define LZX_NUM_RECENT_OFFSETS 3 + +/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_ADJUSTMENT). */ +#define LZX_OFFSET_ADJUSTMENT (LZX_NUM_RECENT_OFFSETS - 1) + +#endif /* _LZX_CONSTANTS_H */ diff --git a/dlls/cabinet/liblzx_matchfinder_common.h b/dlls/cabinet/liblzx_matchfinder_common.h new file mode 100644 index 00000000000..37f16fc3fa1 --- /dev/null +++ b/dlls/cabinet/liblzx_matchfinder_common.h @@ -0,0 +1,131 @@ +/* + * matchfinder_common.h - common code for Lempel-Ziv matchfinding + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_MATCHFINDER_COMMON_H +#define _LIBLZX_MATCHFINDER_COMMON_H + +#include "liblzx_bitops.h" +#include "liblzx_unaligned.h" + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. + */ +static attrib_forceinline uint32_t +loaded_u32_to_u24(uint32_t v) +{ + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value. + * The order in which the 3 bytes will be arranged as octets in the 24 bits is + * platform-dependent. At least 4 bytes (not 3) must be available at @p. + */ +static attrib_forceinline uint32_t +load_u24_unaligned(const uint8_t *p) +{ +#if UNALIGNED_ACCESS_IS_FAST + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else + if (CPU_IS_LITTLE_ENDIAN()) + return ((uint32_t)p[0] << 0) | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16); + else + return ((uint32_t)p[2] << 0) | ((uint32_t)p[1] << 8) | ((uint32_t)p[0] << 16); +#endif +} + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. 
Discard any
+ * bits of the product that don't fit in a 32-bit value, but take the
+ * next-highest @num_bits bits of the product as the hash value, as those have
+ * the most randomness.
+ */
+static attrib_forceinline uint32_t
+lz_hash(uint32_t seq, unsigned num_bits)
+{
+        return (uint32_t)(seq * 0x1E35A7BD) >> (32 - num_bits);
+}
+
+/*
+ * Return the number of bytes at @matchptr that match the bytes at @strptr, up
+ * to a maximum of @max_len. Initially, @start_len bytes are matched.
+ */
+static attrib_forceinline unsigned
+lz_extend(const uint8_t * const strptr, const uint8_t * const matchptr,
+          const unsigned start_len, const unsigned max_len)
+{
+        unsigned len = start_len;
+        machine_word_t v_word;
+
+        if (UNALIGNED_ACCESS_IS_FAST) {
+
+                if (likely(max_len - len >= 4 * WORDBYTES)) {
+
+                #define COMPARE_WORD_STEP                               \
+                        v_word = load_word_unaligned(&matchptr[len]) ^  \
+                                 load_word_unaligned(&strptr[len]);     \
+                        if (v_word != 0)                                \
+                                goto word_differs;                      \
+                        len += WORDBYTES;                               \
+
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                #undef COMPARE_WORD_STEP
+                }
+
+                while (len + WORDBYTES <= max_len) {
+                        v_word = load_word_unaligned(&matchptr[len]) ^
+                                 load_word_unaligned(&strptr[len]);
+                        if (v_word != 0)
+                                goto word_differs;
+                        len += WORDBYTES;
+                }
+        }
+
+        while (len < max_len && matchptr[len] == strptr[len])
+                len++;
+        return len;
+
+word_differs:
+        if (CPU_IS_LITTLE_ENDIAN())
+                len += (bsfw(v_word) >> 3);
+        else
+                len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
+        return len;
+}
+
+#endif /* _LIBLZX_MATCHFINDER_COMMON_H */
diff --git a/dlls/cabinet/liblzx_minmax.h b/dlls/cabinet/liblzx_minmax.h
new file mode 100644
index 00000000000..d4d76130caa
--- /dev/null
+++ b/dlls/cabinet/liblzx_minmax.h
@@ -0,0 +1,122 @@
+/*
+ * minmax.h
+ *
+ * Minimum/maximum helpers that avoid multiple evaluation.
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _LIBLZX_MINMAX_H
+#define _LIBLZX_MINMAX_H
+
+#include "liblzx_compiler.h"
+#include "liblzx_types.h"
+
+/* Get the minimum of two variables, without multiple evaluation. */
+static attrib_forceinline double
+min_double(double a, double b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline float
+min_float(float a, float b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline unsigned
+min_uint(unsigned a, unsigned b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline uint32_t
+min_u32(uint32_t a, uint32_t b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline size_t
+min_size(size_t a, size_t b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline int
+min_int(int a, int b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline void *
+min_ptr(void *a, void *b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline const void *
+min_constptr(const void *a, const void *b)
+{
+        return (a < b) ? a : b;
+}
+
+/* Get the maximum of two variables, without multiple evaluation. */
+static attrib_forceinline double
+max_double(double a, double b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline unsigned
+max_uint(unsigned a, unsigned b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline uint32_t
+max_u32(uint32_t a, uint32_t b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline uint64_t
+max_u64(uint64_t a, uint64_t b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline void *
+max_ptr(void *a, void *b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline const void *
+max_constptr(const void *a, const void *b)
+{
+        return (a > b) ? a : b;
+}
+
+#endif /* _LIBLZX_MINMAX_H */
diff --git a/dlls/cabinet/liblzx_types.h b/dlls/cabinet/liblzx_types.h
new file mode 100644
index 00000000000..b63e4b31fc1
--- /dev/null
+++ b/dlls/cabinet/liblzx_types.h
@@ -0,0 +1,33 @@
+#ifndef _LIBLZX_TYPES_H
+#define _LIBLZX_TYPES_H
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "liblzx_compiler.h"
+
+/* Unsigned little endian types of exact size */
+typedef uint16_t _bitwise_attr le16_t;
+typedef uint32_t _bitwise_attr le32_t;
+typedef uint64_t _bitwise_attr le64_t;
+
+/* Unsigned big endian types of exact size */
+typedef uint16_t _bitwise_attr be16_t;
+typedef uint32_t _bitwise_attr be32_t;
+typedef uint64_t _bitwise_attr be64_t;
+
+/* A pointer to 'utf16lechar' indicates a UTF-16LE encoded string */
+typedef le16_t utf16lechar;
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDBYTES sizeof(machine_word_t)
+#define WORDBITS (8 * WORDBYTES)
+
+#endif /* _LIBLZX_TYPES_H */
diff --git a/dlls/cabinet/liblzx_unaligned.h b/dlls/cabinet/liblzx_unaligned.h
new file mode 100644
index 00000000000..23ee938fb62
--- /dev/null
+++ b/dlls/cabinet/liblzx_unaligned.h
@@ -0,0 +1,134 @@
+/*
+ * unaligned.h - inline functions for unaligned memory accesses
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_UNALIGNED_H +#define _LIBLZX_UNALIGNED_H + +#include <string.h> + +#include "liblzx_compiler.h" +#include "liblzx_endianness.h" +#include "liblzx_types.h" + +#define DEFINE_UNALIGNED_TYPE(name, type) \ +static attrib_forceinline type \ +load_##name##_unaligned(const void *p) \ +{ \ + type v; \ + memcpy(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static attrib_forceinline void \ +store_##name##_unaligned(type v, void *p) \ +{ \ + memcpy(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16, uint16_t); +DEFINE_UNALIGNED_TYPE(u32, uint32_t); +DEFINE_UNALIGNED_TYPE(u64, uint64_t); +DEFINE_UNALIGNED_TYPE(le16, le16_t); +DEFINE_UNALIGNED_TYPE(le32, le32_t); +DEFINE_UNALIGNED_TYPE(le64, le64_t); +DEFINE_UNALIGNED_TYPE(be16, be16_t); +DEFINE_UNALIGNED_TYPE(be32, be32_t); +DEFINE_UNALIGNED_TYPE(be64, be64_t); +DEFINE_UNALIGNED_TYPE(size_t, size_t); +DEFINE_UNALIGNED_TYPE(machine_word_t, machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static attrib_forceinline uint16_t +get_unaligned_le16(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_to_cpu(load_le16_unaligned(p)); + else + return ((uint16_t)p[1] << 8) | p[0]; +} + +static attrib_forceinline uint32_t +get_unaligned_le32(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_to_cpu(load_le32_unaligned(p)); + else + return ((uint32_t)p[3] << 24) | ((uint32_t)p[2] << 16) | + ((uint32_t)p[1] << 8) | p[0]; +} + +static attrib_forceinline uint32_t +get_unaligned_be32(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_to_cpu(load_be32_unaligned(p)); + else + return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | + ((uint32_t)p[2] << 8) | p[3]; +} + +static attrib_forceinline void +put_unaligned_le16(uint16_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le16_unaligned(cpu_to_le16(v), p); + } else { + p[0] = (uint8_t)(v >> 0); + p[1] = (uint8_t)(v >> 8); + } +} + +static attrib_forceinline void +put_unaligned_le32(uint32_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le32_unaligned(cpu_to_le32(v), p); + } else { + p[0] = (uint8_t)(v >> 0); + p[1] = (uint8_t)(v >> 8); + p[2] = (uint8_t)(v >> 16); + p[3] = (uint8_t)(v >> 24); + } +} + +static attrib_forceinline void +put_unaligned_be32(uint32_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_be32_unaligned(cpu_to_be32(v), p); + } else { + p[0] = (uint8_t)(v >> 24); + p[1] = (uint8_t)(v >> 16); + p[2] = (uint8_t)(v >> 8); + p[3] = (uint8_t)(v >> 0); + } +} + +#endif /* _LIBLZX_UNALIGNED_H */ diff --git a/dlls/cabinet/liblzx_util.h b/dlls/cabinet/liblzx_util.h new file mode 100644 index 00000000000..4d832ae0da6 --- /dev/null +++ b/dlls/cabinet/liblzx_util.h @@ -0,0 +1,20 @@ +/* + * util.h - utility functions and macros + */ +#ifndef _LIBLZX_UTIL_H +#define _LIBLZX_UTIL_H + +#include "liblzx_compiler.h" +#include "liblzx_types.h" + +/**************** + * General macros + *****************/ + +/* Calculate 'n / d', but 
round up instead of down. */ +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +/* Get the number of elements of an array type. */ +#define ARRAY_LEN(array) (sizeof(array) / sizeof((array)[0])) + +#endif /* _LIBLZX_UTIL_H */ diff --git a/dlls/cabinet/tests/extract.c b/dlls/cabinet/tests/extract.c index 2e37dc7cda6..748115d67fd 100644 --- a/dlls/cabinet/tests/extract.c +++ b/dlls/cabinet/tests/extract.c @@ -104,9 +104,11 @@ static void create_test_files(void)
createTestFile("a.txt"); createTestFile("b.txt"); + createTestFile("c.txt"); + createTestFile("d.txt"); CreateDirectoryA("testdir", NULL); - createTestFile("testdir\c.txt"); - createTestFile("testdir\d.txt"); + createTestFile("testdir\e.txt"); + createTestFile("testdir\f.txt"); CreateDirectoryA("dest", NULL); }
@@ -114,11 +116,13 @@ static void delete_test_files(void)
 {
     DeleteFileA("a.txt");
     DeleteFileA("b.txt");
-    DeleteFileA("testdir\\c.txt");
-    DeleteFileA("testdir\\d.txt");
+    DeleteFileA("c.txt");
+    DeleteFileA("d.txt");
+    DeleteFileA("testdir\\e.txt");
+    DeleteFileA("testdir\\f.txt");
     RemoveDirectoryA("testdir");
- DeleteFileA("extract.cab"); + //DeleteFileA("extract.cab"); }
/* the FCI callbacks */ @@ -269,7 +273,7 @@ static INT_PTR CDECL get_open_info(char *pszName, USHORT *pdate, USHORT *ptime, return (INT_PTR)handle; }
-static void add_file(HFCI hfci, char *file) +static void add_file(HFCI hfci, char *file, TCOMP typeCompress) { char path[MAX_PATH]; BOOL res; @@ -279,7 +283,7 @@ static void add_file(HFCI hfci, char *file) lstrcatA(path, file);
res = FCIAddFile(hfci, path, file, FALSE, get_next_cabinet, progress, - get_open_info, tcompTYPE_MSZIP); + get_open_info, typeCompress); ok(res, "Expected FCIAddFile to succeed\n"); }
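On the compression-type values used below: TCOMPfromLZXWindow() packs an LZX window
order (15..21, per LZX_MIN_WINDOW_ORDER/LZX_MAX_WINDOW_ORDER in
liblzx_lzx_constants.h) into a TCOMP value for FCIAddFile(). On the encoder side,
the constants file states that an explicit match offset n is transmitted as
n + LZX_OFFSET_ADJUSTMENT; the usual reading is that adjusted values 0..2 then name
entries of the three-slot recent-offsets queue. A small illustration of that mapping
(illustrative only, not code from the patch):

    /* Illustration of the offset adjustment defined in
     * liblzx_lzx_constants.h: values below LZX_NUM_RECENT_OFFSETS select a
     * recent offset; anything else is the real offset plus
     * LZX_OFFSET_ADJUSTMENT (2). */
    static uint32_t adjust_offset(uint32_t offset, const uint32_t recent[3])
    {
        unsigned i;

        for (i = 0; i < 3; i++) {   /* LZX_NUM_RECENT_OFFSETS */
            if (offset == recent[i])
                return i;           /* repeat-offset code */
        }
        return offset + 2;          /* LZX_OFFSET_ADJUSTMENT */
    }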
@@ -300,10 +304,12 @@ static void create_cab_file(void)
     CCAB cabParams;
     HFCI hfci;
     ERF erf;
-    static CHAR a_txt[] = "a.txt",
-                b_txt[] = "b.txt",
-                testdir_c_txt[] = "testdir\\c.txt",
-                testdir_d_txt[] = "testdir\\d.txt";
+    static CHAR a_txt[] = "a.txt",
+                b_txt[] = "b.txt",
+                c_txt[] = "c.txt",
+                d_txt[] = "d.txt",
+                testdir_e_txt[] = "testdir\\e.txt",
+                testdir_f_txt[] = "testdir\\f.txt";
     BOOL res;
set_cab_parameters(&cabParams); @@ -314,10 +320,12 @@ static void create_cab_file(void)
ok(hfci != NULL, "Failed to create an FCI context\n");
- add_file(hfci, a_txt); - add_file(hfci, b_txt); - add_file(hfci, testdir_c_txt); - add_file(hfci, testdir_d_txt); + add_file(hfci, a_txt, tcompTYPE_MSZIP); + add_file(hfci, b_txt, tcompTYPE_MSZIP); + add_file(hfci, c_txt, TCOMPfromLZXWindow(21)); + add_file(hfci, d_txt, TCOMPfromLZXWindow(21)); + add_file(hfci, testdir_e_txt, tcompTYPE_MSZIP); + add_file(hfci, testdir_f_txt, tcompTYPE_MSZIP);
     res = FCIFlushCabinet(hfci, FALSE, get_next_cabinet, progress);
     ok(res, "Failed to flush the cabinet\n");
@@ -380,26 +388,30 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == (EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES),
        "Expected EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
     ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
@@ -411,23 +423,25 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_FILLFILELIST,
        "Expected EXTRACT_FILLFILELIST, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", TRUE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", TRUE), "list entry wrong\n");
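An aside on the match finders backing the new compressor: liblzx_matchfinder_common.h
locates candidates by hashing the first 3 bytes at each position. load_u24_unaligned()
packs them into the low 24 bits of a word and lz_hash() multiplies by 0x1E35A7BD,
keeping the top num_bits bits of the 32-bit product as the table index. Restated
portably in isolation:

    /* Standalone restatement of load_u24_unaligned() + lz_hash() from
     * liblzx_matchfinder_common.h. The real code may pack the 3 bytes in
     * either endian order; only consistency within one run matters. */
    static uint32_t hash_3_bytes(const unsigned char *p, unsigned num_bits)
    {
        uint32_t seq = (uint32_t)p[0] |
                       ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16);
        return (seq * 0x1E35A7BD) >> (32 - num_bits);
    }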
@@ -436,27 +450,31 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
     ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
     ok(RemoveDirectoryA("dest"), "Expected dest to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
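A brief note on liblzx_minmax.h above: the typed inline helpers exist because the
conventional min/max macros evaluate each argument twice, which misfires when an
argument has side effects. For example (illustrative, using min_uint() from the
patch):

    #define MIN_MACRO(a, b) ((a) < (b) ? (a) : (b))

    static unsigned side_effect_demo(const unsigned *arr, unsigned limit)
    {
        unsigned i = 0;
        /* The macro expands arr[i++] twice, so i can advance by 2 here. */
        unsigned v = MIN_MACRO(arr[i++], limit);
        /* The inline helper evaluates each argument exactly once. */
        unsigned w = min_uint(arr[i++], limit);
        return v + w + i;
    }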
@@ -464,25 +482,29 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to not exist\n");
+    ok(!DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
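Likewise for liblzx_unaligned.h: DEFINE_UNALIGNED_TYPE generates loads and stores
through memcpy() of a known constant size, which is defined behavior at any
alignment and which optimizing compilers reduce to a single move instruction on
targets where unaligned access is cheap. The pattern in isolation:

    #include <stdint.h>
    #include <string.h>

    /* Same technique as DEFINE_UNALIGNED_TYPE in liblzx_unaligned.h:
     * memcpy() with a compile-time-constant size is the portable way to
     * read an integer from a possibly misaligned address. */
    static uint32_t load_u32_any_alignment(const void *p)
    {
        uint32_t v;
        memcpy(&v, p, sizeof(v));  /* folds to one load where the target allows */
        return v;
    }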
@@ -497,26 +519,30 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(!DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "c.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
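Finally, on the lzx_preprocess()/lzx_postprocess() pair used by lzx_compress_chunk():
LZX's x86 filter rewrites the 32-bit relative displacement of each call opcode (0xe8)
into an absolute target so that identical call sites compress identically, and it
leaves the last LZX_E8_FILTER_TAIL_SIZE (10) bytes untouched. A simplified sketch of
the forward direction; the exact range checks follow the LZX specification and are
only approximated here:

    /* Simplified sketch of LZX E8 preprocessing (not the patch's exact code):
     * rewrite each x86 call displacement as an absolute target. Uses
     * get/put_unaligned_le32() from liblzx_unaligned.h. */
    static void e8_forward_sketch(uint8_t *data, size_t size,
                                  uint32_t chunk_offset, uint32_t file_size)
    {
        size_t i;

        if (size <= 10)  /* LZX_E8_FILTER_TAIL_SIZE */
            return;

        for (i = 0; i < size - 10; i++) {
            if (data[i] == 0xe8) {
                uint32_t pos = chunk_offset + (uint32_t)i;
                int32_t rel = (int32_t)get_unaligned_le32(&data[i + 1]);

                /* Translate only displacements that land inside the
                 * nominal file. */
                if (rel >= -(int32_t)pos && rel < (int32_t)file_size)
                    put_unaligned_le32((uint32_t)rel + pos, &data[i + 1]);
                i += 4;  /* skip the 4 operand bytes just examined */
            }
        }
    }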
@@ -525,25 +551,25 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_FILLFILELIST,
        "Expected EXTRACT_FILLFILELIST, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry wrong\n");
@@ -551,24 +577,26 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == 0, "Expected 0, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", TRUE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", TRUE), "list entry wrong\n");
@@ -577,28 +605,32 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == 0, "Expected 0, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     node = session.FilterList;
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
@@ -625,10 +657,10 @@ static void test_Extract(void)
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry should not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry should not exist\n");
     free_file_list(&session);
@@ -660,10 +692,10 @@ static void test_Extract(void)
     ok(getFileSize("dest\\a.txt") == 11, "Expected dest\\a.txt to be 11 bytes\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to be read-only\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry should not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry should not exist\n");
     free_file_list(&session);
@@ -673,8 +705,8 @@ static void test_Extract(void)
     /* first file exists and is writable, third file exists but is read-only */
     createTestFile("dest\\a.txt");
-    createTestFile("dest\\testdir\\c.txt");
-    SetFileAttributesA("dest\\testdir\\c.txt", FILE_ATTRIBUTE_READONLY);
+    createTestFile("dest\\testdir\\e.txt");
+    SetFileAttributesA("dest\\testdir\\e.txt", FILE_ATTRIBUTE_READONLY);
     ZeroMemory(&session, sizeof(SESSION));
     lstrcpyA(session.Destination, "dest");
     session.Operation = EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES;
@@ -682,12 +714,12 @@ static void test_Extract(void)
     node = session.FileList;
     ok(res == HRESULT_FROM_WIN32(ERROR_ACCESS_DENIED) || res == E_FAIL,
        "Expected HRESULT_FROM_WIN32(ERROR_ACCESS_DENIED) or E_FAIL, got %08lx\n", res);
-    ok(session.FileSize == 26, "Expected 26, got %d\n", session.FileSize);
+    ok(session.FileSize == 38, "Expected 38, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_USER_ABORT,
        "Expected FDIERROR_USER_ABORT, got %d\n", session.Error.erfOper);
     ok(session.Error.fError == TRUE, "Expected TRUE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 3, "Expected 3, got %d\n", session.FileCount);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\c.txt"),
+    ok(session.FileCount == 5, "Expected 5, got %d\n", session.FileCount);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\e.txt"),
        "Expected dest\\c.txt, got %s\n", session.CurrentFile);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Operation == (EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES),
@@ -698,19 +730,21 @@ static void test_Extract(void)
     ok(getFileSize("dest\\a.txt") == 6, "Expected dest\\a.txt to be 6 bytes\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to be read-only\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to be read-only\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", TRUE), "list entry wrong\n");
     free_file_list(&session);
-    SetFileAttributesA("dest\\testdir\\c.txt", FILE_ATTRIBUTE_NORMAL);
-    DeleteFileA("dest\\testdir\\c.txt");
+    SetFileAttributesA("dest\\testdir\\e.txt", FILE_ATTRIBUTE_NORMAL);
+    DeleteFileA("dest\\testdir\\e.txt");
-    ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
-    ok(RemoveDirectoryA("dest"), "Expected dest to exist\n");
+    ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist and be empty\n");
+    ok(RemoveDirectoryA("dest"), "Expected dest to exist and be empty\n");
 }
START_TEST(extract)