From: elasota <1137273+elasota@users.noreply.github.com>
---
 dlls/cabinet/Makefile.in                 |    5 +-
 dlls/cabinet/fci.c                       |  253 +-
 dlls/cabinet/liblzx.h                    |  141 +
 dlls/cabinet/liblzx_bitops.h             |  156 +
 dlls/cabinet/liblzx_bt_matchfinder.h     |  446 +++
 dlls/cabinet/liblzx_compiler.h           |  214 ++
 dlls/cabinet/liblzx_compress_common.c    |  673 ++++
 dlls/cabinet/liblzx_compress_common.h    |   19 +
 dlls/cabinet/liblzx_config.h             |   29 +
 dlls/cabinet/liblzx_endianness.h         |  136 +
 dlls/cabinet/liblzx_error.h              |   11 +
 dlls/cabinet/liblzx_hc_matchfinder.h     |  432 +++
 dlls/cabinet/liblzx_lzx_common.c         |  325 ++
 dlls/cabinet/liblzx_lzx_common.h         |   29 +
 dlls/cabinet/liblzx_lzx_compress.c       | 3662 ++++++++++++++++++++++
 dlls/cabinet/liblzx_lzx_constants.h      |  108 +
 dlls/cabinet/liblzx_matchfinder_common.h |  131 +
 dlls/cabinet/liblzx_minmax.h             |  122 +
 dlls/cabinet/liblzx_types.h              |   33 +
 dlls/cabinet/liblzx_unaligned.h          |  134 +
 dlls/cabinet/liblzx_util.h               |   20 +
 dlls/cabinet/tests/extract.c             |  240 +-
 22 files changed, 7181 insertions(+), 138 deletions(-)
 create mode 100644 dlls/cabinet/liblzx.h
 create mode 100644 dlls/cabinet/liblzx_bitops.h
 create mode 100644 dlls/cabinet/liblzx_bt_matchfinder.h
 create mode 100644 dlls/cabinet/liblzx_compiler.h
 create mode 100644 dlls/cabinet/liblzx_compress_common.c
 create mode 100644 dlls/cabinet/liblzx_compress_common.h
 create mode 100644 dlls/cabinet/liblzx_config.h
 create mode 100644 dlls/cabinet/liblzx_endianness.h
 create mode 100644 dlls/cabinet/liblzx_error.h
 create mode 100644 dlls/cabinet/liblzx_hc_matchfinder.h
 create mode 100644 dlls/cabinet/liblzx_lzx_common.c
 create mode 100644 dlls/cabinet/liblzx_lzx_common.h
 create mode 100644 dlls/cabinet/liblzx_lzx_compress.c
 create mode 100644 dlls/cabinet/liblzx_lzx_constants.h
 create mode 100644 dlls/cabinet/liblzx_matchfinder_common.h
 create mode 100644 dlls/cabinet/liblzx_minmax.h
 create mode 100644 dlls/cabinet/liblzx_types.h
 create mode 100644 dlls/cabinet/liblzx_unaligned.h
 create mode 100644 dlls/cabinet/liblzx_util.h
diff --git a/dlls/cabinet/Makefile.in b/dlls/cabinet/Makefile.in index a72e5a31be6..40db872bbfd 100644 --- a/dlls/cabinet/Makefile.in +++ b/dlls/cabinet/Makefile.in @@ -7,4 +7,7 @@ SOURCES = \ cabinet.rc \ cabinet_main.c \ fci.c \ - fdi.c + fdi.c \ + liblzx_compress_common.c \ + liblzx_lzx_common.c \ + liblzx_lzx_compress.c diff --git a/dlls/cabinet/fci.c b/dlls/cabinet/fci.c index e62399db1ba..87a29623656 100644 --- a/dlls/cabinet/fci.c +++ b/dlls/cabinet/fci.c @@ -47,6 +47,8 @@ There is still some work to be done: #include "wine/list.h" #include "wine/debug.h"
+#include "liblzx.h" + WINE_DEFAULT_DEBUG_CHANNEL(cabinet);
#ifdef WORDS_BIGENDIAN @@ -165,11 +167,13 @@ typedef struct FCI_Int char szPrevDisk[CB_MAX_DISK_NAME]; /* disk name of previous cabinet */ unsigned char data_in[CAB_BLOCKMAX]; /* uncompressed data blocks */ unsigned char data_out[2 * CAB_BLOCKMAX]; /* compressed data blocks */ + BOOL have_data_out; cab_UWORD cdata_in; ULONG cCompressedBytesInFolder; cab_UWORD cFolders; cab_UWORD cFiles; - cab_ULONG cDataBlocks; + cab_ULONG cDataBlocksIn; + cab_ULONG cDataBlocksOut; cab_ULONG cbFileRemainder; /* uncompressed, yet to be written data */ /* of spanned file of a spanning folder of a spanning cabinet */ struct temp_file data; @@ -185,6 +189,9 @@ typedef struct FCI_Int cab_ULONG folders_data_size; /* total size of data contained in the current folders */ TCOMP compression; cab_UWORD (*compress)(struct FCI_Int *); + cab_UWORD (*flush)(struct FCI_Int *); + void (*compress_shutdown)(struct FCI_Int *); + struct liblzx_compressor *lzx_compressor; } FCI_Int;
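For orientation, the three new hooks split the per-codec work as sketched below (a hypothetical helper, condensed from add_data_blocks() further down; not part of the patch): compress() consumes data_in and reports how many bytes are now valid in data_out, flush() (left NULL for NONE and MSZIP) drains whatever a streaming codec still buffers at folder end, and compress_shutdown() releases per-codec state. At folder end, add_data_blocks() keeps calling flush() until it returns 0, since more than one chunk can still be buffered.

    /* Sketch only: the intended contract of the new FCI_Int hooks. */
    static cab_UWORD produce_block( FCI_Int *fci, BOOL is_last_block )
    {
        /* May return 0 when a streaming codec (LZX) buffered the whole
         * input without completing a chunk. */
        cab_UWORD out = fci->compress( fci );

        /* At folder end, drain data the codec still holds. */
        if (!out && is_last_block && fci->flush) out = fci->flush( fci );

        return out;  /* bytes now valid in fci->data_out */
    }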
#define FCI_INT_MAGIC 0xfcfcfc05 @@ -274,7 +281,7 @@ static struct file *add_file( FCI_Int *fci, const char *filename ) return NULL; } file->size = 0; - file->offset = fci->cDataBlocks * CAB_BLOCKMAX + fci->cdata_in; + file->offset = fci->cDataBlocksIn * CAB_BLOCKMAX + fci->cdata_in; file->folder = fci->cFolders; file->date = 0; file->time = 0; @@ -305,43 +312,82 @@ static void free_file( FCI_Int *fci, struct file *file ) fci->free( file ); }
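For concreteness (hypothetical numbers): with two full blocks already queued for the folder (cDataBlocksIn = 2) and 100 bytes pending in data_in, the next file added starts at uncompressed folder offset 2 * 32768 + 100 = 65636. The rename matters because this offset must be computed from input blocks; for LZX the count of emitted CFDATA blocks (cDataBlocksOut) can lag behind it.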
-/* create a new data block for the data in fci->data_in */ -static BOOL add_data_block( FCI_Int *fci, PFNFCISTATUS status_callback ) +/* creates new data blocks for the data in fci->data_in */ +static BOOL add_data_blocks( FCI_Int *fci, BOOL is_last_block, PFNFCISTATUS status_callback ) { int err; struct data_block *block; + cab_UWORD compressed_size = 0; + cab_UWORD uncompressed_size = fci->cdata_in;
- if (!fci->cdata_in) return TRUE; + if (!uncompressed_size) + { + if (fci->cDataBlocksIn == 0 || !is_last_block) return TRUE; + }
if (fci->data.handle == -1 && !create_temp_file( fci, &fci->data )) return FALSE;
- if (!(block = fci->alloc( sizeof(*block) ))) + if (uncompressed_size) { - set_error( fci, FCIERR_ALLOC_FAIL, ERROR_NOT_ENOUGH_MEMORY ); - return FALSE; + compressed_size = fci->compress( fci ); + + fci->cdata_in = 0; + fci->cDataBlocksIn++; } - block->uncompressed = fci->cdata_in; - block->compressed = fci->compress( fci );
- if (fci->write( fci->data.handle, fci->data_out, - block->compressed, &err, fci->pv ) != block->compressed) + if (compressed_size == 0 && is_last_block && fci->flush) { - set_error( fci, FCIERR_TEMP_FILE, err ); - fci->free( block ); - return FALSE; + compressed_size = fci->flush( fci ); }
- fci->cdata_in = 0; - fci->pending_data_size += sizeof(CFDATA) + fci->ccab.cbReserveCFData + block->compressed; - fci->cCompressedBytesInFolder += block->compressed; - fci->cDataBlocks++; - list_add_tail( &fci->blocks_list, &block->entry ); - - if (status_callback( statusFile, block->compressed, block->uncompressed, fci->pv ) == -1) + while (compressed_size > 0) { - set_error( fci, FCIERR_USER_ABORT, 0 ); - return FALSE; + if (!(block = fci->alloc( sizeof(*block) ))) + { + set_error( fci, FCIERR_ALLOC_FAIL, ERROR_NOT_ENOUGH_MEMORY ); + return FALSE; + } + + if (is_last_block && fci->cDataBlocksIn - 1 == fci->cDataBlocksOut) + { + block->uncompressed = uncompressed_size; + } + else + { + block->uncompressed = CAB_BLOCKMAX; + } + + block->compressed = compressed_size; + + if (fci->write( fci->data.handle, fci->data_out, + block->compressed, &err, fci->pv ) != block->compressed) + { + set_error( fci, FCIERR_TEMP_FILE, err ); + fci->free( block ); + return FALSE; + } + + fci->pending_data_size += sizeof(CFDATA) + fci->ccab.cbReserveCFData + block->compressed; + fci->cCompressedBytesInFolder += block->compressed; + fci->cDataBlocksOut++; + list_add_tail( &fci->blocks_list, &block->entry ); + + if (status_callback( statusFile, block->compressed, block->uncompressed, fci->pv ) == -1) + { + set_error( fci, FCIERR_USER_ABORT, 0 ); + return FALSE; + } + + if (is_last_block && fci->flush) + { + compressed_size = fci->flush( fci ); + } + else + { + compressed_size = 0; + } } + return TRUE; }
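The loop above replaces the old one-in/one-out assumption. A self-contained toy trace (an illustrative lag model only, not the patch's code) shows why input and output block counts must be tracked separately once a streaming codec sits in between:

    #include <stdio.h>

    #define CHUNK 32768u  /* CAB_BLOCKMAX == chunk_granularity here */

    /* Pretend the codec seals a chunk only once data beyond it arrives
     * (or at end of input), as a streaming compressor may. */
    int main(void)
    {
        unsigned pending = 0, blocks_in = 0, blocks_out = 0, i;
        const unsigned inputs[] = { CHUNK, CHUNK, 1000 }; /* partial tail */

        for (i = 0; i < 3; i++) {
            pending += inputs[i];
            blocks_in++;
            while (pending > CHUNK) {  /* a chunk is complete and sealed */
                pending -= CHUNK;
                blocks_out++;
            }
            printf("in=%u out=%u\n", blocks_in, blocks_out);
        }
        if (pending) blocks_out++;     /* folder flush seals the tail */
        printf("after flush: in=%u out=%u\n", blocks_in, blocks_out);
        return 0;
    }

Under this model the trace prints in=1 out=0, in=2 out=1, in=3 out=2, then after flush in=3 out=3 -- exactly the situation the is_last_block/flush path above exists to close out.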
@@ -377,7 +423,7 @@ static BOOL add_file_data( FCI_Int *fci, char *sourcefile, char *filename, BOOL } file->size += len; fci->cdata_in += len; - if (fci->cdata_in == CAB_BLOCKMAX && !add_data_block( fci, status_callback )) return FALSE; + if (fci->cdata_in == CAB_BLOCKMAX && !add_data_blocks( fci, FALSE, status_callback )) return FALSE; } fci->close( handle, &err, fci->pv ); return TRUE; @@ -824,7 +870,8 @@ static BOOL add_data_to_folder( FCI_Int *fci, struct folder *folder, cab_ULONG * } if (split_block) break; free_data_block( fci, block ); - fci->cDataBlocks--; + fci->cDataBlocksIn--; + fci->cDataBlocksOut--; }
if (list_empty( &fci->blocks_list )) return TRUE; @@ -905,6 +952,10 @@ static cab_UWORD compress_NONE( FCI_Int *fci ) return fci->cdata_in; }
+static void shutdown_NONE(FCI_Int *fci) +{ +} + static void *zalloc( void *opaque, unsigned int items, unsigned int size ) { FCI_Int *fci = opaque; @@ -938,9 +989,118 @@ static cab_UWORD compress_MSZIP( FCI_Int *fci ) fci->data_out[1] = 'K'; deflate( &stream, Z_FINISH ); deflateEnd( &stream ); + fci->have_data_out = TRUE; return stream.total_out + 2; }
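For reference, each MSZIP data block is the two-byte 'CK' signature followed by a DEFLATE stream (RFC 1951). A minimal zlib-based sketch of reading back one such block (hypothetical helper; real CAB decoding must also carry the 32 KiB history across blocks via inflateSetDictionary(), which is omitted here):

    #include <string.h>
    #include <zlib.h>

    static int mszip_inflate_block(const unsigned char *in, unsigned in_len,
                                   unsigned char *out, unsigned out_len)
    {
        z_stream zs;
        int ret;

        if (in_len < 2 || in[0] != 'C' || in[1] != 'K') return -1;

        memset(&zs, 0, sizeof(zs));
        if (inflateInit2(&zs, -MAX_WBITS) != Z_OK) return -1; /* raw deflate */

        zs.next_in   = (unsigned char *)in + 2;  /* skip the CK signature */
        zs.avail_in  = in_len - 2;
        zs.next_out  = out;
        zs.avail_out = out_len;

        ret = inflate(&zs, Z_FINISH);
        inflateEnd(&zs);
        return ret == Z_STREAM_END ? (int)zs.total_out : -1;
    }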
+static void shutdown_MSZIP( FCI_Int *fci )
+{
+}
+
+static void shutdown_LZX(FCI_Int *fci)
+{
+    liblzx_compress_destroy(fci->lzx_compressor);
+    fci->lzx_compressor = NULL;
+}
+
+static void *compress_LZX_alloc_callback(void *userdata, size_t size)
+{
+    FCI_Int *fci = (FCI_Int *)userdata;
+
+    return fci->alloc((ULONG)size);
+}
+
+static void compress_LZX_free_callback(void *userdata, void *ptr)
+{
+    FCI_Int *fci = (FCI_Int *)userdata;
+
+    fci->free(ptr);
+}
+
+static cab_UWORD compress_LZX(FCI_Int *fci)
+{
+    size_t in_digested = 0;
+    size_t compressed_size = 0;
+    const liblzx_output_chunk_t *out_chunk = NULL;
+
+    if (fci->cDataBlocksIn == 0) {
+        /* First block, restart compression */
+        int window_size_bits = LZXCompressionWindowFromTCOMP(fci->compression);
+        liblzx_compress_properties_t props;
+
+        if (fci->lzx_compressor)
+        {
+            liblzx_compress_destroy(fci->lzx_compressor);
+            fci->lzx_compressor = NULL;
+        }
+
+        memset(&props, 0, sizeof(props));
+        props.lzx_variant = LIBLZX_VARIANT_CAB_DELTA;
+        props.window_size = 1 << window_size_bits;
+        props.chunk_granularity = CAB_BLOCKMAX;
+        props.compression_level = 70;
+        props.e8_file_size = LIBLZX_CONST_DEFAULT_E8_FILE_SIZE;
+        props.alloc_func = compress_LZX_alloc_callback;
+        props.free_func = compress_LZX_free_callback;
+        props.userdata = fci;
+
+        fci->lzx_compressor = liblzx_compress_create(&props);
+
+        if (!fci->lzx_compressor)
+        {
+            set_error(fci, FCIERR_ALLOC_FAIL, ERROR_OUTOFMEMORY);
+            return 0;
+        }
+    }
+
+    while (in_digested < fci->cdata_in)
+    {
+        in_digested += liblzx_compress_add_input(fci->lzx_compressor, fci->data_in + in_digested, fci->cdata_in - in_digested);
+
+        if (out_chunk)
+        {
+            /* After producing an output chunk, all data should be digestible. */
+            assert(in_digested == fci->cdata_in);
+            break;
+        }
+
+        out_chunk = liblzx_compress_get_next_chunk(fci->lzx_compressor);
+
+        if (out_chunk)
+        {
+            compressed_size = out_chunk->size;
+            memcpy(fci->data_out, out_chunk->data, compressed_size);
+            liblzx_compress_release_next_chunk(fci->lzx_compressor);
+
+            fci->have_data_out = TRUE;
+        }
+    }
+
+    return compressed_size;
+}
+
+static cab_UWORD flush_LZX(FCI_Int *fci)
+{
+    const liblzx_output_chunk_t *out_chunk = NULL;
+    cab_UWORD compressed_size = 0;
+
+    liblzx_compress_end_input(fci->lzx_compressor);
+    out_chunk = liblzx_compress_get_next_chunk(fci->lzx_compressor);
+
+    if (out_chunk == NULL)
+    {
+        return 0;
+    }
+
+    compressed_size = out_chunk->size;
+    memcpy(fci->data_out, out_chunk->data, out_chunk->size);
+
+    liblzx_compress_release_next_chunk(fci->lzx_compressor);
+
+    fci->have_data_out = TRUE;
+
+    return compressed_size;
+}
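Taken together, compress_LZX() and flush_LZX() exercise the entire liblzx streaming interface declared in liblzx.h. As a standalone illustration of that call sequence (a sketch only: compress_buffer() and the malloc-backed callbacks are hypothetical, error handling is elided):

    #include <stdlib.h>
    #include <string.h>
    #include "liblzx.h"

    static void *lzx_alloc(void *userdata, size_t size) { return malloc(size); }
    static void  lzx_free(void *userdata, void *ptr)    { free(ptr); }

    static void compress_buffer(const unsigned char *data, size_t size)
    {
        liblzx_compress_properties_t props;
        liblzx_compressor_t *c;
        const liblzx_output_chunk_t *chunk;
        size_t done = 0;

        memset(&props, 0, sizeof(props));
        props.lzx_variant       = LIBLZX_VARIANT_CAB_DELTA;
        props.window_size       = 1 << 15;  /* smallest CAB LZX window */
        props.chunk_granularity = LIBLZX_CONST_DEFAULT_CHUNK_SIZE;
        props.compression_level = 70;
        props.e8_file_size      = LIBLZX_CONST_DEFAULT_E8_FILE_SIZE;
        props.alloc_func        = lzx_alloc;
        props.free_func         = lzx_free;
        props.userdata          = NULL;

        if (!(c = liblzx_compress_create(&props))) return;

        while (done < size)
        {
            /* A short count means a chunk is pending and must be drained. */
            done += liblzx_compress_add_input(c, data + done, size - done);
            while ((chunk = liblzx_compress_get_next_chunk(c)))
            {
                /* write chunk->data, chunk->size to the output here */
                liblzx_compress_release_next_chunk(c);
            }
        }

        liblzx_compress_end_input(c);  /* seal the stream */
        while ((chunk = liblzx_compress_get_next_chunk(c)))
        {
            /* write the final chunk(s) */
            liblzx_compress_release_next_chunk(c);
        }

        liblzx_compress_destroy(c);
    }

In fci.c the window exponent instead comes from the caller: FCIAddFile() receives it in the tcompMASK_LZX_WINDOW bits of typeCompress (validated against tcompLZX_WINDOW_LO/HI below), and compress_LZX() turns it into a byte count with 1 << LZXCompressionWindowFromTCOMP(...).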
/*********************************************************************** * FCICreate (CABINET.10) @@ -1046,6 +1206,7 @@ HFCI __cdecl FCICreate( p_fci_internal->pv = pv; p_fci_internal->data.handle = -1; p_fci_internal->compress = compress_NONE; + p_fci_internal->compress_shutdown = shutdown_NONE;
list_init( &p_fci_internal->folders_list ); list_init( &p_fci_internal->files_list ); @@ -1102,11 +1263,12 @@ static BOOL fci_flush_folder( FCI_Int *p_fci_internal, p_fci_internal->fSplitFolder=FALSE;
/* START of COPY */ - if (!add_data_block( p_fci_internal, pfnfcis )) return FALSE; + if (!add_data_blocks( p_fci_internal, TRUE, pfnfcis )) return FALSE;
/* reset to get the number of data blocks of this folder which are */ /* actually in this cabinet ( at least partially ) */ - p_fci_internal->cDataBlocks=0; + p_fci_internal->cDataBlocksIn = 0; + p_fci_internal->cDataBlocksOut = 0;
p_fci_internal->statusFolderTotal = get_header_size( p_fci_internal ) + sizeof(CFFOLDER) + p_fci_internal->ccab.cbReserveCFFolder + @@ -1211,7 +1373,8 @@ static BOOL fci_flush_folder( FCI_Int *p_fci_internal, if (!add_files_to_folder( p_fci_internal, folder, payload )) return FALSE;
/* reset CFFolder specific information */ - p_fci_internal->cDataBlocks=0; + p_fci_internal->cDataBlocksIn=0; + p_fci_internal->cDataBlocksOut=0; p_fci_internal->cCompressedBytesInFolder=0;
return TRUE; @@ -1409,19 +1572,41 @@ BOOL __cdecl FCIAddFile(
if (typeCompress != p_fci_internal->compression) { + if ((typeCompress & tcompMASK_TYPE) == tcompTYPE_LZX) { + TCOMP window_size_bits = (typeCompress & tcompMASK_LZX_WINDOW); + + if (window_size_bits < tcompLZX_WINDOW_LO || window_size_bits > tcompLZX_WINDOW_HI) { + set_error(p_fci_internal, FCIERR_BAD_COMPR_TYPE, ERROR_BAD_ARGUMENTS); + return FALSE; + } + } + if (!FCIFlushFolder( hfci, pfnfcignc, pfnfcis )) return FALSE; - switch (typeCompress) + + p_fci_internal->compress_shutdown(p_fci_internal); + + switch (typeCompress & tcompMASK_TYPE) { case tcompTYPE_MSZIP: - p_fci_internal->compression = tcompTYPE_MSZIP; - p_fci_internal->compress = compress_MSZIP; + p_fci_internal->compression = tcompTYPE_MSZIP; + p_fci_internal->compress = compress_MSZIP; + p_fci_internal->flush = NULL; + p_fci_internal->compress_shutdown = shutdown_MSZIP; + break; + case tcompTYPE_LZX: + p_fci_internal->compression = typeCompress; + p_fci_internal->compress = compress_LZX; + p_fci_internal->flush = flush_LZX; + p_fci_internal->compress_shutdown = shutdown_LZX; break; default: FIXME( "compression %x not supported, defaulting to none\n", typeCompress ); /* fall through */ case tcompTYPE_NONE: - p_fci_internal->compression = tcompTYPE_NONE; - p_fci_internal->compress = compress_NONE; + p_fci_internal->compression = tcompTYPE_NONE; + p_fci_internal->compress = compress_NONE; + p_fci_internal->flush = NULL; + p_fci_internal->compress_shutdown = shutdown_NONE; break; } } diff --git a/dlls/cabinet/liblzx.h b/dlls/cabinet/liblzx.h new file mode 100644 index 00000000000..504a3289bac --- /dev/null +++ b/dlls/cabinet/liblzx.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + +#pragma once + +#ifndef __LIBLZX_H__ +#define __LIBLZX_H__ + +#include <stddef.h> +#include <stdint.h> + +#include "liblzx_error.h" + +typedef struct liblzx_internal liblzx_internal_t; +typedef struct liblzx_compress_properties liblzx_compress_properties_t; +typedef struct liblzx_compressor liblzx_compressor_t; +typedef struct liblzx_output_chunk liblzx_output_chunk_t; + +typedef void *(*liblzx_alloc_func_t)(void *opaque, size_t size); +typedef void (*liblzx_free_func_t)(void *opaque, void *ptr); + +enum liblzx_variant { + /* LZX variant used by CAB files and LZX DELTA */ + LIBLZX_VARIANT_CAB_DELTA, + + /* LZX variant used by WIM */ + LIBLZX_VARIANT_WIM, +}; + +typedef enum liblzx_variant liblzx_variant_t; + +enum liblzx_constant { + LIBLZX_CONST_DEFAULT_CHUNK_SIZE = 32768, + LIBLZX_CONST_DEFAULT_E8_FILE_SIZE = 12 * 1024 * 1024, + LIBLZX_CONST_MAX_WINDOW_SIZE = 64 * 1024 * 1024, +}; + +struct liblzx_output_chunk { + const void *data; + size_t size; +}; + +struct liblzx_compress_properties { + /* LZX variant to use */ + liblzx_variant_t lzx_variant; + + /* Source file size for LZX DELTA. Ignored for WIM. 
+         * When using this, use liblzx_compress_add_input to add the source
+         * file's data before adding the new file's data. For compression
+         * only, set this to 0.
+         */
+        size_t delta_source_size;
+
+        /* Compression window size. */
+        uint32_t window_size;
+
+        /* Granularity of a chunk. Should generally be set to
+         * LIBLZX_CONST_DEFAULT_CHUNK_SIZE.
+         */
+        uint32_t chunk_granularity;
+
+        /* Compression level. Can be set arbitrarily high. */
+        uint16_t compression_level;
+
+        /* E8 file size parameter. For WIM, this is ignored. For other
+         * variants, this value is expected to be user-controllable and
+         * is sent outside of the LZX data stream.
+         */
+        uint32_t e8_file_size;
+
+        /* Memory allocation function. */
+        liblzx_alloc_func_t alloc_func;
+
+        /* Memory free function. */
+        liblzx_free_func_t free_func;
+
+        /* Userdata parameter to pass to the alloc and free functions. */
+        void *userdata;
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Creates a compressor object and returns a pointer to it. */
+liblzx_compressor_t *
+liblzx_compress_create(const liblzx_compress_properties_t *props);
+
+/* Destroys a compressor object and releases all resources. */
+void
+liblzx_compress_destroy(liblzx_compressor_t *stream);
+
+/* Resets a compressor to its initial state. */
+void
+liblzx_compress_reset(liblzx_compressor_t *stream);
+
+/* Adds input data to the compression stream and returns the number of bytes
+ * digested. The return value will never exceed in_data_size. If this
+ * returns a value smaller than in_data_size, then a compressed chunk was
+ * produced and must be released with liblzx_compress_release_next_chunk
+ * before more data can be added.
+ */
+size_t
+liblzx_compress_add_input(liblzx_compressor_t *stream, const void *in_data,
+                          size_t in_data_size);
+
+/* Returns the next compressed chunk. This doesn't consume the chunk in the
+ * process, so repeated calls will keep returning the same chunk. If no chunk
+ * is available, returns NULL.
+ */
+const liblzx_output_chunk_t *
+liblzx_compress_get_next_chunk(const liblzx_compressor_t *stream);
+
+/* Releases the next compressed chunk, allowing compression to continue. */
+void
+liblzx_compress_release_next_chunk(liblzx_compressor_t *stream);
+
+/* Ends the compression stream. Any buffered input still held by the
+ * compressor becomes available as final output chunk(s).
+ */
+void
+liblzx_compress_end_input(liblzx_compressor_t *stream);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/dlls/cabinet/liblzx_bitops.h b/dlls/cabinet/liblzx_bitops.h
new file mode 100644
index 00000000000..976a294fe54
--- /dev/null
+++ b/dlls/cabinet/liblzx_bitops.h
@@ -0,0 +1,156 @@
+/*
+ * bitops.h - inline functions for bit manipulation
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _LIBLZX_BITOPS_H
+#define _LIBLZX_BITOPS_H
+
+#include "liblzx_compiler.h"
+#include "liblzx_types.h"
+
+#if LIBLZX_IS_MSVC_COMPILER
+#include <intrin.h>
+#endif
+
+/*
+ * Bit Scan Reverse (BSR) - find the 0-based index (relative to the least
+ * significant bit) of the *most* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static attrib_forceinline unsigned
+bsr32(uint32_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+        unsigned long result;
+        _BitScanReverse(&result, v);
+        return result;
+#else
+        return 31 - __builtin_clz(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsr64(uint64_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+# ifdef _M_AMD64
+        unsigned long result;
+        _BitScanReverse64(&result, v);
+        return result;
+# else
+        unsigned long index;
+        if (_BitScanReverse(&index, v >> 32))
+                return index + 32;
+
+        _BitScanReverse(&index, v & 0xffffffffu);
+
+        return index;
+# endif
+#else
+        return 63 - __builtin_clzll(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsrw(machine_word_t v)
+{
+        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+        if (WORDBITS == 32)
+                return bsr32((uint32_t)v);
+        else
+                return bsr64(v);
+}
+
+/*
+ * Bit Scan Forward (BSF) - find the 0-based index (relative to the least
+ * significant bit) of the *least* significant 1 bit in the input value. The
+ * input value must be nonzero!
+ */
+
+static attrib_forceinline unsigned
+bsf32(uint32_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+        unsigned long result;
+        _BitScanForward(&result, v);
+        return result;
+#else
+        return __builtin_ctz(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsf64(uint64_t v)
+{
+#if LIBLZX_IS_MSVC_COMPILER
+# ifdef _M_AMD64
+        unsigned long result;
+        _BitScanForward64(&result, v);
+        return result;
+# else
+        unsigned long index;
+        if (_BitScanForward(&index, v & 0xffffffffu))
+                return index;
+
+        if (_BitScanForward(&index, v >> 32))
+                index += 32;
+
+        return index;
+# endif
+#else
+        return __builtin_ctzll(v);
+#endif
+}
+
+static attrib_forceinline unsigned
+bsfw(machine_word_t v)
+{
+        STATIC_ASSERT(WORDBITS == 32 || WORDBITS == 64);
+        if (WORDBITS == 32)
+                return bsf32(v);
+        else
+                return bsf64(v);
+}
+
+/* Return the log base 2 of 'n', rounded up to the nearest integer. */
+static attrib_forceinline unsigned
+ilog2_ceil(size_t n)
+{
+        if (n <= 1)
+                return 0;
+        return 1 + bsrw(n - 1);
+}
+
+/* Round 'n' up to the nearest power of 2 */
+static attrib_forceinline size_t
+roundup_pow_of_2(size_t n)
+{
+        return (size_t)1 << ilog2_ceil(n);
+}
+
+#endif /* _LIBLZX_BITOPS_H */
diff --git a/dlls/cabinet/liblzx_bt_matchfinder.h b/dlls/cabinet/liblzx_bt_matchfinder.h
new file mode 100644
index 00000000000..d601e110aca
--- /dev/null
+++ b/dlls/cabinet/liblzx_bt_matchfinder.h
@@ -0,0 +1,446 @@
+/*
+ * bt_matchfinder.h - Lempel-Ziv matchfinding with a hash table of binary trees
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib.
Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + * + * ---------------------------------------------------------------------------- + * + * This is a Binary Trees (bt) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * binary tree of sequences whose first 4 bytes share the same hash code. Each + * sequence is identified by its starting position in the input buffer. Each + * binary tree is always sorted such that each left child represents a sequence + * lexicographically lesser than its parent and each right child represents a + * sequence lexicographically greater than its parent. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, a new binary tree + * node is created to represent the current sequence. Then, in a single tree + * traversal, the hash bucket's binary tree is searched for matches and is + * re-rooted at the new node. + * + * Compared to the simpler algorithm that uses linked lists instead of binary + * trees (see hc_matchfinder.h), the binary tree version gains more information + * at each node visitation. Ideally, the binary tree version will examine only + * 'log(n)' nodes to find the same matches that the linked list version will + * find by examining 'n' nodes. In addition, the binary tree version can + * examine fewer bytes at each node by taking advantage of the common prefixes + * that result from the sort order, whereas the linked list version may have to + * examine up to the full length of the match at each node. + * + * However, it is not always best to use the binary tree version. It requires + * nearly twice as much memory as the linked list version, and it takes time to + * keep the binary trees sorted, even at positions where the compressor does not + * need matches. Generally, when doing fast compression on small buffers, + * binary trees are the wrong approach. They are best suited for thorough + * compression and/or large buffers. 
+ * + * ---------------------------------------------------------------------------- + */ + + +#include <string.h> + +#include "liblzx_matchfinder_common.h" + +#define BT_MATCHFINDER_HASH3_ORDER 15 +#define BT_MATCHFINDER_HASH3_WAYS 2 +#define BT_MATCHFINDER_HASH4_ORDER 16 + +/* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */ +#undef TEMPLATED +#define TEMPLATED(name) CONCAT(name, MF_SUFFIX) + +#ifndef _LIBLZX_BT_MATCHFINDER_H +#define _LIBLZX_BT_MATCHFINDER_H + +/* Non-templated definitions */ + +/* Representation of a match found by the bt_matchfinder */ +struct lz_match { + + /* The number of bytes matched. */ + uint32_t length; + + /* The offset back from the current position that was matched. */ + uint32_t offset; +}; + +#endif /* _LIBLZX_BT_MATCHFINDER_H */ + +struct TEMPLATED(bt_matchfinder) { + + /* The hash table for finding length 2 matches, if enabled */ +#ifdef BT_MATCHFINDER_HASH2_ORDER + mf_pos_t hash2_tab[1UL << BT_MATCHFINDER_HASH2_ORDER]; +#endif + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << BT_MATCHFINDER_HASH3_ORDER][BT_MATCHFINDER_HASH3_WAYS]; + + /* The hash table which contains the roots of the binary trees for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << BT_MATCHFINDER_HASH4_ORDER]; + + /* The child node references for the binary trees. The left and right + * children of the node for the sequence with position 'pos' are + * 'child_tab[pos * 2]' and 'child_tab[pos * 2 + 1]', respectively. */ + mf_pos_t child_tab[]; +}; + +static attrib_forceinline bool +TEMPLATED(matchfinder_is_valid_pos)(mf_pos_t pos, mf_pos_t min_pos) +{ + return ((pos + 1) & MF_INVALID_POS) > min_pos; +} + +static attrib_forceinline void +TEMPLATED(matchfinder_rebase)(mf_pos_t * mf_base, size_t count, + mf_pos_t cull_amount) +{ + /* The invalid value points to the last element of the buffer. */ + /* Since no match can start from that byte, it is always invalid. */ + while (count > 0) { + mf_pos_t pos = *mf_base; + + if (pos < cull_amount || pos == MF_INVALID_POS) { + *mf_base = MF_INVALID_POS; + } else { + *mf_base -= cull_amount; + } + + mf_base++; + count--; + } +} + +/* Return the number of bytes that must be allocated for a 'bt_matchfinder' that + * can work with buffers up to the specified size. */ +static attrib_forceinline size_t +TEMPLATED(bt_matchfinder_size)(size_t max_bufsize, bool streaming) +{ + const size_t streaming_bufsize_mul = streaming ? 4 : 2; + + const size_t base_size = + sizeof(struct TEMPLATED(bt_matchfinder)) + + (streaming_bufsize_mul * max_bufsize * sizeof(mf_pos_t)); + + return base_size; +} + +/* Prepare the matchfinder for a new input buffer. */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_init)(struct TEMPLATED(bt_matchfinder) *mf) +{ + memset(mf, 0xFF, sizeof(*mf)); +} + +static attrib_forceinline mf_pos_t * +TEMPLATED(bt_left_child)(struct TEMPLATED(bt_matchfinder) *mf, uint32_t node) +{ + return &mf->child_tab[(node << 1) + 0]; +} + +static attrib_forceinline mf_pos_t * +TEMPLATED(bt_right_child)(struct TEMPLATED(bt_matchfinder) *mf, uint32_t node) +{ + return &mf->child_tab[(node << 1) + 1]; +} + +/* The minimum permissible value of 'max_len' for bt_matchfinder_get_matches() + * and bt_matchfinder_skip_byte(). There must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define BT_MATCHFINDER_REQUIRED_NBYTES 5 + +/* Advance the binary tree matchfinder by one byte, optionally recording + * matches. 
@record_matches should be a compile-time constant. */ +static attrib_forceinline struct lz_match * +TEMPLATED(bt_matchfinder_advance_one_byte)(struct TEMPLATED(bt_matchfinder) * const mf, + const uint8_t * const in_begin, + mf_pos_t in_min_pos, + const ptrdiff_t cur_pos, + const uint32_t max_find_len, + const uint32_t max_produce_len, + const uint32_t nice_len, + const uint32_t max_search_depth, + uint32_t * const next_hashes, + uint32_t * const best_len_ret, + struct lz_match *lz_matchptr, + const bool record_matches) +{ + const uint8_t *in_next = in_begin + cur_pos; + uint32_t depth_remaining = max_search_depth; + uint32_t next_hashseq; + uint32_t hash3; + uint32_t hash4; +#ifdef BT_MATCHFINDER_HASH2_ORDER + uint16_t seq2; + uint32_t hash2; +#endif + STATIC_ASSERT(BT_MATCHFINDER_HASH3_WAYS >= 1 && + BT_MATCHFINDER_HASH3_WAYS <= 2); + uint32_t cur_node; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + uint32_t cur_node_2; +#endif + const uint8_t *matchptr; + mf_pos_t *pending_lt_ptr, *pending_gt_ptr; + uint32_t best_lt_len, best_gt_len; + uint32_t len; + uint32_t best_len = 3; + + next_hashseq = get_unaligned_le32(in_next + 1); + + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, BT_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, BT_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + +#ifdef BT_MATCHFINDER_HASH2_ORDER + seq2 = load_u16_unaligned(in_next); + hash2 = lz_hash(seq2, BT_MATCHFINDER_HASH2_ORDER); + cur_node = mf->hash2_tab[hash2]; + mf->hash2_tab[hash2] = cur_pos; + if (record_matches && + TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos) && + seq2 == load_u16_unaligned(&in_begin[cur_node])) + { + lz_matchptr->length = 2; + lz_matchptr->offset = in_next - &in_begin[cur_node]; + lz_matchptr++; + } +#endif + + cur_node = mf->hash3_tab[hash3][0]; + mf->hash3_tab[hash3][0] = cur_pos; +#if BT_MATCHFINDER_HASH3_WAYS >= 2 + cur_node_2 = mf->hash3_tab[hash3][1]; + mf->hash3_tab[hash3][1] = cur_node; +#endif + if (record_matches && + TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos)) { + uint32_t seq3 = load_u24_unaligned(in_next); + if (seq3 == load_u24_unaligned(&in_begin[cur_node]) && + likely(cur_node >= in_min_pos)) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_begin[cur_node]; + lz_matchptr++; + } + #if BT_MATCHFINDER_HASH3_WAYS >= 2 + else if (TEMPLATED(matchfinder_is_valid_pos)(cur_node_2, + in_min_pos) && + seq3 == load_u24_unaligned(&in_begin[cur_node_2])) { + lz_matchptr->length = 3; + lz_matchptr->offset = in_next - &in_begin[cur_node_2]; + lz_matchptr++; + } + #endif + } + + cur_node = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + pending_lt_ptr = TEMPLATED(bt_left_child)(mf, cur_pos); + pending_gt_ptr = TEMPLATED(bt_right_child)(mf, cur_pos); + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node, in_min_pos)) { + *pending_lt_ptr = MF_INVALID_POS; + *pending_gt_ptr = MF_INVALID_POS; + *best_len_ret = best_len; + return lz_matchptr; + } + + best_lt_len = 0; + best_gt_len = 0; + len = 0; + + for (;;) { + matchptr = &in_begin[cur_node]; + + if (matchptr[len] == in_next[len]) { + len = lz_extend(in_next, matchptr, len + 1, max_find_len); + if (!record_matches || len > best_len) { + if (record_matches) { + best_len = len; + lz_matchptr->length = min_u32(len, max_produce_len); + lz_matchptr->offset = + in_next - matchptr; + lz_matchptr++; + } + if (len >= nice_len) { + *pending_lt_ptr = + 
*TEMPLATED(bt_left_child)(mf, cur_node); + *pending_gt_ptr = + *TEMPLATED(bt_right_child)(mf, cur_node); + *best_len_ret = best_len; + return lz_matchptr; + } + } + } + + if (matchptr[len] < in_next[len]) { + *pending_lt_ptr = cur_node; + pending_lt_ptr = TEMPLATED(bt_right_child)(mf, cur_node); + cur_node = *pending_lt_ptr; + best_lt_len = len; + if (best_gt_len < len) + len = best_gt_len; + } else { + *pending_gt_ptr = cur_node; + pending_gt_ptr = TEMPLATED(bt_left_child)(mf, cur_node); + cur_node = *pending_gt_ptr; + best_gt_len = len; + if (best_lt_len < len) + len = best_lt_len; + } + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node, + in_min_pos) || + !--depth_remaining) { + *pending_lt_ptr = MF_INVALID_POS; + *pending_gt_ptr = MF_INVALID_POS; + *best_len_ret = best_len; + + return lz_matchptr; + } + } +} + +/* + * Retrieve a list of matches with the current position. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_abs_pos + * Absolute position of in_begin in the file + * @cur_pos + * The current position in the input buffer relative to @in_begin (the + * position of the sequence being matched against). + * @max_len + * The maximum permissible match length at this position. Must be >= + * BT_MATCHFINDER_REQUIRED_NBYTES. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + 1. + * @best_len_ret + * If a match of length >= 4 was found, then the length of the longest such + * match is written here; otherwise 3 is written here. (Note: this is + * redundant with the 'struct lz_match' array, but this is easier for the + * compiler to optimize when inlined and the caller immediately does a + * check against 'best_len'.) + * @lz_matchptr + * An array in which this function will record the matches. The recorded + * matches will be sorted by strictly increasing length and (non-strictly) + * increasing offset. The maximum number of matches that may be found is + * 'nice_len - 1', or one less if length 2 matches are disabled. + * + * The return value is a pointer to the next available slot in the @lz_matchptr + * array. (If no matches were found, this will be the same as @lz_matchptr.) + */ +static attrib_forceinline struct lz_match * +TEMPLATED(bt_matchfinder_get_matches)(struct TEMPLATED(bt_matchfinder) *mf, + const uint8_t *in_begin, + uint32_t in_min_pos, + ptrdiff_t cur_pos, + uint32_t max_find_len, + uint32_t max_produce_len, + uint32_t nice_len, + uint32_t max_search_depth, + uint32_t next_hashes[2], + uint32_t *best_len_ret, + struct lz_match *lz_matchptr) +{ + return TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + in_min_pos, + cur_pos, + max_find_len, + max_produce_len, + nice_len, + max_search_depth, + next_hashes, + best_len_ret, + lz_matchptr, + true); +} + +/* + * Advance the matchfinder, but don't record any matches. + * + * This is very similar to bt_matchfinder_get_matches() because both functions + * must do hashing and tree re-rooting. 
+ */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_skip_byte)(struct TEMPLATED(bt_matchfinder) *mf, + const uint8_t *in_begin, + uint32_t in_min_pos, + ptrdiff_t cur_pos, + uint32_t nice_len, + uint32_t max_search_depth, + uint32_t next_hashes[2]) +{ + uint32_t best_len; + TEMPLATED(bt_matchfinder_advance_one_byte)(mf, + in_begin, + in_min_pos, + cur_pos, + nice_len, + nice_len, + nice_len, + max_search_depth, + next_hashes, + &best_len, + NULL, + false); +} + +/* + * Culls any matches that are lower than a specified offset and reduces any + * remaining offsets by the same amount. + */ +static attrib_forceinline void +TEMPLATED(bt_matchfinder_cull)(struct TEMPLATED(bt_matchfinder) * mf, + uint32_t cull_size, uint32_t window_size) +{ + size_t mf_size = TEMPLATED(bt_matchfinder_size)(window_size, true); + + const size_t mf_count = mf_size / sizeof(mf_pos_t); + + TEMPLATED(matchfinder_rebase)((mf_pos_t *)mf, mf_count, cull_size); +} diff --git a/dlls/cabinet/liblzx_compiler.h b/dlls/cabinet/liblzx_compiler.h new file mode 100644 index 00000000000..73e36c07a59 --- /dev/null +++ b/dlls/cabinet/liblzx_compiler.h @@ -0,0 +1,214 @@ +/* + * compiler.h + * + * Compiler-specific definitions. Currently, only GCC and clang are supported. + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_COMPILER_H +#define _LIBLZX_COMPILER_H + +#if defined(_MSC_VER) && !defined(__clang__) && !defined(__GNUC__) +#define LIBLZX_IS_MSVC_COMPILER 1 +#else +#define LIBLZX_IS_MSVC_COMPILER 0 +#endif + +#if LIBLZX_IS_MSVC_COMPILER +#include <stdint.h> +#include <stddef.h> + +#pragma warning(error:4013) +#endif + +#ifndef __ORDER_LITTLE_ENDIAN__ +#define __ORDER_LITTLE_ENDIAN__ 1 +#endif + +#ifndef __ORDER_BIG_ENDIAN__ +#define __ORDER_BIG_ENDIAN__ 2 +#endif + + +/* Is the compiler GCC of the specified version or later? This always returns + * false for clang, since clang is "frozen" at GNUC 4.2. The __has_* + * feature-test macros should be used to detect clang functionality instead. */ +#define GCC_PREREQ(major, minor) \ + (!defined(__clang__) && !defined(__INTEL_COMPILER) && \ + (__GNUC__ > major || \ + (__GNUC__ == major && __GNUC_MINOR__ >= minor))) + +/* Feature-test macros defined by recent versions of clang. 
*/ +#ifndef __has_attribute +# define __has_attribute(attribute) 0 +#endif +#ifndef __has_feature +# define __has_feature(feature) 0 +#endif +#ifndef __has_builtin +# define __has_builtin(builtin) 0 +#endif + +/* Declare that the annotated function should always be inlined. This might be + * desirable in highly tuned code, e.g. compression codecs. */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_forceinline __forceinline +#else +#define attrib_forceinline inline __attribute__((always_inline)) +#endif + +/* Declare that the annotated function should *not* be inlined. */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_noinline __declspec(noinline) +#else +#define attrib_noinline __attribute__((noinline)) +#endif + +/* Declare that the annotated function is unlikely to be executed */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_cold +#else +#define attrib_cold __attribute__((cold)) +#endif + +/* Declare that the annotated type or variable is aligned */ +#if LIBLZX_IS_MSVC_COMPILER +#define attrib_aligned(alignment) __declspec(align(alignment)) +#else +#define attrib_aligned(alignment) __attribute__((aligned(alignment))) +#endif + +/* Functionally the same as 'attrib_noinline', but documents that the reason + * for not inlining is to prevent the annotated function from being inlined + * into a recursive function, thereby increasing its stack usage. */ +#define attrib_noinline_for_stack attrib_noinline + +/* Hint that the expression is usually true. */ +#if LIBLZX_IS_MSVC_COMPILER +#define likely(expr) (expr) +#else +#define likely(expr) __builtin_expect(!!(expr), 1) +#endif + +/* Hint that the expression is usually false. */ +#if LIBLZX_IS_MSVC_COMPILER +#define unlikely(expr) (expr) +#else +#define unlikely(expr) __builtin_expect(!!(expr), 0) +#endif + +/* Prefetch into L1 cache for read. */ +#if LIBLZX_IS_MSVC_COMPILER +#define prefetchr(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#else +#define prefetchr(addr) __builtin_prefetch((addr), 0) +#endif + +/* Prefetch into L1 cache for write. */ +#if LIBLZX_IS_MSVC_COMPILER +#define prefetchw(addr) _mm_prefetch((const char *)(addr), _MM_HINT_T0) +#else +#define prefetchw(addr) __builtin_prefetch((addr), 1) +#endif + +/* Hint that the annotated function takes a printf()-like format string and + * arguments. This is currently disabled on Windows because MinGW does not + * support this attribute on functions taking wide-character strings. */ +#ifdef _WIN32 +# define _format_attribute(type, format_str, format_start) +#else +# define _format_attribute(type, format_str, format_start) \ + __attribute__((format(type, format_str, format_start))) +#endif + +/* Endianness definitions. Either CPU_IS_BIG_ENDIAN() or CPU_IS_LITTLE_ENDIAN() + * evaluates to 1. The other evaluates to 0. Note that newer gcc supports + * __BYTE_ORDER__ for easily determining the endianness; older gcc doesn't. In + * the latter case we fall back to a configure-time check. */ +#ifdef __BYTE_ORDER__ +# define CPU_IS_BIG_ENDIAN() (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#elif defined(HAVE_CONFIG_H) +# include "config.h" +# ifdef WORDS_BIGENDIAN +# define CPU_IS_BIG_ENDIAN() 1 +# else +# define CPU_IS_BIG_ENDIAN() 0 +# endif +#endif +#define CPU_IS_LITTLE_ENDIAN() (!CPU_IS_BIG_ENDIAN()) + +/* UNALIGNED_ACCESS_IS_FAST should be defined to 1 if unaligned memory accesses + * can be performed efficiently on the target platform. 
*/ +#if defined(__x86_64__) || defined(__i386__) || \ + defined(__ARM_FEATURE_UNALIGNED) || defined(__powerpc64__) +# define UNALIGNED_ACCESS_IS_FAST 1 +#else +# define UNALIGNED_ACCESS_IS_FAST 0 +#endif + +/* Swap the values of two variables, without multiple evaluation. */ +#ifndef swap +# define swap(a, b) do { typeof(a) _a = (a); (a) = (b); (b) = _a; } while(0) +#endif +#define SWAP(a, b) swap((a), (b)) + +/* Optional definitions for checking with 'sparse'. */ +#ifdef __CHECKER__ +# define _bitwise_attr __attribute__((bitwise)) +# define _force_attr __attribute__((force)) +#else +# define _bitwise_attr +# define _force_attr +#endif + +/* STATIC_ASSERT() - verify the truth of an expression at compilation time. */ +#ifdef __CHECKER__ +# define STATIC_ASSERT(expr) +# define STATIC_ASSERT_STMT(expr) +#elif __STDC_VERSION__ >= 201112L +# define STATIC_ASSERT(expr) _Static_assert((expr), "") +# define STATIC_ASSERT_STMT(expr) do {_Static_assert((expr), "");} while(0) +#else +# define STATIC_ASSERT(expr) ((void)sizeof(char[1 - 2 * !(expr)])) +# define STATIC_ASSERT_STMT(expr) STATIC_ASSERT(expr) +#endif + +/* STATIC_ASSERT_ZERO() - verify the truth of an expression at compilation time + * and also produce a result of value '0' to be used in constant expressions */ +#define STATIC_ASSERT_ZERO(expr) ((int)sizeof(char[-!(expr)])) + +#define CONCAT_IMPL(s1, s2) s1##s2 + +/* CONCAT() - concatenate two tokens at preprocessing time. */ +#define CONCAT(s1, s2) CONCAT_IMPL(s1, s2) + +#if LIBLZX_IS_MSVC_COMPILER +#define __builtin_constant_p(n) (0) + +typedef ptrdiff_t ssize_t; +#endif + +#endif /* _LIBLZX_COMPILER_H */ diff --git a/dlls/cabinet/liblzx_compress_common.c b/dlls/cabinet/liblzx_compress_common.c new file mode 100644 index 00000000000..be88df26b85 --- /dev/null +++ b/dlls/cabinet/liblzx_compress_common.c @@ -0,0 +1,673 @@ +/* + * compress_common.c + * + * Code for compression shared among multiple compression formats. + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include <string.h> + +#include <assert.h> +#include "liblzx_compress_common.h" +#include "liblzx_minmax.h" +#include "liblzx_util.h" + +/* + * Given the binary tree node A[subtree_idx] whose children already satisfy the + * maxheap property, swap the node with its greater child until it is greater + * than or equal to both of its children, so that the maxheap property is + * satisfied in the subtree rooted at A[subtree_idx]. 'A' uses 1-based indices. + */ +static void +heapify_subtree(uint32_t A[], unsigned length, unsigned subtree_idx) +{ + unsigned parent_idx; + unsigned child_idx; + uint32_t v; + + v = A[subtree_idx]; + parent_idx = subtree_idx; + while ((child_idx = parent_idx * 2) <= length) { + if (child_idx < length && A[child_idx + 1] > A[child_idx]) + child_idx++; + if (v >= A[child_idx]) + break; + A[parent_idx] = A[child_idx]; + parent_idx = child_idx; + } + A[parent_idx] = v; +} + +/* + * Rearrange the array 'A' so that it satisfies the maxheap property. + * 'A' uses 1-based indices, so the children of A[i] are A[i*2] and A[i*2 + 1]. + */ +static void +heapify_array(uint32_t A[], unsigned length) +{ + unsigned subtree_idx; + + for (subtree_idx = length / 2; subtree_idx >= 1; subtree_idx--) + heapify_subtree(A, length, subtree_idx); +} + +/* + * Sort the array 'A', which contains 'length' unsigned 32-bit integers. + * + * Note: name this function heap_sort() instead of heapsort() to avoid colliding + * with heapsort() from stdlib.h on BSD-derived systems --- though this isn't + * necessary when compiling with -D_ANSI_SOURCE, which is the better solution. + */ +static void +heap_sort(uint32_t A[], unsigned length) +{ + A--; /* Use 1-based indices */ + + heapify_array(A, length); + + while (length >= 2) { + uint32_t tmp = A[length]; + + A[length] = A[1]; + A[1] = tmp; + length--; + heapify_subtree(A, length, 1); + } +} + +#define NUM_SYMBOL_BITS 10 +#define NUM_FREQ_BITS (32 - NUM_SYMBOL_BITS) +#define SYMBOL_MASK ((1 << NUM_SYMBOL_BITS) - 1) +#define FREQ_MASK (~SYMBOL_MASK) + +#define GET_NUM_COUNTERS(num_syms) (num_syms) + +/* + * Sort the symbols primarily by frequency and secondarily by symbol value. + * Discard symbols with zero frequency and fill in an array with the remaining + * symbols, along with their frequencies. The low NUM_SYMBOL_BITS bits of each + * array entry will contain the symbol value, and the remaining bits will + * contain the frequency. + * + * @num_syms + * Number of symbols in the alphabet, at most 1 << NUM_SYMBOL_BITS. + * + * @freqs[num_syms] + * Frequency of each symbol, summing to at most (1 << NUM_FREQ_BITS) - 1. + * + * @lens[num_syms] + * An array that eventually will hold the length of each codeword. This + * function only fills in the codeword lengths for symbols that have zero + * frequency, which are not well defined per se but will be set to 0. + * + * @symout[num_syms] + * The output array, described above. + * + * Returns the number of entries in 'symout' that were filled. This is the + * number of symbols that have nonzero frequency. + */ +static unsigned +sort_symbols(unsigned num_syms, const uint32_t freqs[], uint8_t lens[], uint32_t symout[]) +{ + unsigned sym; + unsigned i; + unsigned num_used_syms; + unsigned num_counters; + unsigned counters[GET_NUM_COUNTERS(MAX_NUM_SYMS)]; + + /* + * We use heapsort, but with an added optimization. Since often most + * symbol frequencies are low, we first do a count sort using a limited + * number of counters. 
High frequencies are counted in the last + * counter, and only they will be sorted with heapsort. + * + * Note: with more symbols, it is generally beneficial to have more + * counters. About 1 counter per symbol seems fastest. + */ + + num_counters = GET_NUM_COUNTERS(num_syms); + + memset(counters, 0, num_counters * sizeof(counters[0])); + + /* Count the frequencies. */ + for (sym = 0; sym < num_syms; sym++) + counters[min_size(freqs[sym], num_counters - 1)]++; + + /* + * Make the counters cumulative, ignoring the zero-th, which counted + * symbols with zero frequency. As a side effect, this calculates the + * number of symbols with nonzero frequency. + */ + num_used_syms = 0; + for (i = 1; i < num_counters; i++) { + unsigned count = counters[i]; + + counters[i] = num_used_syms; + num_used_syms += count; + } + + /* + * Sort nonzero-frequency symbols using the counters. At the same time, + * set the codeword lengths of zero-frequency symbols to 0. + */ + for (sym = 0; sym < num_syms; sym++) { + uint32_t freq = freqs[sym]; + + if (freq != 0) { + symout[counters[min_size(freq, num_counters - 1)]++] = + sym | (freq << NUM_SYMBOL_BITS); + } else { + lens[sym] = 0; + } + } + + /* Sort the symbols counted in the last counter. */ + heap_sort(symout + counters[num_counters - 2], + counters[num_counters - 1] - counters[num_counters - 2]); + + return num_used_syms; +} + +/* + * Build a Huffman tree. + * + * This is an optimized implementation that + * (a) takes advantage of the frequencies being already sorted; + * (b) only generates non-leaf nodes, since the non-leaf nodes of a Huffman + * tree are sufficient to generate a canonical code; + * (c) Only stores parent pointers, not child pointers; + * (d) Produces the nodes in the same memory used for input frequency + * information. + * + * Array 'A', which contains 'sym_count' entries, is used for both input and + * output. For this function, 'sym_count' must be at least 2. + * + * For input, the array must contain the frequencies of the symbols, sorted in + * increasing order. Specifically, each entry must contain a frequency left + * shifted by NUM_SYMBOL_BITS bits. Any data in the low NUM_SYMBOL_BITS bits of + * the entries will be ignored by this function. Although these bits will, in + * fact, contain the symbols that correspond to the frequencies, this function + * is concerned with frequencies only and keeps the symbols as-is. + * + * For output, this function will produce the non-leaf nodes of the Huffman + * tree. These nodes will be stored in the first (sym_count - 1) entries of the + * array. Entry A[sym_count - 2] will represent the root node. Each other node + * will contain the zero-based index of its parent node in 'A', left shifted by + * NUM_SYMBOL_BITS bits. The low NUM_SYMBOL_BITS bits of each entry in A will + * be kept as-is. Again, note that although these low bits will, in fact, + * contain a symbol value, this symbol will have *no relationship* with the + * Huffman tree node that happens to occupy the same slot. This is because this + * implementation only generates the non-leaf nodes of the tree. 
+ */ +static void +build_tree(uint32_t A[], unsigned sym_count) +{ + const unsigned last_idx = sym_count - 1; + + /* Index of the next lowest frequency leaf that still needs a parent */ + unsigned i = 0; + + /* + * Index of the next lowest frequency non-leaf that still needs a + * parent, or 'e' if there is currently no such node + */ + unsigned b = 0; + + /* Index of the next spot for a non-leaf (will overwrite a leaf) */ + unsigned e = 0; + + do { + uint32_t new_freq; + + /* + * Select the next two lowest frequency nodes among the leaves + * A[i] and non-leaves A[b], and create a new node A[e] to be + * their parent. Set the new node's frequency to the sum of the + * frequencies of its two children. + * + * Usually the next two lowest frequency nodes are of the same + * type (leaf or non-leaf), so check those cases first. + */ + if (i + 1 <= last_idx && + (b == e || (A[i + 1] & FREQ_MASK) <= (A[b] & FREQ_MASK))) { + /* Two leaves */ + new_freq = (A[i] & FREQ_MASK) + (A[i + 1] & FREQ_MASK); + i += 2; + } else if (b + 2 <= e && + (i > last_idx || + (A[b + 1] & FREQ_MASK) < (A[i] & FREQ_MASK))) { + /* Two non-leaves */ + new_freq = (A[b] & FREQ_MASK) + (A[b + 1] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + A[b + 1] = (e << NUM_SYMBOL_BITS) | + (A[b + 1] & SYMBOL_MASK); + b += 2; + } else { + /* One leaf and one non-leaf */ + new_freq = (A[i] & FREQ_MASK) + (A[b] & FREQ_MASK); + A[b] = (e << NUM_SYMBOL_BITS) | (A[b] & SYMBOL_MASK); + i++; + b++; + } + A[e] = new_freq | (A[e] & SYMBOL_MASK); + /* + * A binary tree with 'n' leaves has 'n - 1' non-leaves, so the + * tree is complete once we've created 'n - 1' non-leaves. + */ + } while (++e < last_idx); +} + +/* + * Given the stripped-down Huffman tree constructed by build_tree(), determine + * the number of codewords that should be assigned each possible length, taking + * into account the length-limited constraint. + * + * @A + * The array produced by build_tree(), containing parent index information + * for the non-leaf nodes of the Huffman tree. Each entry in this array is + * a node; a node's parent always has a greater index than that node + * itself. This function will overwrite the parent index information in + * this array, so essentially it will destroy the tree. However, the data + * in the low NUM_SYMBOL_BITS of each entry will be preserved. + * + * @root_idx + * The 0-based index of the root node in 'A', and consequently one less + * than the number of tree node entries in 'A'. (Or, really 2 less than + * the actual length of 'A'.) + * + * @len_counts + * An array of length ('max_codeword_len' + 1) in which the number of + * codewords having each length <= max_codeword_len will be returned. + * + * @max_codeword_len + * The maximum permissible codeword length. + */ +static void +compute_length_counts(uint32_t A[], unsigned root_idx, unsigned len_counts[], + unsigned max_codeword_len) +{ + unsigned len; + int node; + + /* + * The key observations are: + * + * (1) We can traverse the non-leaf nodes of the tree, always visiting a + * parent before its children, by simply iterating through the array + * in reverse order. Consequently, we can compute the depth of each + * node in one pass, overwriting the parent indices with depths. + * + * (2) We can initially assume that in the real Huffman tree, both + * children of the root are leaves. This corresponds to two + * codewords of length 1. 
Then, whenever we visit a (non-leaf) node + * during the traversal, we modify this assumption to account for + * the current node *not* being a leaf, but rather its two children + * being leaves. This causes the loss of one codeword for the + * current depth and the addition of two codewords for the current + * depth plus one. + * + * (3) We can handle the length-limited constraint fairly easily by + * simply using the largest length available when a depth exceeds + * max_codeword_len. + */ + + for (len = 0; len <= max_codeword_len; len++) + len_counts[len] = 0; + len_counts[1] = 2; + + /* Set the root node's depth to 0. */ + A[root_idx] &= SYMBOL_MASK; + + for (node = root_idx - 1; node >= 0; node--) { + + /* Calculate the depth of this node. */ + + unsigned parent = A[node] >> NUM_SYMBOL_BITS; + unsigned parent_depth = A[parent] >> NUM_SYMBOL_BITS; + unsigned depth = parent_depth + 1; + unsigned len = depth; + + /* + * Set the depth of this node so that it is available when its + * children (if any) are processed. + */ + A[node] = (A[node] & SYMBOL_MASK) | (depth << NUM_SYMBOL_BITS); + + /* + * If needed, decrease the length to meet the length-limited + * constraint. This is not the optimal method for generating + * length-limited Huffman codes! But it should be good enough. + */ + if (len >= max_codeword_len) { + len = max_codeword_len; + do { + len--; + } while (len_counts[len] == 0); + } + + /* + * Account for the fact that we have a non-leaf node at the + * current depth. + */ + len_counts[len]--; + len_counts[len + 1] += 2; + } +} + +/* + * Generate the codewords for a canonical Huffman code. + * + * @A + * The output array for codewords. In addition, initially this + * array must contain the symbols, sorted primarily by frequency and + * secondarily by symbol value, in the low NUM_SYMBOL_BITS bits of + * each entry. + * + * @len + * Output array for codeword lengths. + * + * @len_counts + * An array that provides the number of codewords that will have + * each possible length <= max_codeword_len. + * + * @max_codeword_len + * Maximum length, in bits, of each codeword. + * + * @num_syms + * Number of symbols in the alphabet, including symbols with zero + * frequency. This is the length of the 'A' and 'len' arrays. + */ +static void +gen_codewords(uint32_t A[], uint8_t lens[], const unsigned len_counts[], + unsigned max_codeword_len, unsigned num_syms) +{ + uint32_t next_codewords[MAX_CODEWORD_LEN + 1]; + unsigned i; + unsigned len; + unsigned sym; + + /* + * Given the number of codewords that will have each length, assign + * codeword lengths to symbols. We do this by assigning the lengths in + * decreasing order to the symbols sorted primarily by increasing + * frequency and secondarily by increasing symbol value. + */ + for (i = 0, len = max_codeword_len; len >= 1; len--) { + unsigned count = len_counts[len]; + + while (count--) + lens[A[i++] & SYMBOL_MASK] = len; + } + + /* + * Generate the codewords themselves. We initialize the + * 'next_codewords' array to provide the lexicographically first + * codeword of each length, then assign codewords in symbol order. This + * produces a canonical code. 
+ */ + next_codewords[0] = 0; + next_codewords[1] = 0; + for (len = 2; len <= max_codeword_len; len++) + next_codewords[len] = + (next_codewords[len - 1] + len_counts[len - 1]) << 1; + + for (sym = 0; sym < num_syms; sym++) + A[sym] = next_codewords[lens[sym]]++; +} + +/* + * --------------------------------------------------------------------- + * make_canonical_huffman_code() + * --------------------------------------------------------------------- + * + * Given an alphabet and the frequency of each symbol in it, construct a + * length-limited canonical Huffman code. + * + * @num_syms + * The number of symbols in the alphabet. The symbols are the integers in + * the range [0, num_syms - 1]. This parameter must be at least 2 and + * must not exceed (1 << NUM_SYMBOL_BITS). + * + * @max_codeword_len + * The maximum permissible codeword length. + * + * @freqs + * An array of length @num_syms that gives the frequency of each symbol. + * It is valid for some, none, or all of the frequencies to be 0. The sum + * of frequencies must not exceed (1 << NUM_FREQ_BITS) - 1. + * + * @lens + * An array of @num_syms entries in which this function will return the + * length, in bits, of the codeword assigned to each symbol. Symbols with + * 0 frequency will not have codewords per se, but their entries in this + * array will be set to 0. No lengths greater than @max_codeword_len will + * be assigned. + * + * @codewords + * An array of @num_syms entries in which this function will return the + * codeword for each symbol, right-justified and padded on the left with + * zeroes. Codewords for symbols with 0 frequency will be undefined. + * + * --------------------------------------------------------------------- + * + * This function builds a length-limited canonical Huffman code. + * + * A length-limited Huffman code contains no codewords longer than some + * specified length, and has exactly (with some algorithms) or approximately + * (with the algorithm used here) the minimum weighted path length from the + * root, given this constraint. + * + * A canonical Huffman code satisfies the properties that a longer codeword + * never lexicographically precedes a shorter codeword, and the lexicographic + * ordering of codewords of the same length is the same as the lexicographic + * ordering of the corresponding symbols. A canonical Huffman code, or more + * generally a canonical prefix code, can be reconstructed from only a list + * containing the codeword length of each symbol. + * + * The classic algorithm to generate a Huffman code creates a node for each + * symbol, then inserts these nodes into a min-heap keyed by symbol frequency. + * Then, repeatedly, the two lowest-frequency nodes are removed from the + * min-heap and added as the children of a new node having frequency equal to + * the sum of its two children, which is then inserted into the min-heap. When + * only a single node remains in the min-heap, it is the root of the Huffman + * tree. The codeword for each symbol is determined by the path needed to reach + * the corresponding node from the root. Descending to the left child appends a + * 0 bit, whereas descending to the right child appends a 1 bit. + * + * The classic algorithm is relatively easy to understand, but it is subject to + * a number of inefficiencies. In practice, it is fastest to first sort the + * symbols by frequency. (This itself can be subject to an optimization based + * on the fact that most frequencies tend to be low.) 
At the same time, we sort + * secondarily by symbol value, which aids the process of generating a canonical + * code. Then, during tree construction, no heap is necessary because both the + * leaf nodes and the unparented non-leaf nodes can be easily maintained in + * sorted order. Consequently, there can never be more than two possibilities + * for the next-lowest-frequency node. + * + * In addition, because we're generating a canonical code, we actually don't + * need the leaf nodes of the tree at all, only the non-leaf nodes. This is + * because for canonical code generation we don't need to know where the symbols + * are in the tree. Rather, we only need to know how many leaf nodes have each + * depth (codeword length). And this information can, in fact, be quickly + * generated from the tree of non-leaves only. + * + * Furthermore, we can build this stripped-down Huffman tree directly in the + * array in which the codewords are to be generated, provided that these array + * slots are large enough to hold a symbol and frequency value. + * + * Still furthermore, we don't even need to maintain explicit child pointers. + * We only need the parent pointers, and even those can be overwritten in-place + * with depth information as part of the process of extracting codeword lengths + * from the tree. So in summary, we do NOT need a big structure like: + * + * struct huffman_tree_node { + * unsigned int symbol; + * unsigned int frequency; + * unsigned int depth; + * struct huffman_tree_node *left_child; + * struct huffman_tree_node *right_child; + * }; + * + * + * ... which often gets used in "naive" implementations of Huffman code + * generation. + * + * Many of these optimizations are based on the implementation in 7-Zip (source + * file: C/HuffEnc.c), which was placed in the public domain by Igor Pavlov. + * + * NOTE: in general, the same frequencies can be used to generate different + * length-limited canonical Huffman codes. One choice we have is during tree + * construction, when we must decide whether to prefer a leaf or non-leaf when + * there is a tie in frequency. Another choice we have is how to deal with + * codewords that would exceed @max_codeword_len bits in length. Both of these + * choices affect the resulting codeword lengths, which otherwise can be mapped + * uniquely onto the resulting canonical Huffman code. + * + * Normally, there is no problem with choosing one valid code over another, + * provided that they produce similar compression ratios. However, the LZMS + * compression format uses adaptive Huffman coding. It requires that both the + * decompressor and compressor build a canonical code equivalent to that which + * can be generated by using the classic Huffman tree construction algorithm and + * always processing leaves before non-leaves when there is a frequency tie. + * Therefore, we make sure to do this. This method also has the advantage of + * sometimes shortening the longest codeword that is generated. + * + * There also is the issue of how codewords longer than @max_codeword_len are + * dealt with. Fortunately, for LZMS this is irrelevant because for the LZMS + * alphabets no codeword can ever exceed LZMS_MAX_CODEWORD_LEN (= 15). Since + * the LZMS algorithm regularly halves all frequencies, the frequencies cannot + * become high enough for a length 16 codeword to be generated. 
Specifically, I + * think that if ties are broken in favor of non-leaves (as we do), the lowest + * total frequency that would give a length-16 codeword would be the sum of the + * frequencies 1 1 1 3 4 7 11 18 29 47 76 123 199 322 521 843 1364, which is + * 3570. And in LZMS we can't get a frequency that high based on the alphabet + * sizes, rebuild frequencies, and scaling factors. This worst-case scenario is + * based on the following degenerate case (only the bottom of the tree shown): + * + * ... + * 17 + * / \ + * 10 7 + * / \ + * 6 4 + * / \ + * 3 3 + * / \ + * 2 1 + * / \ + * 1 1 + * + * Excluding the first leaves (those with value 1), each leaf value must be + * greater than the non-leaf up 1 and down 2 from it; otherwise that leaf would + * have taken precedence over that non-leaf and been combined with the leaf + * below, thereby decreasing the height compared to that shown. + * + * Interesting fact: if we were to instead prioritize non-leaves over leaves, + * then the worst case frequencies would be the Fibonacci sequence, plus an + * extra frequency of 1. In this hypothetical scenario, it would be slightly + * easier for longer codewords to be generated. + */ +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const uint32_t freqs[], uint8_t lens[], uint32_t codewords[]) +{ + uint32_t *A = codewords; + unsigned num_used_syms; + + assert(num_syms <= MAX_NUM_SYMS); + STATIC_ASSERT_STMT(MAX_NUM_SYMS <= 1 << NUM_SYMBOL_BITS); + assert(max_codeword_len <= MAX_CODEWORD_LEN); + + /* + * We begin by sorting the symbols primarily by frequency and + * secondarily by symbol value. As an optimization, the array used for + * this purpose ('A') shares storage with the space in which we will + * eventually return the codewords. + */ + num_used_syms = sort_symbols(num_syms, freqs, lens, A); + + /* + * 'num_used_syms' is the number of symbols with nonzero frequency. + * This may be less than @num_syms. 'num_used_syms' is also the number + * of entries in 'A' that are valid. Each entry consists of a distinct + * symbol and a nonzero frequency packed into a 32-bit integer. + */ + + /* + * Handle special cases where only 0 or 1 symbols were used (had nonzero + * frequency). + */ + + if (unlikely(num_used_syms == 0)) { + /* + * Code is empty. sort_symbols() already set all lengths to 0, + * so there is nothing more to do. + */ + return; + } + + if (unlikely(num_used_syms == 1)) { + /* + * Only one symbol was used, so we only need one codeword. But + * two codewords are needed to form the smallest complete + * Huffman code, which uses codewords 0 and 1. Therefore, we + * choose another symbol to which to assign a codeword. We use + * 0 (if the used symbol is not 0) or 1 (if the used symbol is + * 0). In either case, the lesser-valued symbol must be + * assigned codeword 0 so that the resulting code is canonical. + */ + + unsigned sym = A[0] & SYMBOL_MASK; + unsigned nonzero_idx = sym ? sym : 1; + + codewords[0] = 0; + lens[0] = 1; + codewords[nonzero_idx] = 1; + lens[nonzero_idx] = 1; + return; + } + + /* + * Build a stripped-down version of the Huffman tree, sharing the array + * 'A' with the symbol values. Then extract length counts from the tree + * and use them to generate the final codewords. 
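+	 * + * (Note: with 'num_used_syms' leaves, the stripped-down tree built by + * build_tree() has 'num_used_syms - 1' non-leaf entries, stored at + * A[0..num_used_syms - 2]; hence 'num_used_syms - 2' is passed as the + * root index below.)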
+ */ + + build_tree(A, num_used_syms); + + { + unsigned len_counts[MAX_CODEWORD_LEN + 1]; + + compute_length_counts(A, num_used_syms - 2, + len_counts, max_codeword_len); + + gen_codewords(A, lens, len_counts, max_codeword_len, num_syms); + } +} diff --git a/dlls/cabinet/liblzx_compress_common.h b/dlls/cabinet/liblzx_compress_common.h new file mode 100644 index 00000000000..5eb73086fb7 --- /dev/null +++ b/dlls/cabinet/liblzx_compress_common.h @@ -0,0 +1,19 @@ +/* + * compress_common.h + * + * Header for compression code shared by multiple compression formats. + */ + +#ifndef _LIBLZX_COMPRESS_COMMON_H +#define _LIBLZX_COMPRESS_COMMON_H + +#include "liblzx_types.h" + +#define MAX_NUM_SYMS 799 /* LZMS_MAX_NUM_SYMS */ +#define MAX_CODEWORD_LEN 16 + +void +make_canonical_huffman_code(unsigned num_syms, unsigned max_codeword_len, + const uint32_t freqs[], uint8_t lens[], uint32_t codewords[]); + +#endif /* _LIBLZX_COMPRESS_COMMON_H */ diff --git a/dlls/cabinet/liblzx_config.h b/dlls/cabinet/liblzx_config.h new file mode 100644 index 00000000000..2c070d93022 --- /dev/null +++ b/dlls/cabinet/liblzx_config.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + + #pragma once + +#ifndef __LIBLZX_CONFIG_H__ +#define __LIBLZX_CONFIG_H__ + +// Set to 1 to export as a DLL +#ifndef LIBLZX_DLL_EXPORT +#define LIBLZX_DLL_EXPORT 0 +#endif + +#endif diff --git a/dlls/cabinet/liblzx_endianness.h b/dlls/cabinet/liblzx_endianness.h new file mode 100644 index 00000000000..2ee8fceee69 --- /dev/null +++ b/dlls/cabinet/liblzx_endianness.h @@ -0,0 +1,136 @@ +/* + * endianness.h - macros and inline functions for endianness conversion + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_ENDIANNESS_H +#define _LIBLZX_ENDIANNESS_H + +#include "liblzx_compiler.h" +#include "liblzx_types.h" + +#if LIBLZX_IS_MSVC_COMPILER +#include <intrin.h> +#endif + +#ifdef HAVE_SYS_ENDIAN_H + /* Needed on NetBSD to stop system bswap macros from messing things up */ +# include <sys/endian.h> +# undef bswap16 +# undef bswap32 +# undef bswap64 +#endif + +/* Watch out for conflict with ntfs-3g/endians.h ... */ +#ifndef _NTFS_ENDIANS_H + +#define bswap16_const(n) \ + ((((uint16_t)(n) & 0x00FF) << 8) | \ + (((uint16_t)(n) & 0xFF00) >> 8)) + +#define bswap32_const(n) \ + ((((uint32_t)(n) & 0x000000FF) << 24) | \ + (((uint32_t)(n) & 0x0000FF00) << 8) | \ + (((uint32_t)(n) & 0x00FF0000) >> 8) | \ + (((uint32_t)(n) & 0xFF000000) >> 24)) + +#define bswap64_const(n) \ + ((((uint64_t)(n) & 0x00000000000000FF) << 56) | \ + (((uint64_t)(n) & 0x000000000000FF00) << 40) | \ + (((uint64_t)(n) & 0x0000000000FF0000) << 24) | \ + (((uint64_t)(n) & 0x00000000FF000000) << 8) | \ + (((uint64_t)(n) & 0x000000FF00000000) >> 8) | \ + (((uint64_t)(n) & 0x0000FF0000000000) >> 24) | \ + (((uint64_t)(n) & 0x00FF000000000000) >> 40) | \ + (((uint64_t)(n) & 0xFF00000000000000) >> 56)) + +static attrib_forceinline uint16_t do_bswap16(uint16_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_ushort(n); +#elif GCC_PREREQ(4, 8) || __has_builtin(__builtin_bswap16) + return __builtin_bswap16(n); +#else + return bswap16_const(n); +#endif +} + +static attrib_forceinline uint32_t do_bswap32(uint32_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_ulong(n); +#elif GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap32) + return __builtin_bswap32(n); +#else + return bswap32_const(n); +#endif +} + +static attrib_forceinline uint64_t do_bswap64(uint64_t n) +{ +#if LIBLZX_IS_MSVC_COMPILER + return _byteswap_uint64(n); +#elif GCC_PREREQ(4, 3) || __has_builtin(__builtin_bswap64) + return __builtin_bswap64(n); +#else + return bswap64_const(n); +#endif +} + +#define bswap16(n) (__builtin_constant_p(n) ? bswap16_const(n) : do_bswap16(n)) +#define bswap32(n) (__builtin_constant_p(n) ? bswap32_const(n) : do_bswap32(n)) +#define bswap64(n) (__builtin_constant_p(n) ? 
bswap64_const(n) : do_bswap64(n)) + +#if CPU_IS_BIG_ENDIAN() +# define cpu_to_le16(n) ((_force_attr le16_t)bswap16(n)) +# define cpu_to_le32(n) ((_force_attr le32_t)bswap32(n)) +# define cpu_to_le64(n) ((_force_attr le64_t)bswap64(n)) +# define le16_to_cpu(n) bswap16((_force_attr uint16_t)(le16_t)(n)) +# define le32_to_cpu(n) bswap32((_force_attr uint32_t)(le32_t)(n)) +# define le64_to_cpu(n) bswap64((_force_attr uint64_t)(le64_t)(n)) +# define cpu_to_be16(n) ((_force_attr be16_t)(uint16_t)(n)) +# define cpu_to_be32(n) ((_force_attr be32_t)(uint32_t)(n)) +# define cpu_to_be64(n) ((_force_attr be64_t)(uint64_t)(n)) +# define be16_to_cpu(n) ((_force_attr uint16_t)(be16_t)(n)) +# define be32_to_cpu(n) ((_force_attr uint32_t)(be32_t)(n)) +# define be64_to_cpu(n) ((_force_attr uint64_t)(be64_t)(n)) +#else +# define cpu_to_le16(n) ((_force_attr le16_t)(uint16_t)(n)) +# define cpu_to_le32(n) ((_force_attr le32_t)(uint32_t)(n)) +# define cpu_to_le64(n) ((_force_attr le64_t)(uint64_t)(n)) +# define le16_to_cpu(n) ((_force_attr uint16_t)(le16_t)(n)) +# define le32_to_cpu(n) ((_force_attr uint32_t)(le32_t)(n)) +# define le64_to_cpu(n) ((_force_attr uint64_t)(le64_t)(n)) +# define cpu_to_be16(n) ((_force_attr be16_t)bswap16(n)) +# define cpu_to_be32(n) ((_force_attr be32_t)bswap32(n)) +# define cpu_to_be64(n) ((_force_attr be64_t)bswap64(n)) +# define be16_to_cpu(n) bswap16((_force_attr uint16_t)(be16_t)(n)) +# define be32_to_cpu(n) bswap32((_force_attr uint32_t)(be32_t)(n)) +# define be64_to_cpu(n) bswap64((_force_attr uint64_t)(be64_t)(n)) +#endif + +#endif /* _NTFS_ENDIANS_H */ +#endif /* _LIBLZX_ENDIANNESS_H */ diff --git a/dlls/cabinet/liblzx_error.h b/dlls/cabinet/liblzx_error.h new file mode 100644 index 00000000000..cd984b61094 --- /dev/null +++ b/dlls/cabinet/liblzx_error.h @@ -0,0 +1,11 @@ +#ifndef __LIBLZX_ERROR_H__ +#define __LIBLZX_ERROR_H__ + +enum liblzx_error { + LIBLZX_ERR_NONE = 0, + + LIBLZX_ERR_NOMEM = -1, + LIBLZX_ERR_INVALID_PARAM = -2, +}; + +#endif /* __LIBLZX_ERROR_H__ */ diff --git a/dlls/cabinet/liblzx_hc_matchfinder.h b/dlls/cabinet/liblzx_hc_matchfinder.h new file mode 100644 index 00000000000..31d659ab1fa --- /dev/null +++ b/dlls/cabinet/liblzx_hc_matchfinder.h @@ -0,0 +1,432 @@ +/* + * hc_matchfinder.h - Lempel-Ziv matchfinding with a hash table of linked lists + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. 
+ * + * --------------------------------------------------------------------------- + * + * Algorithm + * + * This is a Hash Chains (hc) based matchfinder. + * + * The main data structure is a hash table where each hash bucket contains a + * linked list (or "chain") of sequences whose first 4 bytes share the same hash + * code. Each sequence is identified by its starting position in the input + * buffer. + * + * The algorithm processes the input buffer sequentially. At each byte + * position, the hash code of the first 4 bytes of the sequence beginning at + * that position (the sequence being matched against) is computed. This + * identifies the hash bucket to use for that position. Then, this hash + * bucket's linked list is searched for matches. Then, a new linked list node + * is created to represent the current sequence and is prepended to the list. + * + * This algorithm has several useful properties: + * + * - It only finds true Lempel-Ziv matches; i.e., those where the matching + * sequence occurs prior to the sequence being matched against. + * + * - The sequences in each linked list are always sorted by decreasing starting + * position. Therefore, the closest (smallest offset) matches are found + * first, which in many compression formats tend to be the cheapest to encode. + * + * - Although fast running time is not guaranteed due to the possibility of the + * lists getting very long, the worst degenerate behavior can be easily + * prevented by capping the number of nodes searched at each position. + * + * - If the compressor decides not to search for matches at a certain position, + * then that position can be quickly inserted without searching the list. + * + * - The algorithm is adaptable to sliding windows: just store the positions + * relative to a "base" value that is updated from time to time, and stop + * searching each list when the sequences get too far away. + * + * --------------------------------------------------------------------------- + * + * Notes on usage + * + * Before including this header, you must define 'mf_pos_t' to an integer type + * that can represent all possible positions. This can be a 16-bit or 32-bit + * unsigned integer. When possible, the former should be used due to the + * reduced cache pressure. This header can be included multiple times in a + * single .c file with different 'mf_pos_t' definitions; however, you must + * define a different MF_SUFFIX each time to generate different names for the + * matchfinder structure and functions. + * + * The number of bytes that must be allocated for a given 'struct + * hc_matchfinder' must be gotten by calling hc_matchfinder_size(). + * + * ---------------------------------------------------------------------------- + * + * Optimizations + * + * The main hash table and chains handle length 4+ matches. Length 3 matches + * are handled by a separate hash table with no chains. This works well for + * typical "greedy" or "lazy"-style compressors, where length 3 matches are + * often only helpful if they have small offsets. Instead of searching a full + * chain for length 3+ matches, the algorithm just checks for one close length 3 + * match, then focuses on finding length 4+ matches. + * + * The longest_match() and skip_bytes() functions are inlined into the + * compressors that use them. This isn't just about saving the overhead of a + * function call. 
These functions are intended to be called from the inner + * loops of compressors, where giving the compiler more control over register + * allocation is very helpful. There is also significant benefit to be gained + * from allowing the CPU to predict branches independently at each call site. + * For example, "lazy"-style compressors can be written with two calls to + * longest_match(), each of which starts with a different 'best_len' and + * therefore has significantly different performance characteristics. + * + * Although any hash function can be used, a multiplicative hash is fast and + * works well. + * + * On some processors, it is significantly faster to extend matches by whole + * words (32 or 64 bits) instead of by individual bytes. For this to be the + * case, the processor must implement unaligned memory accesses efficiently and + * must have either a fast "find first set bit" instruction or a fast "find last + * set bit" instruction, depending on the processor's endianness. + * + * The code uses one loop for finding the first match and one loop for finding a + * longer match. Each of these loops is tuned for its respective task, and in + * combination they are faster than a single generalized loop that handles both + * tasks. + * + * The code also uses a tight inner loop that only compares the last and first + * bytes of a potential match. It is only when these bytes match that a full + * match extension is attempted. + * + * ---------------------------------------------------------------------------- + */ + +#include <string.h> + +#include "liblzx_matchfinder_common.h" + +#define HC_MATCHFINDER_HASH3_ORDER 15 +#define HC_MATCHFINDER_HASH4_ORDER 16 + +/* TEMPLATED functions and structures have MF_SUFFIX appended to their name. */ +#undef TEMPLATED +#define TEMPLATED(name) CONCAT(name, MF_SUFFIX) + +struct TEMPLATED(hc_matchfinder) { + + /* The hash table for finding length 3 matches */ + mf_pos_t hash3_tab[1UL << HC_MATCHFINDER_HASH3_ORDER]; + + /* The hash table which contains the first nodes of the linked lists for + * finding length 4+ matches */ + mf_pos_t hash4_tab[1UL << HC_MATCHFINDER_HASH4_ORDER]; + + /* The "next node" references for the linked lists. The "next node" of + * the node for the sequence with position 'pos' is 'next_tab[pos]'. */ + mf_pos_t next_tab[]; +}; + +/* Return the number of bytes that must be allocated for a 'hc_matchfinder' that + * can work with buffers up to the specified size. */ +static attrib_forceinline size_t +TEMPLATED(hc_matchfinder_size)(size_t max_bufsize, bool streaming) +{ + const size_t streaming_mul = streaming ? 2 : 1; + + return sizeof(struct TEMPLATED(hc_matchfinder)) + + (max_bufsize * streaming_mul * sizeof(mf_pos_t)); +} + +/* Prepare the matchfinder for a new input buffer. */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_init)(struct TEMPLATED(hc_matchfinder) * mf, + size_t max_bufsize, bool streaming) +{ + memset(mf, 0xFF, TEMPLATED(hc_matchfinder_size)(max_bufsize, streaming)); +} + +/* The minimum lookahead, in bytes, required by hc_matchfinder_longest_match() + * and hc_matchfinder_skip_bytes(): there must be sufficiently many bytes + * remaining to load a 32-bit integer from the *next* position. */ +#define HC_MATCHFINDER_REQUIRED_NBYTES 5 + +/* + * Find the longest match longer than 'best_len' bytes. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_min_pos + * The minimum position (relative to @in_begin) that is still considered + * valid for matches; list nodes referring to earlier positions are + * treated as invalid. + * @in_next + * Pointer to the next position in the input buffer, i.e. the sequence + * being matched against.
+ * @best_len + * Require a match longer than this length. + * @max_find_len + * The maximum match length that may be found at this position, i.e. the + * number of bytes that may safely be compared. + * @max_produce_len + * The maximum match length that may be returned; the result is capped to + * this value. + * @nice_len + * Stop searching if a match of at least this length is found. + * Must be <= @max_find_len. + * @max_search_depth + * Limit on the number of potential matches to consider. Must be >= 1. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hash codes for + * the sequence beginning at @in_next + 1. + * @offset_ret + * If a match is found, its offset is returned in this location. + * + * Return the length of the match found (capped at @max_produce_len), or + * 'best_len' if no match longer than 'best_len' was found. + */ +static attrib_forceinline uint32_t +TEMPLATED(hc_matchfinder_longest_match)(struct TEMPLATED(hc_matchfinder) * const mf, + const uint8_t * const in_begin, + uint32_t in_min_pos, + const uint8_t * const in_next, + uint32_t best_len, + const uint32_t max_find_len, + const uint32_t max_produce_len, + const uint32_t nice_len, + const uint32_t max_search_depth, + uint32_t * const next_hashes, + uint32_t * const offset_ret) +{ + uint32_t depth_remaining = max_search_depth; + const uint8_t *best_matchptr = in_next; + mf_pos_t cur_node3, cur_node4; + uint32_t hash3, hash4; + uint32_t next_hashseq; + uint32_t seq4; + const uint8_t *matchptr; + uint32_t len; + uint32_t cur_pos = in_next - in_begin; + + /* Can we read 4 bytes from 'in_next + 1'? */ + if (unlikely(max_find_len < HC_MATCHFINDER_REQUIRED_NBYTES)) + goto out; + + /* Get the precomputed hash codes. */ + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + + /* From the hash buckets, get the first node of each linked list. */ + cur_node3 = mf->hash3_tab[hash3]; + cur_node4 = mf->hash4_tab[hash4]; + + /* Update for length 3 matches. This replaces the singleton node in the + * 'hash3' bucket with the node for the current sequence. */ + mf->hash3_tab[hash3] = cur_pos; + + /* Update for length 4 matches. This prepends the node for the current + * sequence to the linked list in the 'hash4' bucket. */ + mf->hash4_tab[hash4] = cur_pos; + mf->next_tab[cur_pos] = cur_node4; + + /* Compute the next hash codes. */ + next_hashseq = get_unaligned_le32(in_next + 1); + next_hashes[0] = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + next_hashes[1] = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + prefetchw(&mf->hash3_tab[next_hashes[0]]); + prefetchw(&mf->hash4_tab[next_hashes[1]]); + + if (best_len < 4) { /* No match of length >= 4 found yet? */ + + /* Check for a length 3 match if needed. */ + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node3, in_min_pos)) + goto out; + + seq4 = load_u32_unaligned(in_next); + + if (best_len < 3) { + matchptr = &in_begin[cur_node3]; + if (load_u24_unaligned(matchptr) == loaded_u32_to_u24(seq4)) { + best_len = 3; + best_matchptr = matchptr; + } + } + + /* Check for a length 4 match. */ + + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, in_min_pos)) + goto out; + + for (;;) { + /* No length 4 match found yet. Check the first 4 bytes. */ + matchptr = &in_begin[cur_node4]; + + if (load_u32_unaligned(matchptr) == seq4) + break; + + /* The first 4 bytes did not match. Keep trying. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } + + /* Found a match of length >= 4. Extend it to its full length.
*/ + best_matchptr = matchptr; + best_len = lz_extend(in_next, best_matchptr, 4, max_find_len); + if (best_len >= nice_len) + goto out; + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } else { + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + best_len >= nice_len) + goto out; + } + + /* Check for matches of length >= 5. */ + + for (;;) { + for (;;) { + matchptr = &in_begin[cur_node4]; + + /* Already found a length 4 match. Try for a longer + * match; start by checking either the last 4 bytes and + * the first 4 bytes, or the last byte. (The last byte, + * the one which would extend the match length by 1, is + * the most important.) */ + #if UNALIGNED_ACCESS_IS_FAST + if ((load_u32_unaligned(matchptr + best_len - 3) == + load_u32_unaligned(in_next + best_len - 3)) && + (load_u32_unaligned(matchptr) == + load_u32_unaligned(in_next))) + #else + if (matchptr[best_len] == in_next[best_len]) + #endif + break; + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } + + #if UNALIGNED_ACCESS_IS_FAST + len = 4; + #else + len = 0; + #endif + len = lz_extend(in_next, matchptr, len, max_find_len); + if (len > best_len) { + /* This is the new longest match. */ + best_len = len; + best_matchptr = matchptr; + if (best_len >= nice_len) + goto out; + } + + /* Continue to the next node in the list. */ + cur_node4 = mf->next_tab[cur_node4]; + if (!TEMPLATED(matchfinder_is_valid_pos)(cur_node4, + in_min_pos) || + !--depth_remaining) + goto out; + } +out: + *offset_ret = in_next - best_matchptr; + best_len = min_u32(best_len, max_produce_len); + if (best_len < 2) + best_len = 2; + + return best_len; +} + +/* + * Advance the matchfinder, but don't search for matches. + * + * @mf + * The matchfinder structure. + * @in_begin + * Pointer to the beginning of the input buffer. + * @in_next + * Pointer to the next position in the input buffer. + * @in_end + * Pointer to the end of the input buffer. + * @count + * The number of bytes to advance. Must be > 0. + * @next_hashes + * The precomputed hash codes for the sequence beginning at @in_next. + * These will be used and then updated with the precomputed hashcodes for + * the sequence beginning at @in_next + @count. 
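+ * + * (Usage sketch, not part of the original documentation: after choosing a + * match of length 'len' at some position, a greedy or lazy compressor + * typically calls this function with count = len - 1, so that the positions + * covered by the match are still inserted into the hash chains without any + * time being spent searching them.)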
+ */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_skip_bytes)(struct TEMPLATED(hc_matchfinder) * const mf, + const uint8_t * const in_begin, + const uint8_t *in_next, + const uint8_t * const in_end, + const uint32_t count, + uint32_t * const next_hashes) +{ + uint32_t cur_pos; + uint32_t hash3, hash4; + uint32_t next_hashseq; + uint32_t remaining = count; + + if (unlikely(count + HC_MATCHFINDER_REQUIRED_NBYTES > in_end - in_next)) + return; + + cur_pos = in_next - in_begin; + hash3 = next_hashes[0]; + hash4 = next_hashes[1]; + do { + mf->hash3_tab[hash3] = cur_pos; + mf->next_tab[cur_pos] = mf->hash4_tab[hash4]; + mf->hash4_tab[hash4] = cur_pos; + + next_hashseq = get_unaligned_le32(++in_next); + hash3 = lz_hash(next_hashseq & 0xFFFFFF, HC_MATCHFINDER_HASH3_ORDER); + hash4 = lz_hash(next_hashseq, HC_MATCHFINDER_HASH4_ORDER); + cur_pos++; + } while (--remaining); + + prefetchw(&mf->hash3_tab[hash3]); + prefetchw(&mf->hash4_tab[hash4]); + next_hashes[0] = hash3; + next_hashes[1] = hash4; +} + +/* + * Culls any matches that are lower than a specified offset and reduces any + * remaining offsets by the same amount. + */ +static attrib_forceinline void +TEMPLATED(hc_matchfinder_cull)(struct TEMPLATED(hc_matchfinder) * mf, + uint32_t cull_size, uint32_t window_size) +{ + const size_t mf_count = + TEMPLATED(hc_matchfinder_size)(window_size, true) / + sizeof(mf_pos_t); + + TEMPLATED(matchfinder_rebase)((mf_pos_t *)mf, mf_count, cull_size); +} diff --git a/dlls/cabinet/liblzx_lzx_common.c b/dlls/cabinet/liblzx_lzx_common.c new file mode 100644 index 00000000000..d8619e0965f --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_common.c @@ -0,0 +1,325 @@ +/* + * lzx_common.c - Common code for LZX compression and decompression. + */ + +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2016 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version. + * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "liblzx_minmax.h" + +#include <string.h> + +#ifdef __SSE2__ +# include <emmintrin.h> +#endif + +#ifdef __AVX2__ +# include <immintrin.h> +#endif + +#include "liblzx_bitops.h" +#include "liblzx_endianness.h" +#include "liblzx_lzx_common.h" +#include "liblzx_unaligned.h" +#include "liblzx_util.h" + +/* Mapping: offset slot => first match offset that uses that offset slot. + * The offset slots for repeat offsets map to "fake" offsets < 1. 
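+ * + * For example (illustrative): a match offset of 25 uses offset slot 9, since + * lzx_offset_slot_base[9] == 22 and lzx_offset_slot_base[10] == 30; + * lzx_extra_offset_bits[9] == 3, so the remainder 25 - 22 = 3 is coded in 3 + * extra bits following the slot.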
*/ +const int32_t lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1] = { + -2 , -1 , 0 , 1 , 2 , /* 0 --- 4 */ + 4 , 6 , 10 , 14 , 22 , /* 5 --- 9 */ + 30 , 46 , 62 , 94 , 126 , /* 10 --- 14 */ + 190 , 254 , 382 , 510 , 766 , /* 15 --- 19 */ + 1022 , 1534 , 2046 , 3070 , 4094 , /* 20 --- 24 */ + 6142 , 8190 , 12286 , 16382 , 24574 , /* 25 --- 29 */ + 32766 , 49150 , 65534 , 98302 , 131070 , /* 30 --- 34 */ + 196606 , 262142 , 393214 , 524286 , 655358 , /* 35 --- 39 */ + 786430 , 917502 , 1048574, 1179646, 1310718, /* 40 --- 44 */ + 1441790, 1572862, 1703934, 1835006, 1966078, /* 45 --- 49 */ + 2097150 /* extra */ +}; + +/* Mapping: offset slot => how many extra bits must be read and added to the + * corresponding offset slot base to decode the match offset. */ +const uint8_t lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS] = { + 0 , 0 , 0 , 0 , 1 , + 1 , 2 , 2 , 3 , 3 , + 4 , 4 , 5 , 5 , 6 , + 6 , 7 , 7 , 8 , 8 , + 9 , 9 , 10, 10, 11, + 11, 12, 12, 13, 13, + 14, 14, 15, 15, 16, + 16, 17, 17, 17, 17, + 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, +}; + +/* Round the specified buffer size up to the next valid LZX window size, and + * return its order (log2). Or, if the buffer size is 0 or greater than the + * largest valid LZX window size, return 0. */ +unsigned +lzx_get_window_order(size_t max_bufsize) +{ + if (max_bufsize == 0 || max_bufsize > LZX_MAX_WINDOW_SIZE) + return 0; + + return max_uint(ilog2_ceil(max_bufsize), LZX_MIN_WINDOW_ORDER); +} + +/* Given a valid LZX window order, return the number of symbols that will exist + * in the main Huffman code. */ +unsigned +lzx_get_num_main_syms(unsigned window_order) +{ + /* Note: one would expect that the maximum match offset would be + * 'window_size - LZX_MIN_MATCH_LEN', which would occur if the first two + * bytes were to match the last two bytes. However, the format + * disallows this case. This reduces the number of needed offset slots + * by 1. */ + uint32_t window_size = (uint32_t)1 << window_order; + uint32_t max_offset = window_size - LZX_MIN_MATCH_LEN - 1; + unsigned num_offset_slots = 30; + while (max_offset >= (uint32_t)lzx_offset_slot_base[num_offset_slots]) + num_offset_slots++; + + return LZX_NUM_CHARS + (num_offset_slots * LZX_NUM_LEN_HEADERS); +} + +static void +do_translate_target(void *target, int32_t input_pos, int32_t e8_file_size) +{ + int32_t abs_offset, rel_offset; + + rel_offset = get_unaligned_le32(target); + if (rel_offset >= -input_pos && rel_offset < e8_file_size) { + if (rel_offset < e8_file_size - input_pos) { + /* "good translation" */ + abs_offset = rel_offset + input_pos; + } else { + /* "compensating translation" */ + abs_offset = rel_offset - e8_file_size; + } + put_unaligned_le32(abs_offset, target); + } +} + +static void +undo_translate_target(void *target, int32_t input_pos, int32_t e8_file_size) +{ + int32_t abs_offset, rel_offset; + + abs_offset = get_unaligned_le32(target); + if (abs_offset >= 0) { + if (abs_offset < e8_file_size) { + /* "good translation" */ + rel_offset = abs_offset - input_pos; + put_unaligned_le32(rel_offset, target); + } + } else { + if (abs_offset >= -input_pos) { + /* "compensating translation" */ + rel_offset = abs_offset + e8_file_size; + put_unaligned_le32(rel_offset, target); + } + } +} + +/* + * Do or undo the 'E8' preprocessing used in LZX. Before compression, the + * uncompressed data is preprocessed by changing the targets of x86 CALL + * instructions from relative offsets to absolute offsets. 
After decompression, + * the translation is undone by changing the targets of x86 CALL instructions + * from absolute offsets to relative offsets. + * + * Note that despite its intent, E8 preprocessing can be done on any data even + * if it is not actually x86 machine code. In fact, E8 preprocessing appears to + * always be used in LZX-compressed resources in WIM files; there is no bit to + * indicate whether it is used or not, unlike in the LZX compressed format as + * used in cabinet files, where a bit is reserved for that purpose. + * + * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data, + * which really means the 5-byte call instruction cannot start in the last 10 + * bytes of the uncompressed data. This is one of the errors in the LZX + * documentation. + * + * E8 preprocessing does not appear to be disabled after the 32768th chunk of a + * WIM resource, which apparently is another difference from the LZX compression + * used in cabinet files. + * + * E8 processing is supposed to take the file size as a parameter, as it is used + * in calculating the translated jump targets. But in WIM files, this file size + * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000). + */ +static void +lzx_e8_filter(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size, + void (*process_target)(void *, int32_t, int32_t)) +{ + +#if !defined(__SSE2__) && !defined(__AVX2__) + uint8_t *tail; + uint8_t *p; + + if (size <= LZX_E8_FILTER_TAIL_SIZE) + return; + + tail = &data[size - LZX_E8_FILTER_TAIL_SIZE]; + p = data; + while (p < tail) { + if (*p != 0xE8) { + p++; + continue; + } + + (*process_target)(p + 1, (int32_t)(p - data + chunk_offset), + e8_file_size); + p += 5; + } +#else + /* SSE2 or AVX-2 optimized version for x86_64 */ + + uint8_t *p = data; + uint64_t valid_mask = ~0; + + if (size <= LZX_E8_FILTER_TAIL_SIZE) + return; +#ifdef __AVX2__ +# define ALIGNMENT_REQUIRED 32 +#else +# define ALIGNMENT_REQUIRED 16 +#endif + + /* Process one byte at a time until the pointer is properly aligned. */ + while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) { + if (p >= data + size - LZX_E8_FILTER_TAIL_SIZE) + return; + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data + chunk_offset, + e8_file_size); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (uint64_t)1 << 63; + } + + if (data + size - p >= 64) { + + /* Vectorized processing */ + + /* Note: we use a "trap" E8 byte to eliminate the need to check + * for end-of-buffer in the inner loop. This byte is carefully + * positioned so that it will never be changed by a previous + * translation before it is detected. */ + + uint8_t *trap = p + ((data + size - p) & ~31) - 32 + 4; + uint8_t saved_byte = *trap; + *trap = 0xE8; + + for (;;) { + uint32_t e8_mask; + uint8_t *orig_p = p; + #ifdef __AVX2__ + const __m256i e8_bytes = _mm256_set1_epi8(0xE8); + for (;;) { + __m256i bytes = *(const __m256i *)p; + __m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes); + e8_mask = _mm256_movemask_epi8(cmpresult); + if (e8_mask) + break; + p += 32; + } + #else + const __m128i e8_bytes = _mm_set1_epi8(0xE8); + for (;;) { + /* Read the next 32 bytes of data and test them + * for E8 bytes. 
__m128i bytes1 = *(const __m128i *)p; + __m128i bytes2 = *(const __m128i *)(p + 16); + __m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes); + __m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes); + uint32_t mask1 = _mm_movemask_epi8(cmpresult1); + uint32_t mask2 = _mm_movemask_epi8(cmpresult2); + /* The masks have a bit set for each E8 byte. + * We stay in this fast inner loop as long as + * there are no E8 bytes. */ + if (mask1 | mask2) { + e8_mask = mask1 | (mask2 << 16); + break; + } + p += 32; + } + #endif + + /* Did we pass over data with no E8 bytes? */ + if (p != orig_p) + valid_mask = ~0; + + /* Are we nearing end-of-buffer? */ + if (p == trap - 4) + break; + + /* Process the E8 bytes. However, the AND with + * 'valid_mask' ensures we never process an E8 byte that + * was itself part of a translation target. */ + while ((e8_mask &= valid_mask)) { + unsigned bit = bsf32(e8_mask); + (*process_target)(p + bit + 1, + p + bit - data + chunk_offset, + e8_file_size); + valid_mask &= ~((uint64_t)0x1F << bit); + } + + valid_mask >>= 32; + valid_mask |= 0xFFFFFFFF00000000; + p += 32; + } + + *trap = saved_byte; + } + + /* Approaching the end of the buffer; process one byte at a time. */ + while (p < data + size - LZX_E8_FILTER_TAIL_SIZE) { + if (*p == 0xE8 && (valid_mask & 1)) { + (*process_target)(p + 1, p - data + chunk_offset, + e8_file_size); + valid_mask &= ~0x1F; + } + p++; + valid_mask >>= 1; + valid_mask |= (uint64_t)1 << 63; + } +#endif /* __SSE2__ || __AVX2__ */ +} + +void +lzx_preprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size) +{ + lzx_e8_filter(data, size, chunk_offset, e8_file_size, + do_translate_target); +} + +void +lzx_postprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size) +{ + lzx_e8_filter(data, size, chunk_offset, e8_file_size, + undo_translate_target); +} diff --git a/dlls/cabinet/liblzx_lzx_common.h b/dlls/cabinet/liblzx_lzx_common.h new file mode 100644 index 00000000000..d6fd20b095c --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_common.h @@ -0,0 +1,29 @@ +/* + * lzx_common.h + * + * Declarations shared between LZX compression and decompression. + */ + +#ifndef _LZX_COMMON_H +#define _LZX_COMMON_H + +#include "liblzx_lzx_constants.h" +#include "liblzx_types.h" + +extern const int32_t lzx_offset_slot_base[LZX_MAX_OFFSET_SLOTS + 1]; + +extern const uint8_t lzx_extra_offset_bits[LZX_MAX_OFFSET_SLOTS]; + +unsigned +lzx_get_window_order(size_t max_bufsize); + +unsigned +lzx_get_num_main_syms(unsigned window_order); + +void +lzx_preprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size); + +void +lzx_postprocess(uint8_t *data, uint32_t size, uint32_t chunk_offset, uint32_t e8_file_size); + +#endif /* _LZX_COMMON_H */ diff --git a/dlls/cabinet/liblzx_lzx_compress.c b/dlls/cabinet/liblzx_lzx_compress.c new file mode 100644 index 00000000000..f26629d5fdb --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_compress.c @@ -0,0 +1,3662 @@ +/* + * lzx_compress.c + * + * A compressor for the LZX compression format, as used in WIM archives. + */ + +/* + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright (C) 2012-2017 Eric Biggers + * + * This file is free software; you can redistribute it and/or modify it under + * the terms of the GNU Lesser General Public License as published by the Free + * Software Foundation; either version 2.1 of the License, or (at your option) any + * later version.
+ * + * This file is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS + * FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more + * details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this file; if not, see https://www.gnu.org/licenses/. + */ + + +/* + * This file contains a compressor for the LZX ("Lempel-Ziv eXtended") + * compression format, as used in the WIM (Windows IMaging) file format. + * + * Two different LZX-compatible algorithms are implemented: "near-optimal" and + * "lazy". "Near-optimal" is significantly slower than "lazy", but results in a + * better compression ratio. The "near-optimal" algorithm is used at the + * default compression level. + * + * This file may need some slight modifications to be used outside of the WIM + * format. In particular, in other situations the LZX block header might be + * slightly different, and sliding window support might be required. + * + * LZX is a compression format derived from DEFLATE, the format used by zlib and + * gzip. Both LZX and DEFLATE use LZ77 matching and Huffman coding. Certain + * details are quite similar, such as the method for storing Huffman codes. + * However, the main differences are: + * + * - LZX preprocesses the data to attempt to make x86 machine code slightly more + * compressible before attempting to compress it further. + * + * - LZX uses a "main" alphabet which combines literals and matches, with the + * match symbols containing a "length header" (giving all or part of the match + * length) and an "offset slot" (giving, roughly speaking, the order of + * magnitude of the match offset). + * + * - LZX does not have static Huffman blocks (that is, the kind with preset + * Huffman codes); however it does have two types of dynamic Huffman blocks + * ("verbatim" and "aligned"). + * + * - LZX has a minimum match length of 2 rather than 3. Length 2 matches can be + * useful, but generally only if the compressor is smart about choosing them. + * + * - In LZX, offset slots 0 through 2 actually represent entries in an LRU queue + * of match offsets. This is very useful for certain types of files, such as + * binary files that have repeating records. + */ + +/******************************************************************************/ +/* General parameters */ +/*----------------------------------------------------------------------------*/ + +/* + * The compressor uses the faster algorithm at levels <= MAX_FAST_LEVEL. It + * uses the slower algorithm at levels > MAX_FAST_LEVEL. + */ +#define MAX_FAST_LEVEL 34 + +/* + * The compressor-side limits on the codeword lengths (in bits) for each Huffman + * code. To make outputting bits slightly faster, some of these limits are + * lower than the limits defined by the LZX format. This does not significantly + * affect the compression ratio. + */ +#define MAIN_CODEWORD_LIMIT 16 +#define LENGTH_CODEWORD_LIMIT 12 +#define ALIGNED_CODEWORD_LIMIT 7 +#define PRE_CODEWORD_LIMIT 7 + +/******************************************************************************/ +/* Block splitting parameters */ +/*----------------------------------------------------------------------------*/ + +/* + * The compressor always outputs blocks of at least this size in bytes, except + * for the last block which may need to be smaller. 
*/ +#define MIN_BLOCK_SIZE 6500 + +/* + * The compressor attempts to end a block when it reaches this size in bytes. + * The final size might be slightly larger due to matches extending beyond the + * end of the block. Specifically: + * + * - The near-optimal compressor may choose a match of up to LZX_MAX_MATCH_LEN + * bytes starting at position 'SOFT_MAX_BLOCK_SIZE - 1'. + * + * - The lazy compressor may choose a sequence of literals starting at position + * 'SOFT_MAX_BLOCK_SIZE - 1' when it sees a sequence of increasingly better + * matches. The final match may be up to LZX_MAX_MATCH_LEN bytes. The + * length of the literal sequence is approximately limited by the "nice match + * length" parameter. + */ +#define SOFT_MAX_BLOCK_SIZE 100000 + +/* + * The number of observed items (matches and literals) that represents + * sufficient data for the compressor to decide whether the current block should + * be ended or not. + */ +#define NUM_OBSERVATIONS_PER_BLOCK_CHECK 400 + + +/******************************************************************************/ +/* Parameters for slower algorithm */ +/*----------------------------------------------------------------------------*/ + +/* + * The log base 2 of the number of entries in the hash table for finding length + * 2 matches. This could be as high as 16, but using a smaller hash table + * speeds up compression due to reduced cache pressure. + */ +#define BT_MATCHFINDER_HASH2_ORDER 12 + +/* + * The number of lz_match structures in the match cache, excluding the extra + * "overflow" entries. This value should be high enough so that nearly all the + * time, all matches found in a given block can fit in the match cache. + * However, fallback behavior (immediately terminating the block) on cache + * overflow is still required. + */ +#define CACHE_LENGTH (SOFT_MAX_BLOCK_SIZE * 5) + +/* + * An upper bound on the number of matches that can ever be saved in the match + * cache for a single position. Since each match we save for a single position + * has a distinct length, we can use the number of possible match lengths in LZX + * as this bound. This bound is guaranteed to be valid in all cases, although + * if 'nice_match_length < LZX_MAX_MATCH_LEN', then it will never actually be + * reached. + */ +#define MAX_MATCHES_PER_POS LZX_NUM_LENS + +/* + * A scaling factor that makes it possible to consider fractional bit costs. A + * single bit has a cost of BIT_COST. + * + * Note: this is only useful as a statistical trick for when the true costs are + * unknown. Ultimately, each token in LZX requires a whole number of bits to + * output. + */ +#define BIT_COST_BITS 6 +#define BIT_COST (1 << BIT_COST_BITS) + +/* + * Should the compressor take into account the costs of aligned offset symbols + * instead of assuming that all are equally likely? + */ +#define CONSIDER_ALIGNED_COSTS 1 + +/* + * Should the minimum-cost path search algorithm consider "gap" matches, where + * a normal match is followed by a literal, then by a match with the same + * offset? This is one specific, somewhat common situation in which the true + * minimum cost path is often different from the path found by looking only one + * edge ahead.
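+ * + * As an illustrative example: in data like "abcdefgh...abcXefgh", the second + * occurrence is naturally coded as a match for "abc", the literal 'X', and + * then a match for "efgh" at the *same* offset, which can be coded cheaply as + * a repeat offset match. A search that looks only one edge ahead tends to + * miss this cheaper encoding.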
*/ +#define CONSIDER_GAP_MATCHES 1 + +/******************************************************************************/ +/* Includes */ +/*----------------------------------------------------------------------------*/ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +#include "liblzx_compress_common.h" +#include "liblzx_error.h" +#include "liblzx_lzx_common.h" +#include "liblzx_minmax.h" +#include "liblzx_unaligned.h" +#include "liblzx_util.h" +#include "liblzx.h" + +#include <assert.h> + +#include <malloc.h> + +/* Note: BT_MATCHFINDER_HASH2_ORDER must be defined before including + * bt_matchfinder.h. */ + +/* Matchfinders with 16-bit positions */ +#define mf_pos_t uint16_t +#define MF_SUFFIX _16 +#define MF_INVALID_POS (0xFFFFu) +#include "liblzx_bt_matchfinder.h" +#include "liblzx_hc_matchfinder.h" + +/* Matchfinders with 32-bit positions */ +#undef mf_pos_t +#undef MF_SUFFIX +#undef MF_INVALID_POS +#define mf_pos_t uint32_t +#define MF_SUFFIX _32 +#define MF_INVALID_POS (0xFFFFFFFFu) +#include "liblzx_bt_matchfinder.h" +#include "liblzx_hc_matchfinder.h" + +#undef mf_pos_t +#undef MF_SUFFIX +#undef MF_INVALID_POS + +/******************************************************************************/ +/* Compressor structure */ +/*----------------------------------------------------------------------------*/ + +/* Codewords for the Huffman codes */ +struct lzx_codewords { + uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + uint32_t len[LZX_LENCODE_NUM_SYMBOLS]; + uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* + * Codeword lengths, in bits, for the Huffman codes. + * + * A codeword length of 0 means the corresponding codeword has zero frequency. + * + * The main and length codes each have one extra entry for use as a sentinel. + * See lzx_write_compressed_code(). + */ +struct lzx_lens { + uint8_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS + 1]; + uint8_t len[LZX_LENCODE_NUM_SYMBOLS + 1]; + uint8_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* Codewords and lengths for the Huffman codes */ +struct lzx_codes { + struct lzx_codewords codewords; + struct lzx_lens lens; +}; + +/* Symbol frequency counters for the Huffman-encoded alphabets */ +struct lzx_freqs { + uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS]; + uint32_t len[LZX_LENCODE_NUM_SYMBOLS]; + uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS]; +}; + +/* Block split statistics. See the "Block splitting algorithm" section later in + * this file for details. */ +#define NUM_LITERAL_OBSERVATION_TYPES 8 +#define NUM_MATCH_OBSERVATION_TYPES 2 +#define NUM_OBSERVATION_TYPES (NUM_LITERAL_OBSERVATION_TYPES + \ + NUM_MATCH_OBSERVATION_TYPES) +struct lzx_block_split_stats { + uint32_t new_observations[NUM_OBSERVATION_TYPES]; + uint32_t observations[NUM_OBSERVATION_TYPES]; + uint32_t num_new_observations; + uint32_t num_observations; +}; + +/* + * Represents a run of literals followed by a match or end-of-block. This + * structure is needed to temporarily store items chosen by the compressor, + * since items cannot be written until all items for the block have been chosen + * and the block's Huffman codes have been computed. + */ +struct attrib_aligned(8) lzx_sequence { + + /* + * Bits 9..31: the number of literals in this run. This may be 0 and + * can be at most about SOFT_MAX_BLOCK_SIZE. The literals are not + * stored explicitly in this structure; instead, they are read directly + * from the uncompressed data.
+ * + * Bits 0..8: the length of the match which follows the literals, or 0 + * if this literal run was the last in the block, so there is no match + * which follows it. This can be at most LZX_MAX_MATCH_LEN. + */ + uint32_t litrunlen_and_matchlen; +#define SEQ_MATCHLEN_BITS 9 +#define SEQ_MATCHLEN_MASK (((uint32_t)1 << SEQ_MATCHLEN_BITS) - 1) + + /* + * If 'matchlen' doesn't indicate end-of-block, then this contains: + * + * Bits 10..31: either the offset plus LZX_OFFSET_ADJUSTMENT or a recent + * offset code, depending on the offset slot encoded in the main symbol. + * + * Bits 0..9: the main symbol. + */ + uint32_t adjusted_offset_and_mainsym; +#define SEQ_MAINSYM_BITS 10 +#define SEQ_MAINSYM_MASK (((uint32_t)1 << SEQ_MAINSYM_BITS) - 1) +}; + +/* + * This structure represents a byte position in the input buffer and a node in + * the graph of possible match/literal choices. + * + * Logically, each incoming edge to this node is labeled with a literal or a + * match that can be taken to reach this position from an earlier position; and + * each outgoing edge from this node is labeled with a literal or a match that + * can be taken to advance from this position to a later position. + */ +struct attrib_aligned(8) lzx_optimum_node { + + /* The cost, in bits, of the lowest-cost path that has been found to + * reach this position. This can change as progressively lower cost + * paths are found to reach this position. */ + uint32_t cost; + + /* + * The best arrival to this node, i.e. the match or literal that was + * used to arrive to this position at the given 'cost'. This can change + * as progressively lower cost paths are found to reach this position. + * + * For non-gap matches, this variable is divided into two bitfields + * whose meanings depend on the item type: + * + * Literals: + * Low bits are 0, high bits are the literal. + * + * Explicit offset matches: + * Low bits are the match length, high bits are the offset plus + * LZX_OFFSET_ADJUSTMENT. + * + * Repeat offset matches: + * Low bits are the match length, high bits are the queue index. + * + * For gap matches, identified by OPTIMUM_GAP_MATCH set, special + * behavior applies --- see the code. + */ + uint32_t item; +#define OPTIMUM_OFFSET_SHIFT SEQ_MATCHLEN_BITS +#define OPTIMUM_LEN_MASK SEQ_MATCHLEN_MASK +#if CONSIDER_GAP_MATCHES +# define OPTIMUM_GAP_MATCH 0x80000000 +#endif + +}; + +/* The cost model for near-optimal parsing */ +struct lzx_costs { + + /* + * 'match_cost[offset_slot][len - LZX_MIN_MATCH_LEN]' is the cost of a + * length 'len' match which has an offset belonging to 'offset_slot'. + * The cost includes the main symbol, the length symbol if required, and + * the extra offset bits if any, excluding any entropy-coded bits + * (aligned offset bits). It does *not* include the cost of the aligned + * offset symbol which may be required. 
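+ * + * (Illustrative example: for a length 3 match in offset slot 4, + * 'match_cost[4][3 - LZX_MIN_MATCH_LEN]' is the cost of the corresponding + * main symbol plus BIT_COST for the single extra offset bit, since + * lzx_extra_offset_bits[4] == 1 and costs are expressed in BIT_COST units; + * no length symbol is needed for so short a match.)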
+ */
+	uint16_t match_cost[LZX_MAX_OFFSET_SLOTS][LZX_NUM_LENS];
+
+	/* Cost of each symbol in the main code */
+	uint32_t main[LZX_MAINCODE_MAX_NUM_SYMBOLS];
+
+	/* Cost of each symbol in the length code */
+	uint32_t len[LZX_LENCODE_NUM_SYMBOLS];
+
+#if CONSIDER_ALIGNED_COSTS
+	/* Cost of each symbol in the aligned offset code */
+	uint32_t aligned[LZX_ALIGNEDCODE_NUM_SYMBOLS];
+#endif
+};
+
+struct lzx_output_bitstream;
+
+/* The main LZX compressor structure */
+struct liblzx_compressor {
+
+	/* The LZX variant to use */
+	enum liblzx_variant variant;
+
+	/* Output chunk */
+	liblzx_output_chunk_t out_chunk;
+
+	/* True if a flush was requested */
+	bool flushing;
+
+	/* Memory allocation function */
+	liblzx_alloc_func_t alloc_func;
+
+	/* Memory free function */
+	liblzx_free_func_t free_func;
+
+	/* Memory allocation userdata */
+	void *alloc_userdata;
+
+	/* True if the compressor is outputting the first block */
+	bool first_block;
+
+	/* E8 preprocessor file size */
+	uint32_t e8_file_size;
+
+	/* E8 preprocessor chunk offset */
+	uint32_t e8_chunk_offset;
+
+	/* The buffer for preprocessed input data, if not using destructive
+	 * compression */
+	void *in_buffer;
+
+	/* The buffer for output data */
+	void *out_buffer;
+
+	/* Capacity of in_buffer */
+	uint32_t in_buffer_capacity;
+
+	/* Capacity of out_buffer */
+	uint32_t out_buffer_capacity;
+
+	/* Number of prefix bytes currently in in_buffer */
+	uint32_t in_prefix_size;
+
+	/* Number of bytes currently in in_buffer */
+	uint32_t in_used;
+
+	/* Maximum size of a chunk */
+	uint32_t chunk_size;
+
+	/* Pointer to the reset() implementation chosen at allocation time */
+	void (*reset)(struct liblzx_compressor *);
+
+	/* Pointer to the compress() implementation chosen at allocation time */
+	void (*impl)(struct liblzx_compressor *, const uint8_t *, size_t, size_t,
+		     struct lzx_output_bitstream *);
+
+	/* Pointer to the cull() implementation chosen at allocation time */
+	void (*cull)(struct liblzx_compressor *, size_t);
+
+	/* The window size. */
+	uint32_t window_size;
+
+	/* The log base 2 of the window size for match offset encoding purposes.
+	 * This will be >= LZX_MIN_WINDOW_ORDER and <= LZX_MAX_WINDOW_ORDER. */
+	unsigned window_order;
+
+	/* The number of symbols in the main alphabet. This depends on the
+	 * window order, since the window order determines the maximum possible
+	 * match offset. */
+	unsigned num_main_syms;
+
+	/* The "nice" match length: if a match of this length is found, then it
+	 * is chosen immediately without further consideration. */
+	unsigned nice_match_length;
+
+	/* The maximum search depth: at most this many potential matches are
+	 * considered at each position. */
+	unsigned max_search_depth;
+
+	/* The number of optimization passes per block */
+	unsigned num_optim_passes;
+
+	/* The symbol frequency counters for the current block */
+	struct lzx_freqs freqs;
+
+	/* Block split statistics for the current block */
+	struct lzx_block_split_stats split_stats;
+
+	/* The Huffman codes for the current and previous blocks. The one with
+	 * index 'codes_index' is for the current block, and the other one is
+	 * for the previous block. */
+	struct lzx_codes codes[2];
+	unsigned codes_index;
+
+	/* The matches and literals that the compressor has chosen for the
+	 * current block. The required length of this array is limited by the
+	 * maximum number of matches that can ever be chosen for a single block,
+	 * plus one for the special entry at the end.
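+	 * (Worst-case reasoning: every match consumes at least
+	 * LZX_MIN_MATCH_LEN input bytes, so no more than
+	 * DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) sequences can be
+	 * chosen, plus the sentinel entry.)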
*/ + struct lzx_sequence chosen_sequences[ + DIV_ROUND_UP(SOFT_MAX_BLOCK_SIZE, LZX_MIN_MATCH_LEN) + 1]; + + /* Least-recently-used match queue */ + uint32_t lru_queue[LZX_NUM_RECENT_OFFSETS]; + + /* Next hashes */ + uint32_t next_hashes[2]; + + /* Tables for mapping adjusted offsets to offset slots */ + uint8_t offset_slot_tab_1[32768]; /* offset slots [0, 29] */ + uint8_t offset_slot_tab_2[128]; /* offset slots [30, 49] */ + + union { + /* Data for lzx_compress_lazy() */ + struct { + /* Hash chains matchfinder (MUST BE LAST!!!) */ + union { + struct hc_matchfinder_16 hc_mf_16; + struct hc_matchfinder_32 hc_mf_32; + }; + }; + + /* Data for lzx_compress_near_optimal() */ + struct { + /* + * Array of nodes, one per position, for running the + * minimum-cost path algorithm. + * + * This array must be large enough to accommodate the + * worst-case number of nodes, which occurs if the + * compressor finds a match of length LZX_MAX_MATCH_LEN + * at position 'SOFT_MAX_BLOCK_SIZE - 1', producing a + * block of size 'SOFT_MAX_BLOCK_SIZE - 1 + + * LZX_MAX_MATCH_LEN'. Add one for the end-of-block + * node. + */ + struct lzx_optimum_node optimum_nodes[ + SOFT_MAX_BLOCK_SIZE - 1 + + LZX_MAX_MATCH_LEN + 1]; + + /* The cost model for the current optimization pass */ + struct lzx_costs costs; + + /* + * Cached matches for the current block. This array + * contains the matches that were found at each position + * in the block. Specifically, for each position, there + * is a special 'struct lz_match' whose 'length' field + * contains the number of matches that were found at + * that position; this is followed by the matches + * themselves, if any, sorted by strictly increasing + * length. + * + * Note: in rare cases, there will be a very high number + * of matches in the block and this array will overflow. + * If this happens, we force the end of the current + * block. CACHE_LENGTH is the length at which we + * actually check for overflow. The extra slots beyond + * this are enough to absorb the worst case overflow, + * which occurs if starting at &match_cache[CACHE_LENGTH + * - 1], we write the match count header, then write + * MAX_MATCHES_PER_POS matches, then skip searching for + * matches at 'LZX_MAX_MATCH_LEN - 1' positions and + * write the match count header for each. + */ + struct lz_match match_cache[CACHE_LENGTH + + MAX_MATCHES_PER_POS + + LZX_MAX_MATCH_LEN - 1]; + + /* Binary trees matchfinder (MUST BE LAST!!!) */ + union { + struct bt_matchfinder_16 bt_mf_16; + struct bt_matchfinder_32 bt_mf_32; + }; + }; + }; +}; + +/******************************************************************************/ +/* Matchfinder utilities */ +/*----------------------------------------------------------------------------*/ + +/* + * Will a matchfinder using 16-bit positions be sufficient for compressing + * buffers of up to the specified size? The limit could be 65536 bytes, but we + * also want to optimize out the use of offset_slot_tab_2 in the 16-bit case. + * This requires that the limit be no more than the length of offset_slot_tab_1 + * (currently 32768). + */ +static attrib_forceinline bool +lzx_is_16_bit(size_t max_bufsize) +{ + STATIC_ASSERT(ARRAY_LEN(((struct liblzx_compressor *)0)->offset_slot_tab_1) == 32768); + return max_bufsize <= 32768; +} + +/* + * Return the offset slot for the specified adjusted match offset. 
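+ *
+ * (Lookup example, for illustration: with the table split used here, a small
+ * adjusted offset such as 100 resolves via offset_slot_tab_1[100], while a
+ * large one such as 0x40000 resolves via offset_slot_tab_2[0x40000 >> 14].)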
+ */ +static attrib_forceinline unsigned +lzx_get_offset_slot(struct liblzx_compressor *c, uint32_t adjusted_offset, + bool is_16_bit) +{ + if (__builtin_constant_p(adjusted_offset) && + adjusted_offset < LZX_NUM_RECENT_OFFSETS) + return adjusted_offset; + if (is_16_bit || adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1)) + return c->offset_slot_tab_1[adjusted_offset]; + + assert((adjusted_offset >> 14) < ARRAY_LEN(c->offset_slot_tab_2)); + + return c->offset_slot_tab_2[adjusted_offset >> 14]; +} + +/* + * For a match that has the specified length and adjusted offset, tally its main + * symbol, and if needed its length symbol; then return its main symbol. + */ +static attrib_forceinline unsigned +lzx_tally_main_and_lensyms(struct liblzx_compressor *c, unsigned length, + uint32_t adjusted_offset, bool is_16_bit) +{ + unsigned mainsym; + + if (length >= LZX_MIN_SECONDARY_LEN) { + /* Length symbol needed */ + c->freqs.len[length - LZX_MIN_SECONDARY_LEN]++; + mainsym = LZX_NUM_CHARS + LZX_NUM_PRIMARY_LENS; + } else { + /* No length symbol needed */ + mainsym = LZX_NUM_CHARS + length - LZX_MIN_MATCH_LEN; + } + + mainsym += LZX_NUM_LEN_HEADERS * + lzx_get_offset_slot(c, adjusted_offset, is_16_bit); + c->freqs.main[mainsym]++; + return mainsym; +} + +/* + * The following macros call either the 16-bit or the 32-bit version of a + * matchfinder function based on the value of 'is_16_bit', which will be known + * at compilation time. + */ + +#define CALL_HC_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->hc_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->hc_mf_32, ##__VA_ARGS__)); + +#define CALL_BT_MF(is_16_bit, c, funcname, ...) \ + ((is_16_bit) ? CONCAT(funcname, _16)(&(c)->bt_mf_16, ##__VA_ARGS__) : \ + CONCAT(funcname, _32)(&(c)->bt_mf_32, ##__VA_ARGS__)); + +/******************************************************************************/ +/* Output bitstream */ +/*----------------------------------------------------------------------------*/ + +/* + * The LZX bitstream is encoded as a sequence of little endian 16-bit coding + * units. Bits are ordered from most significant to least significant within + * each coding unit. + */ + +/* + * Structure to keep track of the current state of sending bits to the + * compressed output buffer. + */ +struct lzx_output_bitstream { + + /* Bits that haven't yet been written to the output buffer */ + machine_word_t bitbuf; + + /* Number of bits currently held in @bitbuf */ + machine_word_t bitcount; + + /* Pointer to the start of the output buffer */ + uint8_t *start; + + /* Pointer to the position in the output buffer at which the next coding + * unit should be written */ + uint8_t *next; + + /* Pointer to just past the end of the output buffer, rounded down by + * one byte if needed to make 'end - start' a multiple of 2 */ + uint8_t *end; +}; + +/* Can the specified number of bits always be added to 'bitbuf' after all + * pending 16-bit coding units have been flushed? */ +#define CAN_BUFFER(n) ((n) <= WORDBITS - 15) + +/* Initialize the output bitstream to write to the specified buffer. */ +static void +lzx_init_output(struct lzx_output_bitstream *os, void *buffer, size_t size) +{ + os->bitbuf = 0; + os->bitcount = 0; + os->start = buffer; + os->next = buffer; + os->end = (uint8_t *)buffer + (size & ~1); +} + +/* + * Add some bits to the bitbuffer variable of the output bitstream. The caller + * must make sure there is enough room. 
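+ *
+ * (For illustration: lzx_add_bits(os, 0x5, 3) shifts 'bitbuf' left by 3 bits
+ * and ORs in binary 101, and 'bitcount' grows by 3.)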
+ */ +static attrib_forceinline void +lzx_add_bits(struct lzx_output_bitstream *os, uint32_t bits, unsigned num_bits) +{ + os->bitbuf = (os->bitbuf << num_bits) | bits; + os->bitcount += num_bits; +} + +/* + * Flush bits from the bitbuffer variable to the output buffer. 'max_num_bits' + * specifies the maximum number of bits that may have been added since the last + * flush. + */ +static attrib_forceinline void +lzx_flush_bits(struct lzx_output_bitstream *os, unsigned max_num_bits) +{ + /* Masking the number of bits to shift is only needed to avoid undefined + * behavior; we don't actually care about the results of bad shifts. On + * x86, the explicit masking generates no extra code. */ + const uint32_t shift_mask = WORDBITS - 1; + + if (os->end - os->next < 6) + return; + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 16) & + shift_mask), os->next + 0); + if (max_num_bits > 16) + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 32) & + shift_mask), os->next + 2); + if (max_num_bits > 32) + put_unaligned_le16(os->bitbuf >> ((os->bitcount - 48) & + shift_mask), os->next + 4); + os->next += (os->bitcount >> 4) << 1; + os->bitcount &= 15; +} + +/* Add at most 16 bits to the bitbuffer and flush it. */ +static attrib_forceinline void +lzx_write_bits(struct lzx_output_bitstream *os, uint32_t bits, unsigned num_bits) +{ + lzx_add_bits(os, bits, num_bits); + lzx_flush_bits(os, 16); +} + +/* + * Flush the last coding unit to the output buffer if needed. Return the total + * number of bytes written to the output buffer, or 0 if an overflow occurred. + */ +static size_t +lzx_flush_output(struct lzx_output_bitstream *os) +{ + if (os->end - os->next < 6) + return 0; + + if (os->bitcount != 0) { + put_unaligned_le16(os->bitbuf << (16 - os->bitcount), os->next); + os->next += 2; + } + + return os->next - os->start; +} + +/******************************************************************************/ +/* Preparing Huffman codes */ +/*----------------------------------------------------------------------------*/ + +/* + * Build the Huffman codes. This takes as input the frequency tables for each + * code and produces as output a set of tables that map symbols to codewords and + * codeword lengths. + */ +static void +lzx_build_huffman_codes(struct liblzx_compressor *c) +{ + const struct lzx_freqs *freqs = &c->freqs; + struct lzx_codes *codes = &c->codes[c->codes_index]; + + STATIC_ASSERT_STMT(MAIN_CODEWORD_LIMIT >= 9 && + MAIN_CODEWORD_LIMIT <= LZX_MAX_MAIN_CODEWORD_LEN); + make_canonical_huffman_code(c->num_main_syms, + MAIN_CODEWORD_LIMIT, + freqs->main, + codes->lens.main, + codes->codewords.main); + + STATIC_ASSERT_STMT(LENGTH_CODEWORD_LIMIT >= 8 && + LENGTH_CODEWORD_LIMIT <= LZX_MAX_LEN_CODEWORD_LEN); + make_canonical_huffman_code(LZX_LENCODE_NUM_SYMBOLS, + LENGTH_CODEWORD_LIMIT, + freqs->len, + codes->lens.len, + codes->codewords.len); + + STATIC_ASSERT_STMT( + ALIGNED_CODEWORD_LIMIT >= LZX_NUM_ALIGNED_OFFSET_BITS && + ALIGNED_CODEWORD_LIMIT <= LZX_MAX_ALIGNED_CODEWORD_LEN); + make_canonical_huffman_code(LZX_ALIGNEDCODE_NUM_SYMBOLS, + ALIGNED_CODEWORD_LIMIT, + freqs->aligned, + codes->lens.aligned, + codes->codewords.aligned); +} + +/* Reset the symbol frequencies for the current block. 
*/ +static void +lzx_reset_symbol_frequencies(struct liblzx_compressor *c) +{ + memset(&c->freqs, 0, sizeof(c->freqs)); +} + +static unsigned +lzx_compute_precode_items(const uint8_t * restrict lens, + const uint8_t * restrict prev_lens, + uint32_t * restrict precode_freqs, + unsigned * restrict precode_items) +{ + unsigned *itemptr; + unsigned run_start; + unsigned run_end; + unsigned extra_bits; + int delta; + uint8_t len; + + itemptr = precode_items; + run_start = 0; + + while (!((len = lens[run_start]) & 0x80)) { + + /* len = the length being repeated */ + + /* Find the next run of codeword lengths. */ + + run_end = run_start + 1; + + /* Fast case for a single length. */ + if (likely(len != lens[run_end])) { + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[delta]++; + *itemptr++ = delta; + run_start++; + continue; + } + + /* Extend the run. */ + do { + run_end++; + } while (len == lens[run_end]); + + if (len == 0) { + /* Run of zeroes. */ + + /* Symbol 18: RLE 20 to 51 zeroes at a time. */ + while ((run_end - run_start) >= 20) { + extra_bits = + min_uint((run_end - run_start) - 20, 0x1F); + precode_freqs[18]++; + *itemptr++ = 18 | (extra_bits << 5); + run_start += 20 + extra_bits; + } + + /* Symbol 17: RLE 4 to 19 zeroes at a time. */ + if ((run_end - run_start) >= 4) { + extra_bits = + min_uint((run_end - run_start) - 4, 0xF); + precode_freqs[17]++; + *itemptr++ = 17 | (extra_bits << 5); + run_start += 4 + extra_bits; + } + } else { + + /* A run of nonzero lengths. */ + + /* Symbol 19: RLE 4 to 5 of any length at a time. */ + while ((run_end - run_start) >= 4) { + extra_bits = (run_end - run_start) > 4; + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[19]++; + precode_freqs[delta]++; + *itemptr++ = 19 | (extra_bits << 5) | (delta << 6); + run_start += 4 + extra_bits; + } + } + + /* Output any remaining lengths without RLE. */ + while (run_start != run_end) { + delta = prev_lens[run_start] - len; + if (delta < 0) + delta += 17; + precode_freqs[delta]++; + *itemptr++ = delta; + run_start++; + } + } + + return itemptr - precode_items; +} + +/******************************************************************************/ +/* Outputting compressed data */ +/*----------------------------------------------------------------------------*/ + +/* + * Output a Huffman code in the compressed form used in LZX. + * + * The Huffman code is represented in the output as a logical series of codeword + * lengths from which the Huffman code, which must be in canonical form, can be + * reconstructed. + * + * The codeword lengths are themselves compressed using a separate Huffman code, + * the "precode", which contains a symbol for each possible codeword length in + * the larger code as well as several special symbols to represent repeated + * codeword lengths (a form of run-length encoding). The precode is itself + * constructed in canonical form, and its codeword lengths are represented + * literally in 20 4-bit fields that immediately precede the compressed codeword + * lengths of the larger code. + * + * Furthermore, the codeword lengths of the larger code are actually represented + * as deltas from the codeword lengths of the corresponding code in the previous + * block. + * + * @os: + * Bitstream to which to write the compressed Huffman code. + * @lens: + * The codeword lengths, indexed by symbol, in the Huffman code. 
+ * @prev_lens: + * The codeword lengths, indexed by symbol, in the corresponding Huffman + * code in the previous block, or all zeroes if this is the first block. + * @num_lens: + * The number of symbols in the Huffman code. + */ +static void +lzx_write_compressed_code(struct lzx_output_bitstream *os, + const uint8_t * restrict lens, + const uint8_t * restrict prev_lens, + unsigned num_lens) +{ + uint32_t precode_freqs[LZX_PRECODE_NUM_SYMBOLS]; + uint8_t precode_lens[LZX_PRECODE_NUM_SYMBOLS]; + uint32_t precode_codewords[LZX_PRECODE_NUM_SYMBOLS]; + unsigned *precode_items = (unsigned *)alloca(sizeof(unsigned) * num_lens); + unsigned num_precode_items; + unsigned precode_item; + unsigned precode_sym; + unsigned i; + uint8_t saved = lens[num_lens]; + *(uint8_t *)(lens + num_lens) = 0x80; + + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) + precode_freqs[i] = 0; + + /* Compute the "items" (RLE / literal tokens and extra bits) with which + * the codeword lengths in the larger code will be output. */ + num_precode_items = lzx_compute_precode_items(lens, + prev_lens, + precode_freqs, + precode_items); + + /* Build the precode. */ + STATIC_ASSERT_STMT(PRE_CODEWORD_LIMIT >= 5 && + PRE_CODEWORD_LIMIT <= LZX_MAX_PRE_CODEWORD_LEN); + make_canonical_huffman_code(LZX_PRECODE_NUM_SYMBOLS, PRE_CODEWORD_LIMIT, + precode_freqs, precode_lens, + precode_codewords); + + /* Output the lengths of the codewords in the precode. */ + for (i = 0; i < LZX_PRECODE_NUM_SYMBOLS; i++) + lzx_write_bits(os, precode_lens[i], LZX_PRECODE_ELEMENT_SIZE); + + /* Output the encoded lengths of the codewords in the larger code. */ + for (i = 0; i < num_precode_items; i++) { + precode_item = precode_items[i]; + precode_sym = precode_item & 0x1F; + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + if (precode_sym >= 17) { + if (precode_sym == 17) { + lzx_add_bits(os, precode_item >> 5, 4); + } else if (precode_sym == 18) { + lzx_add_bits(os, precode_item >> 5, 5); + } else { + lzx_add_bits(os, (precode_item >> 5) & 1, 1); + precode_sym = precode_item >> 6; + lzx_add_bits(os, precode_codewords[precode_sym], + precode_lens[precode_sym]); + } + } + STATIC_ASSERT_STMT(CAN_BUFFER(2 * PRE_CODEWORD_LIMIT + 1)); + lzx_flush_bits(os, 2 * PRE_CODEWORD_LIMIT + 1); + } + + *(uint8_t *)(lens + num_lens) = saved; +} + +/* + * Write all matches and literal bytes (which were precomputed) in an LZX + * compressed block to the output bitstream in the final compressed + * representation. + * + * @os + * The output bitstream. + * @block_type + * The chosen type of the LZX compressed block (LZX_BLOCKTYPE_ALIGNED or + * LZX_BLOCKTYPE_VERBATIM). + * @block_data + * The uncompressed data of the block. + * @sequences + * The matches and literals to output, given as a series of sequences. + * @codes + * The main, length, and aligned offset Huffman codes for the block. + */ +static void +lzx_write_sequences(struct lzx_output_bitstream *os, int block_type, + const uint8_t *block_data, const struct lzx_sequence sequences[], + const struct lzx_codes *codes) +{ + const struct lzx_sequence *seq = sequences; + unsigned min_aligned_offset_slot; + + if (block_type == LZX_BLOCKTYPE_ALIGNED) + min_aligned_offset_slot = LZX_MIN_ALIGNED_OFFSET_SLOT; + else + min_aligned_offset_slot = LZX_MAX_OFFSET_SLOTS; + + for (;;) { + /* Output the next sequence. 
*/
+
+		uint32_t litrunlen = seq->litrunlen_and_matchlen >> SEQ_MATCHLEN_BITS;
+		unsigned matchlen = seq->litrunlen_and_matchlen & SEQ_MATCHLEN_MASK;
+		STATIC_ASSERT((uint32_t)~SEQ_MATCHLEN_MASK >> SEQ_MATCHLEN_BITS >=
+			      SOFT_MAX_BLOCK_SIZE);
+		uint32_t adjusted_offset;
+		unsigned main_symbol;
+		unsigned offset_slot;
+		unsigned num_extra_bits;
+		uint32_t extra_bits;
+
+		/* Output the literal run of the sequence. */
+
+		if (litrunlen) { /* Is the literal run nonempty? */
+
+			/* Verify optimization is enabled on 64-bit */
+			STATIC_ASSERT(WORDBITS < 64 ||
+				      CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT));
+
+			if (CAN_BUFFER(3 * MAIN_CODEWORD_LIMIT)) {
+
+				/* 64-bit: write 3 literals at a time. */
+				while (litrunlen >= 3) {
+					unsigned lit0 = block_data[0];
+					unsigned lit1 = block_data[1];
+					unsigned lit2 = block_data[2];
+					lzx_add_bits(os, codes->codewords.main[lit0],
+						     codes->lens.main[lit0]);
+					lzx_add_bits(os, codes->codewords.main[lit1],
+						     codes->lens.main[lit1]);
+					lzx_add_bits(os, codes->codewords.main[lit2],
+						     codes->lens.main[lit2]);
+					lzx_flush_bits(os, 3 * MAIN_CODEWORD_LIMIT);
+					block_data += 3;
+					litrunlen -= 3;
+				}
+				if (litrunlen--) {
+					unsigned lit = *block_data++;
+					lzx_add_bits(os, codes->codewords.main[lit],
+						     codes->lens.main[lit]);
+					if (litrunlen--) {
+						unsigned lit = *block_data++;
+						lzx_add_bits(os, codes->codewords.main[lit],
+							     codes->lens.main[lit]);
+						lzx_flush_bits(os, 2 * MAIN_CODEWORD_LIMIT);
+					} else {
+						lzx_flush_bits(os, 1 * MAIN_CODEWORD_LIMIT);
+					}
+				}
+			} else {
+				/* 32-bit: write 1 literal at a time. */
+				do {
+					unsigned lit = *block_data++;
+					lzx_add_bits(os, codes->codewords.main[lit],
+						     codes->lens.main[lit]);
+					lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
+				} while (--litrunlen);
+			}
+		}
+
+		/* Was this the last literal run? */
+		if (matchlen == 0)
+			return;
+
+		/* Nope; output the match. */
+
+		block_data += matchlen;
+
+		adjusted_offset = seq->adjusted_offset_and_mainsym >> SEQ_MAINSYM_BITS;
+		main_symbol = seq->adjusted_offset_and_mainsym & SEQ_MAINSYM_MASK;
+
+		offset_slot = (main_symbol - LZX_NUM_CHARS) / LZX_NUM_LEN_HEADERS;
+		num_extra_bits = lzx_extra_offset_bits[offset_slot];
+		extra_bits = adjusted_offset - (lzx_offset_slot_base[offset_slot] +
+						LZX_OFFSET_ADJUSTMENT);
+
+	#define MAX_MATCH_BITS (MAIN_CODEWORD_LIMIT + \
+				LENGTH_CODEWORD_LIMIT + \
+				LZX_MAX_NUM_EXTRA_BITS - \
+				LZX_NUM_ALIGNED_OFFSET_BITS + \
+				ALIGNED_CODEWORD_LIMIT)
+
+		/* Verify optimization is enabled on 64-bit */
+		STATIC_ASSERT_STMT(WORDBITS < 64 || CAN_BUFFER(MAX_MATCH_BITS));
+
+		/* Output the main symbol for the match. */
+
+		lzx_add_bits(os, codes->codewords.main[main_symbol],
+			     codes->lens.main[main_symbol]);
+		if (!CAN_BUFFER(MAX_MATCH_BITS))
+			lzx_flush_bits(os, MAIN_CODEWORD_LIMIT);
+
+		/* If needed, output the length symbol for the match. */
+
+		if (matchlen >= LZX_MIN_SECONDARY_LEN) {
+			lzx_add_bits(os, codes->codewords.len[matchlen -
+							      LZX_MIN_SECONDARY_LEN],
+				     codes->lens.len[matchlen -
+						     LZX_MIN_SECONDARY_LEN]);
+			if (!CAN_BUFFER(MAX_MATCH_BITS))
+				lzx_flush_bits(os, LENGTH_CODEWORD_LIMIT);
+		}
+
+		/* Output the extra offset bits for the match. In aligned
+		 * offset blocks, the lowest 3 bits of the adjusted offset are
+		 * Huffman-encoded using the aligned offset code, provided that
+		 * at least 3 extra offset bits are required. All other
+		 * extra offset bits are output verbatim.
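+		 * (For illustration: an offset slot with 4 extra bits sends
+		 * the top 1 bit verbatim and the low 3 bits through the
+		 * aligned offset code.)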
*/ + + if (offset_slot >= min_aligned_offset_slot) { + + lzx_add_bits(os, extra_bits >> LZX_NUM_ALIGNED_OFFSET_BITS, + num_extra_bits - LZX_NUM_ALIGNED_OFFSET_BITS); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS - + LZX_NUM_ALIGNED_OFFSET_BITS); + + lzx_add_bits(os, codes->codewords.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK], + codes->lens.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, ALIGNED_CODEWORD_LIMIT); + } else { + STATIC_ASSERT(CAN_BUFFER(LZX_MAX_NUM_EXTRA_BITS)); + + lzx_add_bits(os, extra_bits, num_extra_bits); + if (!CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, LZX_MAX_NUM_EXTRA_BITS); + } + + if (CAN_BUFFER(MAX_MATCH_BITS)) + lzx_flush_bits(os, MAX_MATCH_BITS); + + /* Advance to the next sequence. */ + seq++; + } +} + +static void +lzx_write_header(uint32_t e8_file_size, struct lzx_output_bitstream *os) +{ + if (e8_file_size == 0) { + lzx_write_bits(os, 0, 1); + } else { + lzx_write_bits(os, 1, 1); + lzx_write_bits(os, (e8_file_size >> 16) & 0xffffu, 16); + lzx_write_bits(os, e8_file_size & 0xffffu, 16); + } +} + +static void +lzx_write_compressed_block(const uint8_t *block_begin, + int block_type, + uint32_t block_size, + enum liblzx_variant variant, + unsigned window_order, + unsigned num_main_syms, + const struct lzx_sequence sequences[], + const struct lzx_codes * codes, + const struct lzx_lens * prev_lens, + struct lzx_output_bitstream * os) +{ + /* The first three bits indicate the type of block and are one of the + * LZX_BLOCKTYPE_* constants. */ + lzx_write_bits(os, block_type, 3); + + /* + * Output the block size. + * + * The original LZX format encoded the block size in 24 bits. However, + * the LZX format used in WIM archives uses 1 bit to specify whether the + * block has the default size of 32768 bytes, then optionally 16 bits to + * specify a non-default size. This works fine for Microsoft's WIM + * software (WIMGAPI), which never compresses more than 32768 bytes at a + * time with LZX. However, as an extension, our LZX compressor supports + * compressing up to 2097152 bytes, with a corresponding increase in + * window size. It is possible for blocks in these larger buffers to + * exceed 65535 bytes; such blocks cannot have their size represented in + * 16 bits. + * + * The chosen solution was to use 24 bits for the block size when + * possibly required --- specifically, when the compressor has been + * allocated to be capable of compressing more than 32768 bytes at once + * (which also causes the number of main symbols to be increased). + */ + if (variant == LIBLZX_VARIANT_WIM) { + if (block_size == LZX_DEFAULT_BLOCK_SIZE) { + lzx_write_bits(os, 1, 1); + } else { + lzx_write_bits(os, 0, 1); + + if (window_order >= 16) + lzx_write_bits(os, block_size >> 16, 8); + + lzx_write_bits(os, block_size & 0xFFFF, 16); + } + } else { + lzx_write_bits(os, block_size >> 16, 8); + lzx_write_bits(os, block_size & 0xFFFF, 16); + } + + /* If it's an aligned offset block, output the aligned offset code. */ + if (block_type == LZX_BLOCKTYPE_ALIGNED) { + for (int i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + lzx_write_bits(os, codes->lens.aligned[i], + LZX_ALIGNEDCODE_ELEMENT_SIZE); + } + } + + /* Output the main code (two parts). 
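+	 * (The first part covers the LZX_NUM_CHARS literal symbols; the second
+	 * covers the remaining match header symbols, as in the two calls
+	 * below.)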
*/ + lzx_write_compressed_code(os, codes->lens.main, + prev_lens->main, + LZX_NUM_CHARS); + lzx_write_compressed_code(os, codes->lens.main + LZX_NUM_CHARS, + prev_lens->main + LZX_NUM_CHARS, + num_main_syms - LZX_NUM_CHARS); + + /* Output the length code. */ + lzx_write_compressed_code(os, codes->lens.len, + prev_lens->len, + LZX_LENCODE_NUM_SYMBOLS); + + /* Output the compressed matches and literals. */ + lzx_write_sequences(os, block_type, block_begin, sequences, codes); +} + +/* + * Given the frequencies of symbols in an LZX-compressed block and the + * corresponding Huffman codes, return LZX_BLOCKTYPE_ALIGNED or + * LZX_BLOCKTYPE_VERBATIM if an aligned offset or verbatim block, respectively, + * will take fewer bits to output. + */ +static int +lzx_choose_verbatim_or_aligned(const struct lzx_freqs * freqs, + const struct lzx_codes * codes) +{ + uint32_t verbatim_cost = 0; + uint32_t aligned_cost = 0; + + /* A verbatim block requires 3 bits in each place that an aligned offset + * symbol would be used in an aligned offset block. */ + for (unsigned i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + verbatim_cost += LZX_NUM_ALIGNED_OFFSET_BITS * freqs->aligned[i]; + aligned_cost += codes->lens.aligned[i] * freqs->aligned[i]; + } + + /* Account for the cost of sending the codeword lengths of the aligned + * offset code. */ + aligned_cost += LZX_ALIGNEDCODE_ELEMENT_SIZE * + LZX_ALIGNEDCODE_NUM_SYMBOLS; + + if (aligned_cost < verbatim_cost) + return LZX_BLOCKTYPE_ALIGNED; + else + return LZX_BLOCKTYPE_VERBATIM; +} + +/* + * Flush an LZX block: + * + * 1. Build the Huffman codes. + * 2. Decide whether to output the block as VERBATIM or ALIGNED. + * 3. Write the block. + * 4. Swap the indices of the current and previous Huffman codes. + * + * Note: we never output UNCOMPRESSED blocks. This probably should be + * implemented sometime, but it doesn't make much difference. + */ +static void +lzx_flush_block(struct liblzx_compressor *c, struct lzx_output_bitstream *os, + const uint8_t *block_begin, uint32_t block_size, uint32_t seq_idx) +{ + int block_type; + + lzx_build_huffman_codes(c); + + block_type = lzx_choose_verbatim_or_aligned(&c->freqs, + &c->codes[c->codes_index]); + + if (c->variant != LIBLZX_VARIANT_WIM) { + if (c->first_block) { + lzx_write_header(c->e8_file_size, os); + c->first_block = false; + } + } + + lzx_write_compressed_block(block_begin, + block_type, + block_size, + c->variant, + c->window_order, + c->num_main_syms, + &c->chosen_sequences[seq_idx], + &c->codes[c->codes_index], + &c->codes[c->codes_index ^ 1].lens, + os); + c->codes_index ^= 1; +} + +/******************************************************************************/ +/* Block splitting algorithm */ +/*----------------------------------------------------------------------------*/ + +/* + * The problem of block splitting is to decide when it is worthwhile to start a + * new block with new entropy codes. There is a theoretically optimal solution: + * recursively consider every possible block split, considering the exact cost + * of each block, and choose the minimum cost approach. But this is far too + * slow. Instead, as an approximation, we can count symbols and after every N + * symbols, compare the expected distribution of symbols based on the previous + * data with the actual distribution. If they differ "by enough", then start a + * new block. + * + * As an optimization and heuristic, we don't distinguish between every symbol + * but rather we combine many symbols into a single "observation type". 
For
+ * literals we only look at the high bits and low bits, and for matches we only
+ * look at whether the match is long or not. The assumption is that for typical
+ * "real" data, places that are good block boundaries will tend to be noticeable
+ * based only on changes in these aggregate frequencies, without looking for
+ * subtle differences in individual symbols. For example, a change from ASCII
+ * bytes to non-ASCII bytes, or from few matches (generally less compressible)
+ * to many matches (generally more compressible), would be easily noticed based
+ * on the aggregates.
+ *
+ * For determining whether the frequency distributions are "different enough" to
+ * start a new block, the simple heuristic of splitting when the sum of absolute
+ * differences exceeds a constant seems to be good enough.
+ *
+ * Finally, since this is only an approximation, it is not strictly necessary
+ * to consider the exact symbols that will be used. With "near-optimal
+ * parsing", for example, the actual symbols that will be used are unknown
+ * until after the block boundary is chosen and the block has been optimized.
+ * Since the final choices cannot be used, we can use preliminary "greedy"
+ * choices instead.
+ */
+
+/* Initialize the block split statistics when starting a new block. */
+static void
+lzx_init_block_split_stats(struct lzx_block_split_stats *stats)
+{
+	memset(stats, 0, sizeof(*stats));
+}
+
+/* Literal observation. Heuristic: use the top 2 bits and the low bit of the
+ * literal, for 8 possible literal observation types. */
+static attrib_forceinline void
+lzx_observe_literal(struct lzx_block_split_stats *stats, uint8_t lit)
+{
+	stats->new_observations[((lit >> 5) & 0x6) | (lit & 1)]++;
+	stats->num_new_observations++;
+}
+
+/* Match observation. Heuristic: use one observation type for "short match" and
+ * one observation type for "long match". */
+static attrib_forceinline void
+lzx_observe_match(struct lzx_block_split_stats *stats, unsigned length)
+{
+	stats->new_observations[NUM_LITERAL_OBSERVATION_TYPES + (length >= 5)]++;
+	stats->num_new_observations++;
+}
+
+static bool
+lzx_should_end_block(struct lzx_block_split_stats *stats)
+{
+	if (stats->num_observations > 0) {
+
+		/* Note: to avoid slow divisions, we do not divide by
+		 * 'num_observations', but rather do all math with the numbers
+		 * multiplied by 'num_observations'. */
+		uint32_t total_delta = 0;
+		for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+			uint32_t expected = stats->observations[i] *
+					    stats->num_new_observations;
+			uint32_t actual = stats->new_observations[i] *
+					  stats->num_observations;
+			uint32_t delta = (actual > expected) ? actual - expected :
+							       expected - actual;
+			total_delta += delta;
+		}
+
+		/* Ready to end the block? */
+		if (total_delta >=
+		    stats->num_new_observations * 7 / 8 * stats->num_observations)
+			return true;
+	}
+
+	for (int i = 0; i < NUM_OBSERVATION_TYPES; i++) {
+		stats->num_observations += stats->new_observations[i];
+		stats->observations[i] += stats->new_observations[i];
+		stats->new_observations[i] = 0;
+	}
+	stats->num_new_observations = 0;
+	return false;
+}
+
+/******************************************************************************/
+/* Slower ("near-optimal") compression algorithm */
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Least-recently-used queue for match offsets.
+ *
+ * This is represented as a 64-bit integer for efficiency. There are three
+ * offsets of 21 bits each. Bit 64 is garbage.
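+ *
+ * (Packing example, for illustration: R0=1, R1=2, R2=3 would be stored as
+ * ((uint64_t)3 << 42) | ((uint64_t)2 << 21) | 1.)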
+ */ +struct attrib_aligned(8) lzx_lru_queue { + uint64_t R; +}; + +#define LZX_QUEUE_OFFSET_SHIFT 21 +#define LZX_QUEUE_OFFSET_MASK (((uint64_t)1 << LZX_QUEUE_OFFSET_SHIFT) - 1) + +#define LZX_QUEUE_R0_SHIFT (0 * LZX_QUEUE_OFFSET_SHIFT) +#define LZX_QUEUE_R1_SHIFT (1 * LZX_QUEUE_OFFSET_SHIFT) +#define LZX_QUEUE_R2_SHIFT (2 * LZX_QUEUE_OFFSET_SHIFT) + +#define LZX_QUEUE_R0_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R0_SHIFT) +#define LZX_QUEUE_R1_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R1_SHIFT) +#define LZX_QUEUE_R2_MASK (LZX_QUEUE_OFFSET_MASK << LZX_QUEUE_R2_SHIFT) + +static attrib_forceinline uint64_t +lzx_lru_queue_R0(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R0_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline uint64_t +lzx_lru_queue_R1(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R1_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline uint64_t +lzx_lru_queue_R2(struct lzx_lru_queue queue) +{ + return (queue.R >> LZX_QUEUE_R2_SHIFT) & LZX_QUEUE_OFFSET_MASK; +} + +static attrib_forceinline void +lzx_lru_queue_save(uint32_t * restrict out_queue, + const struct lzx_lru_queue * restrict in_queue) +{ + struct lzx_lru_queue queue = *in_queue; + out_queue[0] = lzx_lru_queue_R0(queue); + out_queue[1] = lzx_lru_queue_R1(queue); + out_queue[2] = lzx_lru_queue_R2(queue); +} + +static attrib_forceinline void +lzx_lru_queue_load(struct lzx_lru_queue *restrict out_queue, + const uint32_t *restrict in_queue) +{ + uint64_t r = 0; + r |= (uint64_t)(in_queue[0]) << LZX_QUEUE_R0_SHIFT; + r |= (uint64_t)(in_queue[1]) << LZX_QUEUE_R1_SHIFT; + r |= (uint64_t)(in_queue[2]) << LZX_QUEUE_R2_SHIFT; + out_queue->R = r; +} + +/* Push a match offset onto the front (most recently used) end of the queue. */ +static attrib_forceinline struct lzx_lru_queue +lzx_lru_queue_push(struct lzx_lru_queue queue, uint32_t offset) +{ + return (struct lzx_lru_queue) { + .R = (queue.R << LZX_QUEUE_OFFSET_SHIFT) | offset, + }; +} + +/* Swap a match offset to the front of the queue. */ +static attrib_forceinline struct lzx_lru_queue +lzx_lru_queue_swap(struct lzx_lru_queue queue, unsigned idx) +{ + unsigned shift = idx * 21; + const uint64_t mask = LZX_QUEUE_R0_MASK; + const uint64_t mask_high = mask << shift; + + return (struct lzx_lru_queue) { + (queue.R & ~(mask | mask_high)) | + ((queue.R & mask_high) >> shift) | + ((queue.R & mask) << shift) + }; +} + +static attrib_forceinline uint32_t +lzx_walk_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit, + bool record) +{ + struct lzx_sequence *seq = + &c->chosen_sequences[ARRAY_LEN(c->chosen_sequences) - 1]; + uint32_t node_idx = block_size; + uint32_t litrun_end; /* if record=true: end of the current literal run */ + + if (record) { + /* The last sequence has matchlen 0 */ + seq->litrunlen_and_matchlen = 0; + litrun_end = node_idx; + } + + for (;;) { + uint32_t item; + unsigned matchlen; + uint32_t adjusted_offset; + unsigned mainsym; + + /* Tally literals until either a match or the beginning of the + * block is reached. Note: the item in the node at the + * beginning of the block (c->optimum_nodes[0]) has all bits + * set, causing this loop to end when it is reached. */ + for (;;) { + item = c->optimum_nodes[node_idx].item; + if (item & OPTIMUM_LEN_MASK) + break; + c->freqs.main[item >> OPTIMUM_OFFSET_SHIFT]++; + node_idx--; + } + + #if CONSIDER_GAP_MATCHES + if (item & OPTIMUM_GAP_MATCH) { + if (node_idx == 0) + break; + /* Tally/record the rep0 match after the gap. 
*/ + matchlen = item & OPTIMUM_LEN_MASK; + mainsym = lzx_tally_main_and_lensyms(c, matchlen, 0, + is_16_bit); + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << + SEQ_MATCHLEN_BITS; + seq--; + seq->litrunlen_and_matchlen = matchlen; + seq->adjusted_offset_and_mainsym = mainsym; + litrun_end = node_idx - matchlen; + } + + /* Tally the literal in the gap. */ + c->freqs.main[(uint8_t)(item >> OPTIMUM_OFFSET_SHIFT)]++; + + /* Fall through and tally the match before the gap. + * (It was temporarily saved in the 'cost' field of the + * previous node, which was free to reuse.) */ + item = c->optimum_nodes[--node_idx].cost; + node_idx -= matchlen; + } + #else /* CONSIDER_GAP_MATCHES */ + if (node_idx == 0) + break; + #endif /* !CONSIDER_GAP_MATCHES */ + + /* Tally/record a match. */ + matchlen = item & OPTIMUM_LEN_MASK; + adjusted_offset = item >> OPTIMUM_OFFSET_SHIFT; + mainsym = lzx_tally_main_and_lensyms(c, matchlen, + adjusted_offset, + is_16_bit); + if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + + LZX_OFFSET_ADJUSTMENT) + c->freqs.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]++; + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << SEQ_MATCHLEN_BITS; + seq--; + seq->litrunlen_and_matchlen = matchlen; + seq->adjusted_offset_and_mainsym = + (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym; + litrun_end = node_idx - matchlen; + } + node_idx -= matchlen; + } + + /* Record the literal run length for the first sequence. */ + if (record) { + seq->litrunlen_and_matchlen |= + (litrun_end - node_idx) << SEQ_MATCHLEN_BITS; + } + + /* Return the index in chosen_sequences at which the sequences begin. */ + return seq - &c->chosen_sequences[0]; +} + +/* + * Given the minimum-cost path computed through the item graph for the current + * block, walk the path and count how many of each symbol in each Huffman-coded + * alphabet would be required to output the items (matches and literals) along + * the path. + * + * Note that the path will be walked backwards (from the end of the block to the + * beginning of the block), but this doesn't matter because this function only + * computes frequencies. + */ +static attrib_forceinline void +lzx_tally_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit) +{ + lzx_walk_item_list(c, block_size, is_16_bit, false); +} + +/* + * Like lzx_tally_item_list(), but this function also generates the list of + * lzx_sequences for the minimum-cost path and writes it to c->chosen_sequences, + * ready to be output to the bitstream after the Huffman codes are computed. + * The lzx_sequences will be written to decreasing memory addresses as the path + * is walked backwards, which means they will end up in the expected + * first-to-last order. The return value is the index in c->chosen_sequences at + * which the lzx_sequences begin. + */ +static attrib_forceinline uint32_t +lzx_record_item_list(struct liblzx_compressor *c, uint32_t block_size, bool is_16_bit) +{ + return lzx_walk_item_list(c, block_size, is_16_bit, true); +} + +/* + * Find an inexpensive path through the graph of possible match/literal choices + * for the current block. The nodes of the graph are + * c->optimum_nodes[0...block_size]. They correspond directly to the bytes in + * the current block, plus one extra node for end-of-block. The edges of the + * graph are matches and literals. The goal is to find the minimum cost path + * from 'c->optimum_nodes[0]' to 'c->optimum_nodes[block_size]', given the cost + * model 'c->costs'. 
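+ * (Informally, each step relaxes
+ * cost[i + len] = min(cost[i + len], cost[i] + item_cost),
+ * over the items of length 'len' considered at position 'i', as described
+ * below.)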
+ * + * The algorithm works forwards, starting at 'c->optimum_nodes[0]' and + * proceeding forwards one node at a time. At each node, a selection of matches + * (len >= 2), as well as the literal byte (len = 1), is considered. An item of + * length 'len' provides a new path to reach the node 'len' bytes later. If + * such a path is the lowest cost found so far to reach that later node, then + * that later node is updated with the new cost and the "arrival" which provided + * that cost. + * + * Note that although this algorithm is based on minimum cost path search, due + * to various simplifying assumptions the result is not guaranteed to be the + * true minimum cost, or "optimal", path over the graph of all valid LZX + * representations of this block. + * + * Also, note that because of the presence of the recent offsets queue (which is + * a type of adaptive state), the algorithm cannot work backwards and compute + * "cost to end" instead of "cost to beginning". Furthermore, the way the + * algorithm handles this adaptive state in the "minimum cost" parse is actually + * only an approximation. It's possible for the globally optimal, minimum cost + * path to contain a prefix, ending at a position, where that path prefix is + * *not* the minimum cost path to that position. This can happen if such a path + * prefix results in a different adaptive state which results in lower costs + * later. The algorithm does not solve this problem in general; it only looks + * one step ahead, with the exception of special consideration for "gap + * matches". + */ +static attrib_forceinline struct lzx_lru_queue +lzx_find_min_cost_path(struct liblzx_compressor * const restrict c, + const uint8_t * const restrict block_begin, + const uint32_t block_size, + const struct lzx_lru_queue initial_queue, + bool is_16_bit) +{ + struct lzx_optimum_node *cur_node = c->optimum_nodes; + struct lzx_optimum_node * const end_node = cur_node + block_size; + struct lz_match *cache_ptr = c->match_cache; + const uint8_t *in_next = block_begin; + const uint8_t * const block_end = block_begin + block_size; + + /* + * Instead of storing the match offset LRU queues in the + * 'lzx_optimum_node' structures, we save memory (and cache lines) by + * storing them in a smaller array. This works because the algorithm + * only requires a limited history of the adaptive state. Once a given + * state is more than LZX_MAX_MATCH_LEN bytes behind the current node + * (more if gap match consideration is enabled; we just round up to 512 + * so it's a power of 2), it is no longer needed. + * + * The QUEUE() macro finds the queue for the given node. This macro has + * been optimized by taking advantage of 'struct lzx_lru_queue' and + * 'struct lzx_optimum_node' both being 8 bytes in size and alignment. + */ + struct lzx_lru_queue queues[512]; + STATIC_ASSERT(ARRAY_LEN(queues) >= LZX_MAX_MATCH_LEN + 1); + STATIC_ASSERT(sizeof(c->optimum_nodes[0]) == sizeof(queues[0])); +#define QUEUE(node) \ + (*(struct lzx_lru_queue *)((char *)queues + \ + ((uintptr_t)(node) % (ARRAY_LEN(queues) * sizeof(queues[0]))))) + /*(queues[(uintptr_t)(node) / sizeof(*(node)) % ARRAY_LEN(queues)])*/ + +#if CONSIDER_GAP_MATCHES + uint32_t matches_before_gap[ARRAY_LEN(queues)]; +#define MATCH_BEFORE_GAP(node) \ + (matches_before_gap[(uintptr_t)(node) / sizeof(*(node)) % \ + ARRAY_LEN(matches_before_gap)]) +#endif + + /* + * Initially, the cost to reach each node is "infinity". 
+	 *
+	 * The first node actually should have cost 0, but "infinity"
+	 * (0xFFFFFFFF) works just as well because it immediately overflows.
+	 *
+	 * The following statement also intentionally sets the 'item' of the
+	 * first node, which would otherwise have no meaning, to 0xFFFFFFFF for
+	 * use as a sentinel. See lzx_walk_item_list().
+	 */
+	memset(c->optimum_nodes, 0xFF,
+	       (block_size + 1) * sizeof(c->optimum_nodes[0]));
+
+	/* Initialize the recent offsets queue for the first node. */
+	QUEUE(cur_node) = initial_queue;
+
+	do { /* For each node in the block in position order... */
+
+		unsigned num_matches;
+		unsigned literal;
+		uint32_t cost;
+
+		/*
+		 * A selection of matches for the block was already saved in
+		 * memory so that we don't have to run the uncompressed data
+		 * through the matchfinder on every optimization pass. However,
+		 * we still search for repeat offset matches during each
+		 * optimization pass because we cannot predict the state of the
+		 * recent offsets queue. But as a heuristic, we don't bother
+		 * searching for repeat offset matches if the general-purpose
+		 * matchfinder failed to find any matches.
+		 *
+		 * Note that a match of length n at some offset implies there is
+		 * also a match of length l for LZX_MIN_MATCH_LEN <= l <= n at
+		 * that same offset. In other words, we don't necessarily need
+		 * to use the full length of a match. The key heuristic that
+		 * saves a significant amount of time is that for each
+		 * distinct length, we only consider the smallest offset for
+		 * which that length is available. This heuristic also applies
+		 * to repeat offsets, which we order specially: R0 < R1 < R2 <
+		 * any explicit offset. Of course, this heuristic may
+		 * produce suboptimal results because offset slots in LZX are
+		 * subject to entropy encoding, but in practice this is a useful
+		 * heuristic.
+		 */
+
+		num_matches = cache_ptr->length;
+		cache_ptr++;
+
+		if (num_matches) {
+			struct lz_match *end_matches = cache_ptr + num_matches;
+			unsigned next_len = LZX_MIN_MATCH_LEN;
+			unsigned max_len =
+				min_uint(block_end - in_next, LZX_MAX_MATCH_LEN);
+			const uint8_t *matchptr;
+
+			/* Consider rep0 matches. */
+			matchptr = in_next - lzx_lru_queue_R0(QUEUE(cur_node));
+			if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next))
+				goto rep0_done;
+			STATIC_ASSERT_STMT(LZX_MIN_MATCH_LEN == 2);
+			do {
+				uint32_t cost = cur_node->cost +
+						c->costs.match_cost[0][
+							next_len - LZX_MIN_MATCH_LEN];
+				if (cost <= (cur_node + next_len)->cost) {
+					(cur_node + next_len)->cost = cost;
+					(cur_node + next_len)->item =
+						(0 << OPTIMUM_OFFSET_SHIFT) | next_len;
+				}
+				if (unlikely(++next_len > max_len)) {
+					cache_ptr = end_matches;
+					goto done_matches;
+				}
+			} while (in_next[next_len - 1] == matchptr[next_len - 1]);
+
+		rep0_done:
+
+			/* Consider rep1 matches.
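+			 * (next_len carries over from the rep0 search, so a
+			 * rep1 match is only costed where it matches at least
+			 * as far as the lengths already considered.)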
*/ + matchptr = in_next - lzx_lru_queue_R1(QUEUE(cur_node)); + if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next)) + goto rep1_done; + if (matchptr[next_len - 1] != in_next[next_len - 1]) + goto rep1_done; + for (unsigned len = 2; len < next_len - 1; len++) + if (matchptr[len] != in_next[len]) + goto rep1_done; + do { + uint32_t cost = cur_node->cost + + c->costs.match_cost[1][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost <= (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (1 << OPTIMUM_OFFSET_SHIFT) | next_len; + } + if (unlikely(++next_len > max_len)) { + cache_ptr = end_matches; + goto done_matches; + } + } while (in_next[next_len - 1] == matchptr[next_len - 1]); + + rep1_done: + + /* Consider rep2 matches. */ + matchptr = in_next - lzx_lru_queue_R2(QUEUE(cur_node)); + if (load_u16_unaligned(matchptr) != load_u16_unaligned(in_next)) + goto rep2_done; + if (matchptr[next_len - 1] != in_next[next_len - 1]) + goto rep2_done; + for (unsigned len = 2; len < next_len - 1; len++) + if (matchptr[len] != in_next[len]) + goto rep2_done; + do { + uint32_t cost = cur_node->cost + + c->costs.match_cost[2][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost <= (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (2 << OPTIMUM_OFFSET_SHIFT) | next_len; + } + if (unlikely(++next_len > max_len)) { + cache_ptr = end_matches; + goto done_matches; + } + } while (in_next[next_len - 1] == matchptr[next_len - 1]); + + rep2_done: + + while (next_len > cache_ptr->length) + if (++cache_ptr == end_matches) + goto done_matches; + + /* Consider explicit offset matches. */ + for (;;) { + uint32_t offset = cache_ptr->offset; + uint32_t adjusted_offset = offset + LZX_OFFSET_ADJUSTMENT; + unsigned offset_slot = lzx_get_offset_slot(c, adjusted_offset, is_16_bit); + uint32_t base_cost = cur_node->cost; + uint32_t cost; + + #if CONSIDER_ALIGNED_COSTS + if (offset >= LZX_MIN_ALIGNED_OFFSET) + base_cost += c->costs.aligned[adjusted_offset & + LZX_ALIGNED_OFFSET_BITMASK]; + #endif + do { + cost = base_cost + + c->costs.match_cost[offset_slot][ + next_len - LZX_MIN_MATCH_LEN]; + if (cost < (cur_node + next_len)->cost) { + (cur_node + next_len)->cost = cost; + (cur_node + next_len)->item = + (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | next_len; + } + } while (++next_len <= cache_ptr->length); + + if (++cache_ptr == end_matches) { + #if CONSIDER_GAP_MATCHES + /* Also consider the longest explicit + * offset match as a "gap match": match + * + lit + rep0. 
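+					 * (That is: the full-length explicit
+					 * match, then the first mismatching
+					 * byte coded as a literal, then a rep0
+					 * match at the same offset resuming
+					 * right after it.)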
*/ + int32_t remaining = (block_end - in_next) - (int32_t)next_len; + if (likely(remaining >= 2)) { + const uint8_t *strptr = in_next + next_len; + const uint8_t *matchptr = strptr - offset; + if (load_u16_unaligned(strptr) == load_u16_unaligned(matchptr)) { + STATIC_ASSERT(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2 >= 250); + STATIC_ASSERT(ARRAY_LEN(queues) == ARRAY_LEN(matches_before_gap)); + unsigned limit = min_uint(remaining, + min_uint(ARRAY_LEN(queues) - LZX_MAX_MATCH_LEN - 2, + LZX_MAX_MATCH_LEN)); + unsigned rep0_len = lz_extend(strptr, matchptr, 2, limit); + uint8_t lit = strptr[-1]; + unsigned total_len = next_len + rep0_len; + cost += c->costs.main[lit] + + c->costs.match_cost[0][rep0_len - LZX_MIN_MATCH_LEN]; + if (cost < (cur_node + total_len)->cost) { + (cur_node + total_len)->cost = cost; + (cur_node + total_len)->item = + OPTIMUM_GAP_MATCH | + ((uint32_t)lit << OPTIMUM_OFFSET_SHIFT) | + rep0_len; + MATCH_BEFORE_GAP(cur_node + total_len) = + (adjusted_offset << OPTIMUM_OFFSET_SHIFT) | + (next_len - 1); + } + } + } + #endif /* CONSIDER_GAP_MATCHES */ + break; + } + } + } + + done_matches: + + /* Consider coding a literal. + + * To avoid an extra branch, actually checking the preferability + * of coding the literal is integrated into the queue update + * code below. */ + literal = *in_next++; + cost = cur_node->cost + c->costs.main[literal]; + + /* Advance to the next position. */ + cur_node++; + + /* The lowest-cost path to the current position is now known. + * Finalize the recent offsets queue that results from taking + * this lowest-cost path. */ + + if (cost <= cur_node->cost) { + /* Literal: queue remains unchanged. */ + cur_node->cost = cost; + cur_node->item = (uint32_t)literal << OPTIMUM_OFFSET_SHIFT; + QUEUE(cur_node) = QUEUE(cur_node - 1); + } else { + /* Match: queue update is needed. */ + unsigned len = cur_node->item & OPTIMUM_LEN_MASK; + #if CONSIDER_GAP_MATCHES + int32_t adjusted_offset = (int32_t)cur_node->item >> OPTIMUM_OFFSET_SHIFT; + STATIC_ASSERT(OPTIMUM_GAP_MATCH == 0x80000000); /* assuming sign extension */ + #else + uint32_t adjusted_offset = cur_node->item >> OPTIMUM_OFFSET_SHIFT; + #endif + + if (adjusted_offset >= LZX_NUM_RECENT_OFFSETS) { + /* Explicit offset match: insert offset at front. */ + QUEUE(cur_node) = + lzx_lru_queue_push(QUEUE(cur_node - len), + adjusted_offset - LZX_OFFSET_ADJUSTMENT); + } + #if CONSIDER_GAP_MATCHES + else if (adjusted_offset < 0) { + /* "Gap match": Explicit offset match, then a + * literal, then rep0 match. Save the explicit + * offset match information in the cost field of + * the previous node, which isn't needed + * anymore. Then insert the offset at the front + * of the queue. */ + uint32_t match_before_gap = MATCH_BEFORE_GAP(cur_node); + (cur_node - 1)->cost = match_before_gap; + QUEUE(cur_node) = + lzx_lru_queue_push(QUEUE(cur_node - len - 1 - + (match_before_gap & OPTIMUM_LEN_MASK)), + (match_before_gap >> OPTIMUM_OFFSET_SHIFT) - + LZX_OFFSET_ADJUSTMENT); + } + #endif + else { + /* Repeat offset match: swap offset to front. */ + QUEUE(cur_node) = + lzx_lru_queue_swap(QUEUE(cur_node - len), + adjusted_offset); + } + } + } while (cur_node != end_node); + + /* Return the recent offsets queue at the end of the path. */ + return QUEUE(cur_node); +} + +/* + * Given the costs for the main and length codewords (c->costs.main and + * c->costs.len), initialize the match cost array (c->costs.match_cost) which + * directly provides the cost of every possible (length, offset slot) pair. 
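+ *
+ * (Informally: match_cost[slot][len - LZX_MIN_MATCH_LEN] combines the main
+ * symbol cost, the length symbol cost when len >= LZX_MIN_SECONDARY_LEN, and
+ * the verbatim extra offset bits times BIT_COST, minus the aligned bits when
+ * CONSIDER_ALIGNED_COSTS accounts for those separately.)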
+ */ +static void +lzx_compute_match_costs(struct liblzx_compressor *c) +{ + unsigned num_offset_slots = (c->num_main_syms - LZX_NUM_CHARS) / + LZX_NUM_LEN_HEADERS; + struct lzx_costs *costs = &c->costs; + unsigned main_symbol = LZX_NUM_CHARS; + + for (unsigned offset_slot = 0; offset_slot < num_offset_slots; + offset_slot++) + { + uint32_t extra_cost = lzx_extra_offset_bits[offset_slot] * BIT_COST; + unsigned i; + + #if CONSIDER_ALIGNED_COSTS + if (offset_slot >= LZX_MIN_ALIGNED_OFFSET_SLOT) + extra_cost -= LZX_NUM_ALIGNED_OFFSET_BITS * BIT_COST; + #endif + + for (i = 0; i < LZX_NUM_PRIMARY_LENS; i++) { + costs->match_cost[offset_slot][i] = + costs->main[main_symbol++] + extra_cost; + } + + extra_cost += costs->main[main_symbol++]; + + for (; i < LZX_NUM_LENS; i++) { + costs->match_cost[offset_slot][i] = + costs->len[i - LZX_NUM_PRIMARY_LENS] + + extra_cost; + } + } +} + +typedef struct fixed32frac_s { + uint32_t value; +} fixed32frac; + +typedef struct fixed32_s { + uint64_t value; +} fixed32; + +/* + * Fast approximation for log2f(x). This is not as accurate as the standard C + * version. It does not need to be perfectly accurate because it is only used + * for estimating symbol costs, which is very approximate anyway. + */ +struct log2_fixed_table_pair +{ + uint32_t multiplier; + uint32_t log_add; +}; + +static const struct log2_fixed_table_pair log2_fixed_table_0[255] = { + { 0xff01fc08u, 0x16f2dcfu }, { 0xfe05ee36u, 0x2dceffeu }, + { 0xfd0bd0beu, 0x4494959u }, { 0xfc139debu, 0x5b43ca6u }, + { 0xfb1d5020u, 0x71dcca2u }, { 0xfa28e1d4u, 0x885fc02u }, + { 0xf9364d94u, 0x9eccd73u }, { 0xf8458e02u, 0xb52439au }, + { 0xf7569dd6u, 0xcb66115u }, { 0xf66977dau, 0xe19287au }, + { 0xf57e16edu, 0xf7a9c58u }, { 0xf4947602u, 0x10dabf37u }, + { 0xf3ac901fu, 0x12399395u }, { 0xf2c6605bu, 0x13971beeu }, + { 0xf1e1e1e2u, 0x14f35ab3u }, { 0xf0ff0ff1u, 0x164e524fu }, + { 0xf01de5d7u, 0x17a80527u }, { 0xef3e5ef4u, 0x19007598u }, + { 0xee6076bau, 0x1a57a5f9u }, { 0xed8428aau, 0x1bad989cu }, + { 0xeca97059u, 0x1d024fc9u }, { 0xebd04968u, 0x1e55cdc7u }, + { 0xeaf8af8bu, 0x1fa814d2u }, { 0xea229e85u, 0x20f92722u }, + { 0xe94e1228u, 0x224906e7u }, { 0xe87b0655u, 0x2397b64fu }, + { 0xe7a976fdu, 0x24e5377du }, { 0xe6d9601du, 0x26318c93u }, + { 0xe60abdc3u, 0x277cb7acu }, { 0xe53d8c0bu, 0x28c6bad9u }, + { 0xe471c71du, 0x2a0f982cu }, { 0xe3a76b2fu, 0x2b5751aeu }, + { 0xe2de7486u, 0x2c9de963u }, { 0xe216df74u, 0x2de36147u }, + { 0xe150a854u, 0x2f27bb59u }, { 0xe08bcb94u, 0x306af989u }, + { 0xdfc845a9u, 0x31ad1dc8u }, { 0xdf061318u, 0x32ee29feu }, + { 0xde45306fu, 0x342e2014u }, { 0xdd859a4au, 0x356d01e8u }, + { 0xdcc74d51u, 0x36aad154u }, { 0xdc0a4635u, 0x37e79032u }, + { 0xdb4e81b5u, 0x39234051u }, { 0xda93fc99u, 0x3a5de380u }, + { 0xd9dab3b6u, 0x3b977b86u }, { 0xd922a3e9u, 0x3cd00a2au }, + { 0xd86bca1bu, 0x3e07912bu }, { 0xd7b62341u, 0x3f3e1241u }, + { 0xd701ac57u, 0x40738f27u }, { 0xd64e6266u, 0x41a8098eu }, + { 0xd59c427fu, 0x42db8323u }, { 0xd4eb49bcu, 0x440dfd94u }, + { 0xd43b7544u, 0x453f7a82u }, { 0xd38cc244u, 0x466ffb93u }, + { 0xd2df2df3u, 0x479f8265u }, { 0xd232b593u, 0x48ce108eu }, + { 0xd187566cu, 0x49fba7a9u }, { 0xd0dd0dd1u, 0x4b284946u }, + { 0xd033d91du, 0x4c53f6f4u }, { 0xcf8bb5b4u, 0x4d7eb23bu }, + { 0xcee4a102u, 0x4ea87ca4u }, { 0xce3e987au, 0x4fd157b4u }, + { 0xcd99999au, 0x50f944e7u }, { 0xccf5a1e5u, 0x522045bcu }, + { 0xcc52aee8u, 0x53465baau }, { 0xcbb0be38u, 0x546b8825u }, + { 0xcb0fcd6fu, 0x558fcca0u }, { 0xca6fda31u, 0x56b32a89u }, + { 0xc9d0e229u, 0x57d5a34au }, { 0xc932e309u, 
0x58f7384au }, + { 0xc895da89u, 0x5a17eaf0u }, { 0xc7f9c66bu, 0x5b37bc99u }, + { 0xc75ea476u, 0x5c56aea2u }, { 0xc6c47277u, 0x5d74c26au }, + { 0xc62b2e44u, 0x5e91f945u }, { 0xc592d5b8u, 0x5fae5488u }, + { 0xc4fb66b5u, 0x60c9d584u }, { 0xc464df24u, 0x61e47d87u }, + { 0xc3cf3cf4u, 0x62fe4dddu }, { 0xc33a7e1au, 0x641747cdu }, + { 0xc2a6a091u, 0x652f6c9eu }, { 0xc213a25cu, 0x6646bd8fu }, + { 0xc1818182u, 0x675d3be2u }, { 0xc0f03c0fu, 0x6872e8d5u }, + { 0xc05fd018u, 0x6987c59fu }, { 0xbfd03bb5u, 0x6a9bd379u }, + { 0xbf417d06u, 0x6baf1395u }, { 0xbeb3922eu, 0x6cc18729u }, + { 0xbe267957u, 0x6dd32f61u }, { 0xbd9a30b1u, 0x6ee40d69u }, + { 0xbd0eb670u, 0x6ff4226du }, { 0xbc8408cdu, 0x71036f95u }, + { 0xbbfa2609u, 0x7211f601u }, { 0xbb710c66u, 0x731fb6d9u }, + { 0xbae8ba2fu, 0x742cb339u }, { 0xba612db0u, 0x7538ec41u }, + { 0xb9da653eu, 0x7644630au }, { 0xb9545f30u, 0x774f18adu }, + { 0xb8cf19e3u, 0x78590e41u }, { 0xb84a93b8u, 0x796244d9u }, + { 0xb7c6cb15u, 0x7a6abd86u }, { 0xb743be65u, 0x7b727958u }, + { 0xb6c16c17u, 0x7c79795bu }, { 0xb63fd29du, 0x7d7fbe9eu }, + { 0xb5bef071u, 0x7e854a23u }, { 0xb53ec40eu, 0x7f8a1cf4u }, + { 0xb4bf4bf5u, 0x808e3815u }, { 0xb44086aau, 0x81919c88u }, + { 0xb3c272b6u, 0x82944b4cu }, { 0xb3450ea6u, 0x83964560u }, + { 0xb2c8590bu, 0x84978bc0u }, { 0xb24c507au, 0x85981f63u }, + { 0xb1d0f38cu, 0x86980143u }, { 0xb15640ddu, 0x87973255u }, + { 0xb0dc370eu, 0x8895b38du }, { 0xb062d4c3u, 0x899385ddu }, + { 0xafea18a4u, 0x8a90aa35u }, { 0xaf72015du, 0x8b8d2181u }, + { 0xaefa8d9eu, 0x8c88ecadu }, { 0xae83bc18u, 0x8d840ca6u }, + { 0xae0d8b83u, 0x8e7e8252u }, { 0xad97fa99u, 0x8f784e96u }, + { 0xad230816u, 0x90717259u }, { 0xacaeb2bbu, 0x9169ee7eu }, + { 0xac3af94cu, 0x9261c3e6u }, { 0xabc7da92u, 0x9358f36cu }, + { 0xab555555u, 0x944f7df3u }, { 0xaae36865u, 0x95456453u }, + { 0xaa721292u, 0x963aa768u }, { 0xaa0152b0u, 0x972f4809u }, + { 0xa9912796u, 0x9823470fu }, { 0xa9219020u, 0x9916a54au }, + { 0xa8b28b29u, 0x9a096393u }, { 0xa8441792u, 0x9afb82bau }, + { 0xa7d6343fu, 0x9bed038du }, { 0xa768e015u, 0x9cdde6ddu }, + { 0xa6fc19fdu, 0x9dce2d77u }, { 0xa68fe0e4u, 0x9ebdd823u }, + { 0xa62433b8u, 0x9face7adu }, { 0xa5b91169u, 0xa09b5cdfu }, + { 0xa54e78edu, 0xa189387du }, { 0xa4e46939u, 0xa2767b4fu }, + { 0xa47ae148u, 0xa3632616u }, { 0xa411e014u, 0xa44f3999u }, + { 0xa3a9649eu, 0xa53ab692u }, { 0xa3416de5u, 0xa6259dc7u }, + { 0xa2d9faeeu, 0xa70feff2u }, { 0xa2730abfu, 0xa7f9add0u }, + { 0xa20c9c60u, 0xa8e2d81du }, { 0xa1a6aedcu, 0xa9cb6f95u }, + { 0xa1414141u, 0xaab374eeu }, { 0xa0dc529fu, 0xab9ae8dfu }, + { 0xa077e207u, 0xac81cc1fu }, { 0xa013ee8fu, 0xad681f62u }, + { 0x9fb0774du, 0xae4de35au }, { 0x9f4d7b5au, 0xaf3318bau }, + { 0x9eeaf9d1u, 0xb017c033u }, { 0x9e88f1d0u, 0xb0fbda74u }, + { 0x9e276276u, 0xb1df682bu }, { 0x9dc64ae5u, 0xb2c26a06u }, + { 0x9d65aa42u, 0xb3a4e0acu }, { 0x9d057fb2u, 0xb486cccbu }, + { 0x9ca5ca5du, 0xb5682f0cu }, { 0x9c46896du, 0xb6490816u }, + { 0x9be7bc0eu, 0xb7295893u }, { 0x9b896170u, 0xb8092121u }, + { 0x9b2b78c1u, 0xb8e8626cu }, { 0x9ace0134u, 0xb9c71d13u }, + { 0x9a70f9fdu, 0xbaa551b9u }, { 0x9a146253u, 0xbb8300fdu }, + { 0x99b8396cu, 0xbc602b82u }, { 0x995c7e82u, 0xbd3cd1e6u }, + { 0x990130d1u, 0xbe18f4c6u }, { 0x98a64f97u, 0xbef494bdu }, + { 0x984bda13u, 0xbfcfb267u }, { 0x97f1cf85u, 0xc0aa4e5fu }, + { 0x97982f30u, 0xc184693fu }, { 0x973ef859u, 0xc25e039eu }, + { 0x96e62a46u, 0xc3371e12u }, { 0x968dc43fu, 0xc40fb932u }, + { 0x9635c58du, 0xc4e7d594u }, { 0x95de2d7cu, 0xc5bf73cau }, + { 0x9586fb58u, 0xc696946au }, { 0x95302e70u, 0xc76d3803u }, + { 
0x94d9c615u, 0xc8435f25u }, { 0x9483c197u, 0xc9190a64u }, + { 0x942e204au, 0xc9ee3a4eu }, { 0x93d8e182u, 0xcac2ef71u }, + { 0x93840497u, 0xcb972a58u }, { 0x932f88e0u, 0xcc6aeb90u }, + { 0x92db6db7u, 0xcd3e33a3u }, { 0x9287b275u, 0xce110320u }, + { 0x92345678u, 0xcee35a8du }, { 0x91e1591eu, 0xcfb53a70u }, + { 0x918eb9c5u, 0xd086a355u }, { 0x913c77ceu, 0xd15795c2u }, + { 0x90ea929bu, 0xd228123cu }, { 0x90990990u, 0xd2f81946u }, + { 0x9047dc12u, 0xd3c7ab65u }, { 0x8ff70986u, 0xd496c91du }, + { 0x8fa69154u, 0xd56572f1u }, { 0x8f5672e4u, 0xd633a963u }, + { 0x8f06ada2u, 0xd7016cf0u }, { 0x8eb740f9u, 0xd7cebe18u }, + { 0x8e682c54u, 0xd89b9d5fu }, { 0x8e196f23u, 0xd9680b3du }, + { 0x8dcb08d4u, 0xda340833u }, { 0x8d7cf8d8u, 0xdaff94bcu }, + { 0x8d2f3ea0u, 0xdbcab157u }, { 0x8ce1d9a0u, 0xdc955e7au }, + { 0x8c94c94cu, 0xdd5f9ca0u }, { 0x8c480d19u, 0xde296c45u }, + { 0x8bfba47eu, 0xdef2cddeu }, { 0x8baf8ef2u, 0xdfbbc1e5u }, + { 0x8b63cbeeu, 0xe08448d1u }, { 0x8b185aedu, 0xe14c6316u }, + { 0x8acd3b69u, 0xe214112du }, { 0x8a826cdeu, 0xe2db5389u }, + { 0x8a37eecau, 0xe3a22a9fu }, { 0x89edc0acu, 0xe46896deu }, + { 0x89a3e202u, 0xe52e98bfu }, { 0x895a524eu, 0xe5f430b0u }, + { 0x89111111u, 0xe6b95f22u }, { 0x88c81dceu, 0xe77e2486u }, + { 0x887f7808u, 0xe842814eu }, { 0x88371f45u, 0xe90675e5u }, + { 0x87ef130au, 0xe9ca02bcu }, { 0x87a752dfu, 0xea8d283du }, + { 0x875fde4au, 0xeb4fe6dau }, { 0x8718b4d4u, 0xec123effu }, + { 0x86d1d608u, 0xecd43114u }, { 0x868b4170u, 0xed95bd86u }, + { 0x8644f698u, 0xee56e4beu }, { 0x85fef50du, 0xef17a726u }, + { 0x85b93c5bu, 0xefd8052bu }, { 0x8573cc12u, 0xf097ff30u }, + { 0x852ea3c2u, 0xf157959cu }, { 0x84e9c2f9u, 0xf216c8ddu }, + { 0x84a5294au, 0xf2d59955u }, { 0x8460d647u, 0xf3940768u }, + { 0x841cc982u, 0xf4521381u }, { 0x83d90290u, 0xf50fbdffu }, + { 0x83958106u, 0xf5cd0747u }, { 0x83524478u, 0xf689efc0u }, + { 0x830f4c7eu, 0xf74677c9u }, { 0x82cc98afu, 0xf8029fc5u }, + { 0x828a28a2u, 0xf8be6819u }, { 0x8247fbf2u, 0xf979d120u }, + { 0x82061236u, 0xfa34db42u }, { 0x81c46b0bu, 0xfaef86d9u }, + { 0x8183060cu, 0xfba9d445u }, { 0x8141e2d4u, 0xfc63c3e8u }, + { 0x81010101u, 0xfd1d561du }, { 0x80c06030u, 0xfdd68b44u }, + { 0x80800000u, 0xfe8f63b9u }, +}; + +static const uint32_t log2_fixed_table_1[255] = { + 0x17152u, 0x2e2a3u, 0x453f2u, 0x5c540u, 0x7368du, 0x8a7d8u, + 0xa1921u, 0xb8a69u, 0xcfbb0u, 0xe6cf6u, 0xfde39u, 0x114f7cu, + 0x12c0bdu, 0x1431fcu, 0x15a33au, 0x171477u, 0x1885b2u, 0x19f6ecu, + 0x1b6824u, 0x1cd95bu, 0x1e4a91u, 0x1fbbc5u, 0x212cf7u, 0x229e28u, + 0x240f58u, 0x258086u, 0x26f1b3u, 0x2862deu, 0x29d408u, 0x2b4530u, + 0x2cb657u, 0x2e277du, 0x2f98a1u, 0x3109c4u, 0x327ae5u, 0x33ec05u, + 0x355d23u, 0x36ce40u, 0x383f5bu, 0x39b077u, 0x3b218fu, 0x3c92a6u, + 0x3e03bcu, 0x3f74d0u, 0x40e5e3u, 0x4256f4u, 0x43c804u, 0x453912u, + 0x46aa1fu, 0x481b2bu, 0x498c36u, 0x4afd3fu, 0x4c6e46u, 0x4ddf4cu, + 0x4f5050u, 0x50c153u, 0x523254u, 0x53a355u, 0x551454u, 0x568551u, + 0x57f64cu, 0x596747u, 0x5ad83fu, 0x5c4938u, 0x5dba2eu, 0x5f2b22u, + 0x609c15u, 0x620d06u, 0x637df8u, 0x64eee6u, 0x665fd3u, 0x67d0bfu, + 0x6941abu, 0x6ab293u, 0x6c237bu, 0x6d9461u, 0x6f0546u, 0x707629u, + 0x71e70bu, 0x7357ecu, 0x74c8cbu, 0x7639a8u, 0x77aa84u, 0x791b5fu, + 0x7a8c38u, 0x7bfd10u, 0x7d6de7u, 0x7edebbu, 0x804f8eu, 0x81c061u, + 0x833131u, 0x84a202u, 0x8612cfu, 0x87839au, 0x88f466u, 0x8a652fu, + 0x8bd5f8u, 0x8d46beu, 0x8eb784u, 0x902847u, 0x91990au, 0x9309cau, + 0x947a89u, 0x95eb47u, 0x975c03u, 0x98ccbfu, 0x9a3d79u, 0x9bae31u, + 0x9d1ee8u, 0x9e8f9cu, 0xa00051u, 0xa17103u, 0xa2e1b4u, 0xa45263u, + 0xa5c312u, 
0xa733bfu, 0xa8a469u, 0xaa1513u, 0xab85bcu, 0xacf662u, + 0xae6708u, 0xafd7adu, 0xb1484eu, 0xb2b8f0u, 0xb42990u, 0xb59a2du, + 0xb70acbu, 0xb87b67u, 0xb9ec01u, 0xbb5c98u, 0xbccd30u, 0xbe3dc6u, + 0xbfae5au, 0xc11eedu, 0xc28f7fu, 0xc4000eu, 0xc5709cu, 0xc6e12au, + 0xc851b5u, 0xc9c240u, 0xcb32c9u, 0xcca350u, 0xce13d6u, 0xcf845bu, + 0xd0f4deu, 0xd2655fu, 0xd3d5dfu, 0xd5465eu, 0xd6b6dbu, 0xd82757u, + 0xd997d2u, 0xdb084au, 0xdc78c2u, 0xdde938u, 0xdf59acu, 0xe0ca1fu, + 0xe23a91u, 0xe3ab01u, 0xe51b70u, 0xe68bdfu, 0xe7fc4au, 0xe96cb5u, + 0xeadd1du, 0xec4d85u, 0xedbdecu, 0xef2e51u, 0xf09eb4u, 0xf20f17u, + 0xf37f77u, 0xf4efd6u, 0xf66033u, 0xf7d090u, 0xf940eau, 0xfab144u, + 0xfc219cu, 0xfd91f2u, 0xff0248u, 0x100729bu, 0x101e2eeu, 0x103533eu, + 0x104c38eu, 0x10633dbu, 0x107a428u, 0x1091472u, 0x10a84bdu, 0x10bf504u, + 0x10d654bu, 0x10ed591u, 0x11045d4u, 0x111b617u, 0x1132658u, 0x1149697u, + 0x11606d6u, 0x1177713u, 0x118e74du, 0x11a5787u, 0x11bc7c0u, 0x11d37f7u, + 0x11ea82du, 0x1201860u, 0x1218892u, 0x122f8c4u, 0x12468f4u, 0x125d922u, + 0x127494fu, 0x128b97bu, 0x12a29a5u, 0x12b99ceu, 0x12d09f5u, 0x12e7a1bu, + 0x12fea3fu, 0x1315a62u, 0x132ca83u, 0x1343aa3u, 0x135aac1u, 0x1371adeu, + 0x1388afau, 0x139fb15u, 0x13b6b2eu, 0x13cdb45u, 0x13e4b5bu, 0x13fbb6fu, + 0x1412b83u, 0x1429b94u, 0x1440ba4u, 0x1457bb4u, 0x146ebc1u, 0x1485bccu, + 0x149cbd8u, 0x14b3be0u, 0x14cabe8u, 0x14e1beeu, 0x14f8bf2u, 0x150fbf6u, + 0x1526bf9u, 0x153dbf9u, 0x1554bf8u, 0x156bbf5u, 0x1582bf2u, 0x1599becu, + 0x15b0be6u, 0x15c7bdeu, 0x15debd3u, 0x15f5bc9u, 0x160cbbdu, 0x1623bafu, + 0x163ab9fu, 0x1651b8eu, 0x1668b7du, 0x167fb69u, 0x1696b54u, 0x16adb3eu, + 0x16c4b26u, 0x16dbb0du, 0x16f2af3u, +}; + +static int32_t +log2_fixed_fast_normalized(const fixed32 *value, int multiplier_bits) +{ + int32_t base_pos = 0; + uint32_t mantissa = 0; + uint32_t mantissa_log2 = 0; + uint8_t mantissa_byte = 0; + int64_t final_log = 0; + + if (value->value == 0) + return 0; + + base_pos = (int)bsr64(value->value) - 32; + if (base_pos > 0) { + mantissa = (value->value >> base_pos) & 0xffffffffu; + } else { + mantissa = (value->value << -base_pos) & 0xffffffffu; + } + + /* Get the first byte */ + mantissa_byte = (mantissa >> 24) & 0xffu; + + if (mantissa_byte != 0) { + const struct log2_fixed_table_pair *pair = log2_fixed_table_0 + (mantissa_byte - 1); + mantissa = (uint32_t)((mantissa * (uint64_t)pair->multiplier) >> 32) + pair->multiplier; + + mantissa_log2 += pair->log_add; + } + + mantissa_byte = (mantissa >> 16) & 0xffu; + + if (mantissa_byte != 0) { + mantissa_log2 += log2_fixed_table_1[mantissa_byte - 1]; + } + + final_log = (int64_t)mantissa_log2 + ((int64_t)base_pos << 32); + final_log /= ((int64_t)1 << (32 - multiplier_bits)); + + return (int32_t)final_log; +} + +/* + * Return the estimated cost of a symbol which has been estimated to have the + * given probability. + */ +static uint32_t +lzx_cost_for_probability(const fixed32* prob) +{ + /* + * The basic formula is: + * + * entropy = -log2(probability) + * + * Use this to get the cost in fractional bits. Then multiply by our + * scaling factor of BIT_COST and convert to an integer. + * + * In addition, the minimum cost is BIT_COST (one bit) because the + * entropy coding method will be Huffman codes. + * + * Careful: even though 'prob' should be <= 1.0, 'log2f_fast(prob)' may + * be positive due to inaccuracy in our log2 approximation. Therefore, + * we cannot, in general, assume the computed cost is non-negative, and + * we should make sure negative costs get rounded up correctly. 
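+ *
+ * For example, with BIT_COST == 64: a symbol of probability 1/2 costs
+ * -log2(1/2) = 1 bit, i.e. 64; one of probability 1/32 costs 5 bits, i.e.
+ * 320; and a probability so close to 1 that the approximation overshoots
+ * is clamped to the one-bit minimum.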
+ */
+ int32_t cost = -log2_fixed_fast_normalized(prob, BIT_COST_BITS);
+ if (cost < (int32_t)BIT_COST)
+ cost = BIT_COST;
+ return (uint32_t)cost;
+}
+
+/*
+ * Mapping: number of used literals => heuristic probability of a literal times
+ * 6870. Generated by running this R command:
+ *
+ * cat(paste(round(6870*2^-((304+(0:256))/64)), collapse=", "))
+ */
+static const uint8_t literal_scaled_probs[257] = {
+ 255, 253, 250, 247, 244, 242, 239, 237, 234, 232, 229, 227, 224, 222,
+ 219, 217, 215, 212, 210, 208, 206, 203, 201, 199, 197, 195, 193, 191,
+ 189, 186, 184, 182, 181, 179, 177, 175, 173, 171, 169, 167, 166, 164,
+ 162, 160, 159, 157, 155, 153, 152, 150, 149, 147, 145, 144, 142, 141,
+ 139, 138, 136, 135, 133, 132, 130, 129, 128, 126, 125, 124, 122, 121,
+ 120, 118, 117, 116, 115, 113, 112, 111, 110, 109, 107, 106, 105, 104,
+ 103, 102, 101, 100, 98, 97, 96, 95, 94, 93, 92, 91, 90, 89, 88, 87, 86,
+ 86, 85, 84, 83, 82, 81, 80, 79, 78, 78, 77, 76, 75, 74, 73, 73, 72, 71,
+ 70, 70, 69, 68, 67, 67, 66, 65, 65, 64, 63, 62, 62, 61, 60, 60, 59, 59,
+ 58, 57, 57, 56, 55, 55, 54, 54, 53, 53, 52, 51, 51, 50, 50, 49, 49, 48,
+ 48, 47, 47, 46, 46, 45, 45, 44, 44, 43, 43, 42, 42, 41, 41, 40, 40, 40,
+ 39, 39, 38, 38, 38, 37, 37, 36, 36, 36, 35, 35, 34, 34, 34, 33, 33, 33,
+ 32, 32, 32, 31, 31, 31, 30, 30, 30, 29, 29, 29, 28, 28, 28, 27, 27, 27,
+ 27, 26, 26, 26, 25, 25, 25, 25, 24, 24, 24, 24, 23, 23, 23, 23, 22, 22,
+ 22, 22, 21, 21, 21, 21, 20, 20, 20, 20, 20, 19, 19, 19, 19, 19, 18, 18,
+ 18, 18, 18, 17, 17, 17, 17, 17, 16, 16, 16, 16
+};
+
+/*
+ * Mapping: length symbol => default cost of that symbol. This is derived from
+ * sample data but has been slightly edited to add more bias towards the
+ * shortest lengths, which are the most common.
+ */
+static const uint16_t lzx_default_len_costs[LZX_LENCODE_NUM_SYMBOLS] = {
+ 300, 310, 320, 330, 360, 396, 399, 416, 451, 448, 463, 466, 505, 492,
+ 503, 514, 547, 531, 566, 561, 589, 563, 592, 586, 623, 602, 639, 627,
+ 659, 643, 657, 650, 685, 662, 661, 672, 685, 686, 696, 680, 657, 682,
+ 666, 699, 674, 699, 679, 709, 688, 712, 692, 714, 694, 716, 698, 712,
+ 706, 727, 714, 727, 713, 723, 712, 718, 719, 719, 720, 735, 725, 735,
+ 728, 740, 727, 739, 727, 742, 716, 733, 733, 740, 738, 746, 737, 747,
+ 738, 745, 736, 748, 742, 749, 745, 749, 743, 748, 741, 752, 745, 752,
+ 747, 750, 747, 752, 748, 753, 750, 752, 753, 753, 749, 744, 752, 755,
+ 753, 756, 745, 748, 746, 745, 723, 757, 755, 758, 755, 758, 752, 757,
+ 754, 757, 755, 759, 755, 758, 753, 755, 755, 758, 757, 761, 755, 750,
+ 758, 759, 759, 760, 758, 751, 757, 757, 759, 759, 758, 759, 758, 761,
+ 750, 761, 758, 760, 759, 761, 758, 761, 760, 752, 759, 760, 759, 759,
+ 757, 762, 760, 761, 761, 748, 761, 760, 762, 763, 752, 762, 762, 763,
+ 762, 762, 763, 763, 762, 763, 762, 763, 762, 763, 763, 764, 763, 762,
+ 763, 762, 762, 762, 764, 764, 763, 764, 763, 763, 763, 762, 763, 763,
+ 762, 764, 764, 763, 762, 763, 763, 763, 763, 762, 764, 763, 762, 764,
+ 764, 763, 763, 765, 764, 764, 762, 763, 764, 765, 763, 764, 763, 764,
+ 762, 764, 764, 754, 763, 764, 763, 763, 762, 763, 584,
+};
+
+static void
+fixed_rcp_approx(fixed32frac *result, uint32_t i)
+{
+ result->value = ((uint64_t)0x100000000ull) / i;
+}
+
+static void
+fixed_set(fixed32 *result, uint32_t i)
+{
+ result->value = ((uint64_t)i << 32);
+}
+
+static void
+fixed_set_fraction(fixed32frac *result, uint32_t num, uint32_t denom)
+{
+ result->value = (((uint64_t)num) << 32) / denom;
+}
+
+static void
+fixed_mul_uint_frac(fixed32 *result, uint32_t a, const fixed32frac *b)
+{
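+ /* A 32-bit integer times a 0.32 fixed-point fraction yields a
+ * 32.32 fixed-point value; the 64-bit product cannot overflow. */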
+ result->value = ((uint64_t)a) * (uint64_t)b->value;
+}
+
+static void
+fixed_mul_uint_frac_to_frac(fixed32frac *result, uint32_t a, const fixed32frac *b)
+{
+ fixed32 fixed;
+ fixed_mul_uint_frac(&fixed, a, b);
+ result->value = (uint32_t)fixed.value;
+}
+
+static void
+fixed_add_frac(fixed32 *result, const fixed32 *a, const fixed32frac *b)
+{
+ result->value = a->value + b->value;
+}
+
+static void
+fixed_sub(fixed32 *result, const fixed32 *a, const fixed32 *b)
+{
+ result->value = a->value - b->value;
+}
+
+static void
+fixed_max_frac(fixed32 *result, const fixed32 *a, const fixed32frac *b)
+{
+ result->value = max_u64(a->value, b->value);
+}
+
+static void
+fixed_div_uint(fixed32 *result, const fixed32 *a, uint32_t b)
+{
+ result->value = a->value / b;
+}
+
+/* Set default costs to bootstrap the iterative optimization algorithm. */
+static void
+lzx_set_default_costs(struct liblzx_compressor *c)
+{
+ unsigned i;
+ uint32_t num_literals = 0;
+ uint32_t num_used_literals = 0;
+ fixed32frac inv_num_matches;
+ fixed32frac half_inv_num_items;
+ fixed32frac half_inv_6870;
+ fixed32 prob_match;
+ fixed32frac frac_15_100;
+ uint32_t match_cost;
+ fixed32frac half_base_literal_prob;
+ fixed32 temp_fixed;
+
+ /* Guard against division by zero when the block contains no matches. */
+ fixed_rcp_approx(&inv_num_matches,
+ max_u32(c->freqs.main[LZX_NUM_CHARS], 1));
+ fixed_rcp_approx(&half_inv_6870, 6870 * 2);
+ fixed_set(&prob_match, 1);
+ fixed_set_fraction(&frac_15_100, 15, 100);
+
+ /* Some numbers here have been hardcoded to assume a bit cost of 64. */
+ STATIC_ASSERT_STMT(BIT_COST == 64);
+
+ /* Estimate the number of literals that will be used. 'num_literals' is
+ * the total number, whereas 'num_used_literals' is the number of
+ * distinct symbols. */
+ for (i = 0; i < LZX_NUM_CHARS; i++) {
+ num_literals += c->freqs.main[i];
+ num_used_literals += (c->freqs.main[i] != 0);
+ }
+
+ /* Note: all match headers were tallied as symbol 'LZX_NUM_CHARS'. We
+ * don't attempt to estimate which ones will be used. */
+
+ fixed_rcp_approx(&half_inv_num_items,
+ (num_literals + c->freqs.main[LZX_NUM_CHARS]) * 2);
+ fixed_mul_uint_frac_to_frac(&half_base_literal_prob,
+ literal_scaled_probs[num_used_literals],
+ &half_inv_6870);
+
+ /* Literal costs. We use two different methods to compute the
+ * probability of each literal and mix together their results. */
+ for (i = 0; i < LZX_NUM_CHARS; i++) {
+ uint32_t freq = c->freqs.main[i];
+ if (freq != 0) {
+ fixed32 prob;
+ fixed_mul_uint_frac(&prob, freq, &half_inv_num_items);
+ fixed_add_frac(&prob, &prob, &half_base_literal_prob);
+
+ c->costs.main[i] = lzx_cost_for_probability(&prob);
+ fixed_sub(&prob_match, &prob_match, &prob);
+ } else {
+ c->costs.main[i] = 11 * BIT_COST;
+ }
+ }
+
+ /* Match header costs. We just assume that all match headers are
+ * equally probable, but we do take into account the relative cost of a
+ * match header vs. a literal depending on how common matches are
+ * expected to be vs. literals. */
+ fixed_max_frac(&prob_match, &prob_match, &frac_15_100);
+ fixed_div_uint(&temp_fixed, &prob_match,
+ (c->num_main_syms - LZX_NUM_CHARS));
+ match_cost = lzx_cost_for_probability(&temp_fixed);
+ for (; i < c->num_main_syms; i++)
+ c->costs.main[i] = match_cost;
+
+ /* Length symbol costs. These are just set to fixed values which
+ * reflect the fact that the smallest lengths are typically the most
+ * common, and therefore are typically the cheapest. */
+ for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++)
+ c->costs.len[i] = lzx_default_len_costs[i];
+
+#if CONSIDER_ALIGNED_COSTS
+ /* Aligned offset symbol costs.
These are derived from the estimated + * probability of each aligned offset symbol. */ + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + /* We intentionally tallied the frequencies in the wrong slots, + * not accounting for LZX_OFFSET_ADJUSTMENT, since doing the + * fixup here is faster: a constant 8 subtractions here vs. one + * addition for every match. */ + unsigned j = (i - LZX_OFFSET_ADJUSTMENT) & LZX_ALIGNED_OFFSET_BITMASK; + if (c->freqs.aligned[j] != 0) { + fixed32 prob; + fixed_mul_uint_frac(&prob, c->freqs.aligned[j], + &inv_num_matches); + c->costs.aligned[i] = lzx_cost_for_probability(&prob); + } else { + c->costs.aligned[i] = + (2 * LZX_NUM_ALIGNED_OFFSET_BITS) * BIT_COST; + } + } +#endif +} + +/* Update the current cost model to reflect the computed Huffman codes. */ +static void +lzx_set_costs_from_codes(struct liblzx_compressor *c) +{ + unsigned i; + const struct lzx_lens *lens = &c->codes[c->codes_index].lens; + + for (i = 0; i < c->num_main_syms; i++) { + c->costs.main[i] = (lens->main[i] ? lens->main[i] : + MAIN_CODEWORD_LIMIT) * BIT_COST; + } + + for (i = 0; i < LZX_LENCODE_NUM_SYMBOLS; i++) { + c->costs.len[i] = (lens->len[i] ? lens->len[i] : + LENGTH_CODEWORD_LIMIT) * BIT_COST; + } + +#if CONSIDER_ALIGNED_COSTS + for (i = 0; i < LZX_ALIGNEDCODE_NUM_SYMBOLS; i++) { + c->costs.aligned[i] = (lens->aligned[i] ? lens->aligned[i] : + ALIGNED_CODEWORD_LIMIT) * BIT_COST; + } +#endif +} + +/* + * Choose a "near-optimal" literal/match sequence to use for the current block, + * then flush the block. Because the cost of each Huffman symbol is unknown + * until the Huffman codes have been built and the Huffman codes themselves + * depend on the symbol frequencies, this uses an iterative optimization + * algorithm to approximate an optimal solution. The first optimization pass + * for the block uses default costs; additional passes use costs derived from + * the Huffman codes computed in the previous pass. + */ +static attrib_forceinline struct lzx_lru_queue +lzx_optimize_and_flush_block(struct liblzx_compressor * const restrict c, + struct lzx_output_bitstream * const restrict os, + const uint8_t * const restrict block_begin, + const uint32_t block_size, + const struct lzx_lru_queue initial_queue, + bool is_16_bit) +{ + unsigned num_passes_remaining = c->num_optim_passes; + struct lzx_lru_queue new_queue; + uint32_t seq_idx; + + lzx_set_default_costs(c); + + for (;;) { + lzx_compute_match_costs(c); + new_queue = lzx_find_min_cost_path(c, block_begin, block_size, + initial_queue, is_16_bit); + + if (--num_passes_remaining == 0) + break; + + /* At least one optimization pass remains. Update the costs. */ + lzx_reset_symbol_frequencies(c); + lzx_tally_item_list(c, block_size, is_16_bit); + lzx_build_huffman_codes(c); + lzx_set_costs_from_codes(c); + } + + /* Done optimizing. Generate the sequence list and flush the block. */ + lzx_reset_symbol_frequencies(c); + seq_idx = lzx_record_item_list(c, block_size, is_16_bit); + lzx_flush_block(c, os, block_begin, block_size, seq_idx); + return new_queue; +} + +/* + * This is the "near-optimal" LZX compressor. + * + * For each block, it performs a relatively thorough graph search to find an + * inexpensive (in terms of compressed size) way to output the block. + * + * Note: there are actually many things this algorithm leaves on the table in + * terms of compression ratio. So although it may be "near-optimal", it is + * certainly not "optimal". 
The goal is not to produce the optimal compression
+ * ratio, which for LZX is probably impossible within any practical amount of
+ * time, but rather to produce a compression ratio significantly better than a
+ * simpler "greedy" or "lazy" parse while still being relatively fast.
+ */
+static attrib_forceinline void
+lzx_reset_near_optimal(struct liblzx_compressor *c, bool is_16_bit)
+{
+ /* Initialize the matchfinder. */
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
+}
+
+static void
+lzx_reset_near_optimal_16(struct liblzx_compressor *c)
+{
+ lzx_reset_near_optimal(c, true);
+}
+
+static void
+lzx_reset_near_optimal_32(struct liblzx_compressor *c)
+{
+ lzx_reset_near_optimal(c, false);
+}
+
+static attrib_forceinline void
+lzx_compress_near_optimal(struct liblzx_compressor * restrict c,
+ const uint8_t *restrict in_begin,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream * restrict os,
+ bool is_16_bit)
+{
+ uint32_t max_offset = c->window_size;
+ const uint8_t * in_next = in_begin;
+ const uint8_t * const in_chunk_end = in_begin + in_nchunk;
+ const uint8_t * const in_data_end = in_begin + in_ndata;
+ uint32_t max_find_len = LZX_MAX_MATCH_LEN;
+ uint32_t max_produce_len = LZX_MAX_MATCH_LEN;
+ uint32_t nice_len = min_u32(c->nice_match_length, max_find_len);
+ uint32_t next_hashes[2] = {0, 0};
+ struct lzx_lru_queue queue;
+
+ if (max_offset >= LZX_MAX_WINDOW_SIZE) {
+ /* Slightly shrink window to avoid offset values that are
+ * greater than 21 bits. */
+ max_offset = LZX_MAX_WINDOW_SIZE - 1 - LZX_OFFSET_ADJUSTMENT;
+ }
+
+ in_begin -= c->in_prefix_size;
+
+ if (c->variant == LIBLZX_VARIANT_WIM) {
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_init);
+ }
+
+ /* Load the LRU queue. This must happen for the WIM variant too, so
+ * that 'queue' is always initialized before its first use below. */
+ lzx_lru_queue_load(&queue, c->lru_queue);
+
+ do {
+ /* Starting a new block */
+
+ const uint8_t * const in_block_begin = in_next;
+ const uint8_t * const in_max_block_end =
+ in_next + min_size(SOFT_MAX_BLOCK_SIZE, in_chunk_end - in_next);
+ struct lz_match *cache_ptr = c->match_cache;
+ const uint8_t *next_search_pos = in_next;
+ const uint8_t *next_observation = in_next;
+ const uint8_t *next_pause_point =
+ min_constptr(in_next + min_size(MIN_BLOCK_SIZE,
+ in_max_block_end - in_next),
+ in_max_block_end - min_size(LZX_MAX_MATCH_LEN - 1,
+ in_max_block_end - in_next));
+
+ lzx_init_block_split_stats(&c->split_stats);
+ lzx_reset_symbol_frequencies(c);
+
+ if (in_next >= next_pause_point)
+ goto pause;
+
+ /*
+ * Run the input buffer through the matchfinder, caching the
+ * matches, until we decide to end the block.
+ *
+ * For a tighter matchfinding loop, we compute a "pause point",
+ * which is the next position at which we may need to check
+ * whether to end the block or to decrease max_len. We then
+ * only do these extra checks upon reaching the pause point.
+ */
+ resume_matchfinding:
+ do {
+ size_t min_match_pos = in_next - in_begin;
+ min_match_pos -= min_size(min_match_pos, max_offset);
+
+ if (in_next >= next_search_pos &&
+ likely(nice_len >= LZX_MIN_MATCH_LEN)) {
+ /* Search for matches at this position.
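+ * Any matches found are written into the match cache
+ * starting at cache_ptr + 1; the number of matches is
+ * then stored at cache_ptr itself, so that
+ * lzx_find_min_cost_path() can replay the cached
+ * matches later without re-running the matchfinder.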
*/ + struct lz_match *lz_matchptr; + uint32_t best_len; + + lz_matchptr = CALL_BT_MF(is_16_bit, c, + bt_matchfinder_get_matches, + in_begin, + min_match_pos, + in_next - in_begin, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth, + next_hashes, + &best_len, + cache_ptr + 1); + cache_ptr->length = lz_matchptr - (cache_ptr + 1); + cache_ptr = lz_matchptr; + + /* Accumulate literal/match statistics for block + * splitting and for generating the initial cost + * model. */ + if (in_next >= next_observation) { + best_len = cache_ptr[-1].length; + if (best_len >= 3) { + /* Match (len >= 3) */ + + /* + * Note: for performance reasons this has + * been simplified significantly: + * + * - We wait until later to account for + * LZX_OFFSET_ADJUSTMENT. + * - We don't account for repeat offsets. + * - We don't account for different match headers. + */ + c->freqs.aligned[cache_ptr[-1].offset & + LZX_ALIGNED_OFFSET_BITMASK]++; + c->freqs.main[LZX_NUM_CHARS]++; + + lzx_observe_match(&c->split_stats, best_len); + next_observation = in_next + best_len; + } else { + /* Literal */ + c->freqs.main[*in_next]++; + lzx_observe_literal(&c->split_stats, *in_next); + next_observation = in_next + 1; + } + } + + /* + * If there was a very long match found, then + * don't cache any matches for the bytes covered + * by that match. This avoids degenerate + * behavior when compressing highly redundant + * data, where the number of matches can be very + * large. + * + * This heuristic doesn't actually hurt the + * compression ratio *too* much. If there's a + * long match, then the data must be highly + * compressible, so it doesn't matter as much + * what we do. + */ + if (best_len >= nice_len) + next_search_pos = in_next + best_len; + } else { + /* Don't search for matches at this position. */ + CALL_BT_MF(is_16_bit, c, + bt_matchfinder_skip_byte, + in_begin, + min_match_pos, + in_next - in_begin, + nice_len, + c->max_search_depth, + next_hashes); + cache_ptr->length = 0; + cache_ptr++; + } + } while (++in_next < next_pause_point && + likely(cache_ptr < &c->match_cache[CACHE_LENGTH])); + + pause: + + /* Adjust max_len and nice_len if we're nearing the end of the + * input buffer. In addition, if we are so close to the end of + * the input buffer that there cannot be any more matches, then + * just advance through the last few positions and record no + * matches. */ + if (unlikely(max_produce_len > in_data_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = min_u32(max_produce_len, nice_len); + if (max_find_len < BT_MATCHFINDER_REQUIRED_NBYTES) { + while (in_next != in_chunk_end) { + cache_ptr->length = 0; + cache_ptr++; + in_next++; + } + } + } + + /* End the block if the match cache may overflow. */ + if (unlikely(cache_ptr >= &c->match_cache[CACHE_LENGTH])) + goto end_block; + + /* End the block if the soft maximum size has been reached. */ + if (in_next >= in_max_block_end) + goto end_block; + + /* End the block if the block splitting algorithm thinks this is + * a good place to do so. */ + if (c->split_stats.num_new_observations >= + NUM_OBSERVATIONS_PER_BLOCK_CHECK && + in_max_block_end - in_next >= MIN_BLOCK_SIZE && + lzx_should_end_block(&c->split_stats)) + goto end_block; + + /* It's not time to end the block yet. Compute the next pause + * point and resume matchfinding. 
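+ * The pause point is either the position where enough new
+ * observations may have accumulated to justify another
+ * block-split check, or the start of the last
+ * LZX_MAX_MATCH_LEN - 1 bytes of the block, whichever comes
+ * first.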
*/
+ next_pause_point =
+ min_constptr(in_next + min_size(NUM_OBSERVATIONS_PER_BLOCK_CHECK * 2 -
+ c->split_stats.num_new_observations,
+ in_max_block_end - in_next),
+ in_max_block_end - min_size(LZX_MAX_MATCH_LEN - 1,
+ in_max_block_end - in_next));
+ goto resume_matchfinding;
+
+ end_block:
+ /* We've decided on a block boundary and cached matches. Now
+ * choose a match/literal sequence and flush the block. */
+ queue = lzx_optimize_and_flush_block(c, os, in_block_begin,
+ in_next - in_block_begin,
+ queue, is_16_bit);
+ } while (in_next != in_chunk_end);
+
+ /* Save the LRU queue. (The next hashes are recomputed from scratch on
+ * the next call.) */
+ lzx_lru_queue_save(c->lru_queue, &queue);
+}
+
+static void
+lzx_compress_near_optimal_16(struct liblzx_compressor *c, const uint8_t *in,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream *os)
+{
+ lzx_compress_near_optimal(c, in, in_nchunk, in_ndata, os, true);
+}
+
+static void
+lzx_compress_near_optimal_32(struct liblzx_compressor *c, const uint8_t *in,
+ size_t in_nchunk, size_t in_ndata,
+ struct lzx_output_bitstream *os)
+{
+ lzx_compress_near_optimal(c, in, in_nchunk, in_ndata, os, false);
+}
+
+static attrib_forceinline void
+lzx_cull_near_optimal(struct liblzx_compressor *c, size_t nbytes, const bool is_16_bit)
+{
+ CALL_BT_MF(is_16_bit, c, bt_matchfinder_cull, nbytes, c->window_size);
+}
+
+static void
+lzx_cull_near_optimal_16(struct liblzx_compressor *c, size_t nbytes)
+{
+ lzx_cull_near_optimal(c, nbytes, true);
+}
+
+static void
+lzx_cull_near_optimal_32(struct liblzx_compressor *c, size_t nbytes)
+{
+ lzx_cull_near_optimal(c, nbytes, false);
+}
+
+/******************************************************************************/
+/* Faster ("lazy") compression algorithm */
+/*----------------------------------------------------------------------------*/
+
+/*
+ * Called when the compressor chooses to use a literal. This tallies the
+ * Huffman symbol for the literal, increments the current literal run length,
+ * and "observes" the literal for the block split statistics.
+ */
+static attrib_forceinline void
+lzx_choose_literal(struct liblzx_compressor *c, unsigned literal, uint32_t *litrunlen_p)
+{
+ lzx_observe_literal(&c->split_stats, literal);
+ c->freqs.main[literal]++;
+ ++*litrunlen_p;
+}
+
+/*
+ * Called when the compressor chooses to use a match. This tallies the Huffman
+ * symbol(s) for a match, saves the match data and the length of the preceding
+ * literal run, updates the recent offsets queue, and "observes" the match for
+ * the block split statistics.
+ */
+static attrib_forceinline void
+lzx_choose_match(struct liblzx_compressor *c, unsigned length, uint32_t adjusted_offset,
+ uint32_t recent_offsets[LZX_NUM_RECENT_OFFSETS], bool is_16_bit,
+ uint32_t *litrunlen_p, struct lzx_sequence **next_seq_p)
+{
+ struct lzx_sequence *next_seq = *next_seq_p;
+ unsigned mainsym;
+
+ lzx_observe_match(&c->split_stats, length);
+
+ mainsym = lzx_tally_main_and_lensyms(c, length, adjusted_offset,
+ is_16_bit);
+ next_seq->litrunlen_and_matchlen =
+ (*litrunlen_p << SEQ_MATCHLEN_BITS) | length;
+ next_seq->adjusted_offset_and_mainsym =
+ (adjusted_offset << SEQ_MAINSYM_BITS) | mainsym;
+
+ /* Update the recent offsets queue. */
+ if (adjusted_offset < LZX_NUM_RECENT_OFFSETS) {
+ /* Repeat offset match. */
+ uint32_t temp = recent_offsets[adjusted_offset];
+ recent_offsets[adjusted_offset] = recent_offsets[0];
+ recent_offsets[0] = temp;
+ } else {
+ /* Explicit offset match. */
+
+ /* Tally the aligned offset symbol if needed.
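+ * Only adjusted offsets of at least LZX_MIN_ALIGNED_OFFSET +
+ * LZX_OFFSET_ADJUSTMENT (i.e. 16) carry one; the symbol is the low
+ * LZX_NUM_ALIGNED_OFFSET_BITS bits of the adjusted offset, so e.g.
+ * an adjusted offset of 100 tallies freqs.aligned[100 & 7], i.e.
+ * freqs.aligned[4].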
*/
+ if (adjusted_offset >= LZX_MIN_ALIGNED_OFFSET + LZX_OFFSET_ADJUSTMENT)
+ c->freqs.aligned[adjusted_offset & LZX_ALIGNED_OFFSET_BITMASK]++;
+
+ recent_offsets[2] = recent_offsets[1];
+ recent_offsets[1] = recent_offsets[0];
+ recent_offsets[0] = adjusted_offset - LZX_OFFSET_ADJUSTMENT;
+ }
+
+ /* Reset the literal run length and advance to the next sequence. */
+ *next_seq_p = next_seq + 1;
+ *litrunlen_p = 0;
+}
+
+/*
+ * Called when the compressor ends a block. This finishes the last
+ * lzx_sequence, which is just a literal run with no following match. This
+ * literal run might be empty.
+ */
+static attrib_forceinline void
+lzx_finish_sequence(struct lzx_sequence *last_seq, uint32_t litrunlen)
+{
+ last_seq->litrunlen_and_matchlen = litrunlen << SEQ_MATCHLEN_BITS;
+}
+
+/*
+ * Find the longest repeat offset match at the current position. If a match
+ * is found, return its length and set *best_rep_idx_ret to the index of its
+ * offset in @recent_offsets. Otherwise, return 0.
+ *
+ * Don't bother with length 2 matches; consider matches of length >= 3 only.
+ * Also assume that max_len >= 3.
+ */
+static unsigned
+lzx_find_longest_repeat_offset_match(const uint8_t * const in_next,
+ const uint32_t recent_offsets[],
+ const unsigned max_len,
+ unsigned *best_rep_idx_ret)
+{
+ STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); /* loop is unrolled */
+
+ const uint32_t seq3 = load_u24_unaligned(in_next);
+ const uint8_t *matchptr;
+ unsigned best_rep_len = 0;
+ unsigned best_rep_idx = 0;
+ unsigned rep_len;
+
+ /* Check for rep0 match (most recent offset) */
+ matchptr = in_next - recent_offsets[0];
+ if (load_u24_unaligned(matchptr) == seq3)
+ best_rep_len = lz_extend(in_next, matchptr, 3, max_len);
+
+ /* Check for rep1 match (second most recent offset) */
+ matchptr = in_next - recent_offsets[1];
+ if (load_u24_unaligned(matchptr) == seq3) {
+ rep_len = lz_extend(in_next, matchptr, 3, max_len);
+ if (rep_len > best_rep_len) {
+ best_rep_len = rep_len;
+ best_rep_idx = 1;
+ }
+ }
+
+ /* Check for rep2 match (third most recent offset) */
+ matchptr = in_next - recent_offsets[2];
+ if (load_u24_unaligned(matchptr) == seq3) {
+ rep_len = lz_extend(in_next, matchptr, 3, max_len);
+ if (rep_len > best_rep_len) {
+ best_rep_len = rep_len;
+ best_rep_idx = 2;
+ }
+ }
+
+ *best_rep_idx_ret = best_rep_idx;
+ return best_rep_len;
+}
+
+/*
+ * Fast heuristic scoring for lazy parsing: how "good" is this match?
+ * This is mainly determined by the length: longer matches are better.
+ * However, we also give a bonus to close (small offset) matches and to repeat
+ * offset matches, since those require fewer bits to encode.
+ */
+
+static attrib_forceinline unsigned
+lzx_explicit_offset_match_score(unsigned len, uint32_t adjusted_offset)
+{
+ unsigned score = len;
+
+ if (adjusted_offset < 4096)
+ score++;
+ if (adjusted_offset < 256)
+ score++;
+
+ return score;
+}
+
+static attrib_forceinline unsigned
+lzx_repeat_offset_match_score(unsigned rep_len, unsigned rep_idx)
+{
+ return rep_len + 3;
+}
+
+/*
+ * This is the "lazy" LZX compressor. The basic idea is that before it chooses
+ * a match, it checks to see if there's a longer match at the next position. If
+ * yes, it chooses a literal and continues to the next position. If no, it
+ * chooses the match.
+ *
+ * Some additional heuristics are used as well. Repeat offset matches are
+ * considered favorably and sometimes are chosen immediately. In addition, long
+ * matches (at least "nice_len" bytes) are chosen immediately as well.
Finally, + * when we decide whether a match is "better" than another, we take the offset + * into consideration as well as the length. + */ +static attrib_forceinline void +lzx_reset_lazy(struct liblzx_compressor *c, bool is_16_bit) +{ + bool streaming = (c->variant != LIBLZX_VARIANT_WIM); + + /* Initialize the matchfinder. */ + CALL_HC_MF(is_16_bit, c, hc_matchfinder_init, c->window_size, + streaming); +} + +static void +lzx_reset_lazy_16(struct liblzx_compressor *c) +{ + lzx_reset_lazy(c, true); +} + +static void +lzx_reset_lazy_32(struct liblzx_compressor *c) +{ + lzx_reset_lazy(c, false); +} + +static attrib_forceinline void +lzx_compress_lazy(struct liblzx_compressor * restrict c, + const uint8_t * restrict in_begin, size_t in_nchunk, + size_t in_ndata, struct lzx_output_bitstream * restrict os, + bool is_16_bit) +{ + uint32_t max_offset = c->window_size; + const uint8_t * in_next = in_begin; + const uint8_t * const in_chunk_end = in_begin + in_nchunk; + const uint8_t *const in_data_end = in_begin + in_ndata; + unsigned max_find_len = LZX_MAX_MATCH_LEN; + unsigned max_produce_len = LZX_MAX_MATCH_LEN; + unsigned nice_len = min_uint(c->nice_match_length, max_find_len); + STATIC_ASSERT(LZX_NUM_RECENT_OFFSETS == 3); + uint32_t recent_offsets[LZX_NUM_RECENT_OFFSETS]; + uint32_t next_hashes[2]; + + if (max_offset >= LZX_MAX_WINDOW_SIZE) { + /* Slightly shrink window to avoid offset values that are + * greater than 21 bits. */ + max_offset = LZX_MAX_WINDOW_SIZE - 1 - LZX_OFFSET_ADJUSTMENT; + } + + in_begin -= c->in_prefix_size; + + /* Load the LRU queue and next hashes. */ + { + int i; + for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + recent_offsets[i] = c->lru_queue[i]; + } + + next_hashes[0] = c->next_hashes[0]; + next_hashes[1] = c->next_hashes[1]; + } + + do { + /* Starting a new block */ + + const uint8_t * const in_block_begin = in_next; + const uint8_t * const in_max_block_end = + in_next + min_size(SOFT_MAX_BLOCK_SIZE, in_chunk_end - in_next); + struct lzx_sequence *next_seq = c->chosen_sequences; + uint32_t litrunlen = 0; + unsigned cur_len; + uint32_t cur_offset; + uint32_t cur_adjusted_offset; + unsigned cur_score; + unsigned next_len; + uint32_t next_offset; + uint32_t next_adjusted_offset; + unsigned next_score; + unsigned best_rep_len; + unsigned best_rep_idx; + unsigned rep_score; + unsigned skip_len; + + lzx_reset_symbol_frequencies(c); + lzx_init_block_split_stats(&c->split_stats); + + do { + /* Adjust max_len and nice_len if we're nearing the end + * of the input buffer. */ + if (unlikely(max_produce_len > + in_chunk_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = + min_uint(max_produce_len, nice_len); + } + + /* Find the longest match (subject to the + * max_search_depth cutoff parameter) with the current + * position. Don't bother with length 2 matches; only + * look for matches of length >= 3. */ + { + size_t min_match_pos = in_next - in_begin; + min_match_pos -= + min_size(min_match_pos, max_offset); + + cur_len = CALL_HC_MF(is_16_bit, c, + hc_matchfinder_longest_match, + in_begin, + min_match_pos, + in_next, + 2, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth, + next_hashes, + &cur_offset); + } + + /* If there was no match found, or the only match found + * was a distant short match, then choose a literal. 
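+ * Here a "distant short" match means a length-3 match at a large
+ * offset (roughly 8192 or more) that is not one of the three recent
+ * offsets; such a match would likely cost more to encode than the
+ * three literals it replaces.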
*/ + if (cur_len < 3 || + (cur_len == 3 && + cur_offset >= 8192 - LZX_OFFSET_ADJUSTMENT && + cur_offset != recent_offsets[0] && + cur_offset != recent_offsets[1] && + cur_offset != recent_offsets[2])) + { + lzx_choose_literal(c, *in_next, &litrunlen); + in_next++; + continue; + } + + /* Heuristic: if this match has the most recent offset, + * then go ahead and choose it as a rep0 match. */ + if (cur_offset == recent_offsets[0]) { + in_next++; + skip_len = cur_len - 1; + cur_adjusted_offset = 0; + goto choose_cur_match; + } + + /* Compute the longest match's score as an explicit + * offset match. */ + cur_adjusted_offset = cur_offset + LZX_OFFSET_ADJUSTMENT; + cur_score = lzx_explicit_offset_match_score(cur_len, cur_adjusted_offset); + + /* Find the longest repeat offset match at this + * position. If we find one and it's "better" than the + * explicit offset match we found, then go ahead and + * choose the repeat offset match immediately. */ + best_rep_len = lzx_find_longest_repeat_offset_match(in_next, + recent_offsets, + max_produce_len, + &best_rep_idx); + in_next++; + + if (best_rep_len != 0 && + (rep_score = lzx_repeat_offset_match_score(best_rep_len, + best_rep_idx)) >= cur_score) + { + cur_len = best_rep_len; + cur_adjusted_offset = best_rep_idx; + skip_len = best_rep_len - 1; + goto choose_cur_match; + } + + have_cur_match: + /* + * We have a match at the current position. If the + * match is very long, then choose it immediately. + * Otherwise, see if there's a better match at the next + * position. + */ + + if (cur_len >= nice_len) { + skip_len = cur_len - 1; + goto choose_cur_match; + } + + if (unlikely(max_produce_len > + in_chunk_end - in_next)) { + max_produce_len = in_chunk_end - in_next; + max_find_len = in_data_end - in_next; + nice_len = + min_uint(max_produce_len, nice_len); + } + + { + size_t min_match_pos = in_next - in_begin; + min_match_pos -= + min_uint(min_match_pos, max_offset); + + next_len = CALL_HC_MF( + is_16_bit, c, + hc_matchfinder_longest_match, + in_begin, + min_match_pos, + in_next, + cur_len - 2, + max_find_len, + max_produce_len, + nice_len, + c->max_search_depth / 2, + next_hashes, + &next_offset); + } + + if (next_len <= cur_len - 2) { + /* No potentially better match was found. */ + in_next++; + skip_len = cur_len - 2; + goto choose_cur_match; + } + + next_adjusted_offset = next_offset + LZX_OFFSET_ADJUSTMENT; + next_score = lzx_explicit_offset_match_score(next_len, next_adjusted_offset); + + best_rep_len = lzx_find_longest_repeat_offset_match(in_next, + recent_offsets, + max_produce_len, + &best_rep_idx); + in_next++; + + if (best_rep_len != 0 && + (rep_score = lzx_repeat_offset_match_score(best_rep_len, + best_rep_idx)) >= next_score) + { + + if (rep_score > cur_score) { + /* The next match is better, and it's a + * repeat offset match. */ + lzx_choose_literal(c, *(in_next - 2), + &litrunlen); + cur_len = best_rep_len; + cur_adjusted_offset = best_rep_idx; + skip_len = cur_len - 1; + goto choose_cur_match; + } + } else { + if (next_score > cur_score) { + /* The next match is better, and it's an + * explicit offset match. */ + lzx_choose_literal(c, *(in_next - 2), + &litrunlen); + cur_len = next_len; + cur_adjusted_offset = next_adjusted_offset; + cur_score = next_score; + goto have_cur_match; + } + } + + /* The original match was better; choose it. */ + skip_len = cur_len - 2; + + choose_cur_match: + /* Choose a match and have the matchfinder skip over its + * remaining bytes. 
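+ * At this point in_next has already been advanced past the
+ * first one or two bytes of the match, so skip_len is
+ * cur_len - 1 or cur_len - 2 respectively.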
*/ + lzx_choose_match(c, cur_len, cur_adjusted_offset, + recent_offsets, is_16_bit, + &litrunlen, &next_seq); + + CALL_HC_MF(is_16_bit, c, + hc_matchfinder_skip_bytes, + in_begin, + in_next, + in_chunk_end, + skip_len, + next_hashes); + in_next += skip_len; + + /* Keep going until it's time to end the block. */ + } while (in_next < in_max_block_end && + !(c->split_stats.num_new_observations >= + NUM_OBSERVATIONS_PER_BLOCK_CHECK && + in_next - in_block_begin >= MIN_BLOCK_SIZE && + in_chunk_end - in_next >= MIN_BLOCK_SIZE && + lzx_should_end_block(&c->split_stats))); + + /* Flush the block. */ + lzx_finish_sequence(next_seq, litrunlen); + lzx_flush_block(c, os, in_block_begin, in_next - in_block_begin, 0); + + /* Keep going until we've reached the end of the input buffer. */ + } while (in_next != in_chunk_end); + + /* Save the LRU queue and next hashes */ + { + int i; + for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) { + c->lru_queue[i] = recent_offsets[i]; + } + c->next_hashes[0] = next_hashes[0]; + c->next_hashes[1] = next_hashes[1]; + } +} + +static void +lzx_compress_lazy_16(struct liblzx_compressor *c, const uint8_t *in, + size_t in_nchunk, size_t in_navail, + struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, in, in_nchunk, in_navail, os, true); +} + +static void +lzx_compress_lazy_32(struct liblzx_compressor *c, const uint8_t *in, + size_t in_nchunk, size_t in_navail, + struct lzx_output_bitstream *os) +{ + lzx_compress_lazy(c, in, in_nchunk, in_navail, os, false); +} + +static void +lzx_cull_lazy_16(struct liblzx_compressor *c, size_t nbytes) +{ + CALL_HC_MF(true, c, hc_matchfinder_cull, nbytes, c->window_size); +} + +static void +lzx_cull_lazy_32(struct liblzx_compressor *c, size_t nbytes) +{ + CALL_HC_MF(false, c, hc_matchfinder_cull, nbytes, c->window_size); +} + +/******************************************************************************/ +/* Compressor operations */ +/*----------------------------------------------------------------------------*/ + +/* + * Generate tables for mapping match offsets (actually, "adjusted" match + * offsets) to offset slots. 
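+ *
+ * A lookup can then be sketched as follows (the exact helper used by the
+ * callers may differ, but the two tables are consulted like this):
+ *
+ *   if (adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1))
+ *           slot = c->offset_slot_tab_1[adjusted_offset];
+ *   else
+ *           slot = c->offset_slot_tab_2[adjusted_offset >> 14];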
+ */
+static void
+lzx_init_offset_slot_tabs(struct liblzx_compressor *c)
+{
+ uint32_t adjusted_offset = 0;
+ unsigned slot = 0;
+
+ /* slots [0, 29] */
+ for (; adjusted_offset < ARRAY_LEN(c->offset_slot_tab_1);
+ adjusted_offset++)
+ {
+ if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+ LZX_OFFSET_ADJUSTMENT)
+ slot++;
+ c->offset_slot_tab_1[adjusted_offset] = slot;
+ }
+
+ /* slots [30, 49] */
+ for (; adjusted_offset < LZX_MAX_WINDOW_SIZE;
+ adjusted_offset += (uint32_t)1 << 14)
+ {
+ if (adjusted_offset >= lzx_offset_slot_base[slot + 1] +
+ LZX_OFFSET_ADJUSTMENT)
+ slot++;
+ c->offset_slot_tab_2[adjusted_offset >> 14] = slot;
+ }
+}
+
+static size_t
+lzx_bt_max_search_depth(unsigned compression_level)
+{
+ return (24 * compression_level) / 50;
+}
+
+static size_t
+lzx_get_compressor_size(size_t window_size, unsigned compression_level,
+ bool streaming)
+{
+ if (compression_level <= MAX_FAST_LEVEL) {
+ if (lzx_is_16_bit(window_size))
+ return offsetof(struct liblzx_compressor, hc_mf_16) +
+ hc_matchfinder_size_16(window_size, streaming);
+ else
+ return offsetof(struct liblzx_compressor, hc_mf_32) +
+ hc_matchfinder_size_32(window_size, streaming);
+ } else {
+ if (lzx_is_16_bit(window_size))
+ return offsetof(struct liblzx_compressor, bt_mf_16) +
+ bt_matchfinder_size_16(window_size, streaming);
+ else
+ return offsetof(struct liblzx_compressor, bt_mf_32) +
+ bt_matchfinder_size_32(window_size, streaming);
+ }
+}
+
+/* Reset the compressor to its initial state. */
+static void
+lzx_reset(struct liblzx_compressor *c)
+{
+ /* Initially, the previous Huffman codeword lengths are all zeroes. */
+ c->codes_index = 0;
+ memset(&c->codes[1].lens, 0, sizeof(struct lzx_lens));
+
+ /* Reset the E8 preprocessor offset */
+ c->e8_chunk_offset = 0;
+
+ /* Reset the streaming prefix */
+ c->in_prefix_size = 0;
+
+ /* Reset the LRU queue */
+ {
+ int i;
+ for (i = 0; i < LZX_NUM_RECENT_OFFSETS; i++) {
+ c->lru_queue[i] = 1;
+ }
+ }
+
+ /* Reset next hashes */
+ c->next_hashes[0] = 0;
+ c->next_hashes[1] = 0;
+
+ c->reset(c);
+}
+
+/* Allocate an LZX compressor. */
+liblzx_compressor_t *
+liblzx_compress_create(const struct liblzx_compress_properties *props)
+{
+ unsigned window_order;
+ struct liblzx_compressor *c;
+ bool streaming = (props->lzx_variant != LIBLZX_VARIANT_WIM);
+
+ /* Validate the maximum buffer size and get the window order from it. */
+ window_order = lzx_get_window_order(props->window_size);
+ if (window_order == 0)
+ return NULL;
+
+ /* Allocate the compressor. */
+ c = props->alloc_func(props->userdata,
+ lzx_get_compressor_size(props->window_size, props->compression_level, streaming));
+ if (!c)
+ goto oom0;
+
+ c->alloc_func = props->alloc_func;
+ c->free_func = props->free_func;
+ c->alloc_userdata = props->userdata;
+ c->window_size = props->window_size;
+ c->window_order = window_order;
+ c->num_main_syms = lzx_get_num_main_syms(window_order);
+ c->variant = props->lzx_variant;
+ c->first_block = true;
+ c->out_chunk.data = NULL;
+ c->out_chunk.size = 0;
+ c->flushing = false;
+ c->e8_chunk_offset = 0;
+ c->e8_file_size = props->e8_file_size;
+ c->in_buffer_capacity = c->window_size;
+ c->in_prefix_size = 0;
+ c->in_used = 0;
+ c->chunk_size = props->chunk_granularity;
+
+ /* Allocate the buffer for preprocessed data if needed.
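+ * In streaming mode the capacity is doubled so that a full window of
+ * history can be kept in front of the incoming data, plus
+ * LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE bytes of
+ * matchfinding/filter lookahead, as allocated below.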
*/ + if (streaming) { + /* Pad out to include past blocks and extra + * matchfinding space */ + c->in_buffer_capacity *= 2; + c->in_buffer_capacity += + LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE; + } + + if (c->variant == LIBLZX_VARIANT_WIM) + c->e8_file_size = LZX_WIM_MAGIC_FILESIZE; + + c->in_buffer = + props->alloc_func(props->userdata, c->in_buffer_capacity); + + if (!c->in_buffer) + goto oom1; + + c->out_buffer_capacity = c->chunk_size; + if (c->variant != LIBLZX_VARIANT_WIM) + c->out_buffer_capacity += 6144; + + c->out_chunk.data = c->out_buffer = + props->alloc_func(props->userdata, c->out_buffer_capacity); + + if (!c->out_buffer) + goto oom2; + + if (props->compression_level <= MAX_FAST_LEVEL) { + + /* Fast compression: Use lazy parsing. */ + if (lzx_is_16_bit(props->window_size)) { + c->reset = lzx_reset_lazy_16; + c->impl = lzx_compress_lazy_16; + c->cull = lzx_cull_lazy_16; + } else { + c->reset = lzx_reset_lazy_32; + c->impl = lzx_compress_lazy_32; + c->cull = lzx_cull_lazy_32; + } + + /* Scale max_search_depth and nice_match_length with the + * compression level. */ + c->max_search_depth = (60 * props->compression_level) / 20; + c->nice_match_length = (80 * props->compression_level) / 20; + + /* lzx_compress_lazy() needs max_search_depth >= 2 because it + * halves the max_search_depth when attempting a lazy match, and + * max_search_depth must be at least 1. */ + c->max_search_depth = max_uint(c->max_search_depth, 2); + } else { + + /* Normal / high compression: Use near-optimal parsing. */ + if (lzx_is_16_bit(c->window_size)) { + c->reset = lzx_reset_near_optimal_16; + c->impl = lzx_compress_near_optimal_16; + c->cull = lzx_cull_near_optimal_16; + } else { + c->reset = lzx_reset_near_optimal_32; + c->impl = lzx_compress_near_optimal_32; + c->cull = lzx_cull_near_optimal_32; + } + + /* Scale max_search_depth and nice_match_length with the + * compression level. */ + c->max_search_depth = lzx_bt_max_search_depth(props->compression_level); + c->nice_match_length = (48 * props->compression_level) / 50; + + /* Also scale num_optim_passes with the compression level. But + * the more passes there are, the less they help --- so don't + * add them linearly. */ + c->num_optim_passes = 1; + c->num_optim_passes += (props->compression_level >= 45); + c->num_optim_passes += (props->compression_level >= 70); + c->num_optim_passes += (props->compression_level >= 100); + c->num_optim_passes += (props->compression_level >= 150); + c->num_optim_passes += (props->compression_level >= 200); + c->num_optim_passes += (props->compression_level >= 300); + + /* max_search_depth must be at least 1. */ + c->max_search_depth = max_uint(c->max_search_depth, 1); + } + + /* Prepare the offset => offset slot mapping. */ + lzx_init_offset_slot_tabs(c); + + lzx_reset(c); + + return c; + +oom2: + props->free_func(props->userdata, c->in_buffer); +oom1: + props->free_func(props->userdata, c); +oom0: + return NULL; +} + +/* Compress a buffer of data. */ +static size_t +lzx_compress_chunk(struct liblzx_compressor *c) +{ + struct lzx_output_bitstream os; + size_t result; + bool e8_preprocess_enabled = (c->e8_chunk_offset < 0x40000000); + bool next_e8_preprocess_enabled = + (c->e8_chunk_offset + c->chunk_size < 0x40000000); + uint32_t chunk_size = min_u32(c->chunk_size, c->in_used); + uint32_t next_chunk_preprocess_size = 0; + + uint8_t *in = (uint8_t *)c->in_buffer + c->in_prefix_size; + + /* Preprocess the input data. 
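+ * E8 preprocessing treats each 0xE8 byte as an x86 CALL opcode and
+ * rewrites its 32-bit little-endian operand from a relative
+ * displacement to an absolute target, roughly: if rel >= -pos and
+ * rel < e8_file_size, store rel + pos. Repeated calls to the same
+ * target then produce identical byte patterns that LZ matching can
+ * exploit; lzx_postprocess() applies the inverse transform.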
*/ + if (e8_preprocess_enabled) { + lzx_preprocess(in, chunk_size, c->e8_chunk_offset, + c->e8_file_size); + } + + if (c->in_used > c->chunk_size && next_e8_preprocess_enabled) { + next_chunk_preprocess_size = + min_u32(LZX_MAX_MATCH_LEN + LZX_E8_FILTER_TAIL_SIZE, + c->in_used - c->chunk_size); + } + + /* Preprocess enough of the next block input data for the + matchfinder */ + if (next_chunk_preprocess_size > 0) { + lzx_preprocess(in + c->chunk_size, next_chunk_preprocess_size, + c->e8_chunk_offset + c->chunk_size, + c->e8_file_size); + } + + /* Initialize the output bitstream. */ + lzx_init_output(&os, c->out_buffer, c->out_buffer_capacity); + + /* Call the compression level-specific compress() function. */ + (*c->impl)(c, in, chunk_size, c->in_used, &os); + + /* Undo next block preprocessing */ + if (next_chunk_preprocess_size > 0) { + lzx_postprocess(in + c->chunk_size, next_chunk_preprocess_size, + c->e8_chunk_offset + c->chunk_size, + c->e8_file_size); + } + + /* Flush the output bitstream. */ + result = lzx_flush_output(&os); + + /* Update the E8 chunk offset. */ + c->e8_chunk_offset += (uint32_t)chunk_size; + + /* Update the prefix and used amounts. */ + c->in_prefix_size += (uint32_t)chunk_size; + c->in_used -= chunk_size; + + if (c->in_prefix_size >= c->window_size * 2) { + uint32_t cull_amount = (c->in_prefix_size - c->window_size); + + in = (uint8_t *)c->in_buffer + c->in_prefix_size; + + memmove(c->in_buffer, in - c->window_size, + c->in_used + c->window_size); + c->in_prefix_size = c->window_size; + + (*c->cull)(c, cull_amount); + } + + /* Return the number of compressed bytes, or 0 if the input did not + * compress to less than its original size. */ + return result; +} + +void +liblzx_compress_destroy(liblzx_compressor_t *c) +{ + c->free_func(c->alloc_userdata, c->out_buffer); + c->free_func(c->alloc_userdata, c->in_buffer); + c->free_func(c->alloc_userdata, c); +} + +size_t +liblzx_compress_add_input(liblzx_compressor_t *c, const void *in_data, + size_t in_data_size) +{ + uint32_t max_used = 0; + size_t fill_amount = 0; + + if (c->out_chunk.size > 0 || c->flushing) + return 0; + + max_used = min_uint(c->in_buffer_capacity - c->in_prefix_size, + c->chunk_size + LZX_MAX_MATCH_LEN + + LZX_E8_FILTER_TAIL_SIZE); + fill_amount = min_size(in_data_size, max_used - c->in_used); + + memcpy(((uint8_t *)c->in_buffer) + c->in_prefix_size + c->in_used, in_data, + fill_amount); + + c->in_used += fill_amount; + + if (c->in_used == max_used) { + c->out_chunk.size = lzx_compress_chunk(c); + } + + return fill_amount; +} + +const liblzx_output_chunk_t * +liblzx_compress_get_next_chunk(const liblzx_compressor_t *c) +{ + if (c->out_chunk.size > 0) + return &c->out_chunk; + else + return NULL; +} + +void +liblzx_compress_release_next_chunk(liblzx_compressor_t *c) +{ + c->out_chunk.size = 0; + if (c->flushing && c->in_used > 0) { + c->out_chunk.size = lzx_compress_chunk(c); + } +} + +void +liblzx_compress_end_input(liblzx_compressor_t *c) +{ + if (!c->flushing) { + c->flushing = true; + if (c->in_used > 0 && c->out_chunk.size == 0) { + c->out_chunk.size = lzx_compress_chunk(c); + } + } +} diff --git a/dlls/cabinet/liblzx_lzx_constants.h b/dlls/cabinet/liblzx_lzx_constants.h new file mode 100644 index 00000000000..f11ce407873 --- /dev/null +++ b/dlls/cabinet/liblzx_lzx_constants.h @@ -0,0 +1,108 @@ +/* + * lzx_constants.h + * + * Constants for the LZX compression format. + */ + +#ifndef _LZX_CONSTANTS_H +#define _LZX_CONSTANTS_H + +/* Number of literal byte values. 
*/ +#define LZX_NUM_CHARS 256 + +/* The smallest and largest allowed match lengths. */ +#define LZX_MIN_MATCH_LEN 2 +#define LZX_MAX_MATCH_LEN 257 + +/* Number of distinct match lengths that can be represented. */ +#define LZX_NUM_LENS (LZX_MAX_MATCH_LEN - LZX_MIN_MATCH_LEN + 1) + +/* Number of match lengths for which no length symbol is required. */ +#define LZX_NUM_PRIMARY_LENS 7 +#define LZX_NUM_LEN_HEADERS (LZX_NUM_PRIMARY_LENS + 1) + +/* The first length which requires a length symbol. */ +#define LZX_MIN_SECONDARY_LEN (LZX_MIN_MATCH_LEN + LZX_NUM_PRIMARY_LENS) + +/* Valid values of the 3-bit block type field. */ +#define LZX_BLOCKTYPE_VERBATIM 1 +#define LZX_BLOCKTYPE_ALIGNED 2 +#define LZX_BLOCKTYPE_UNCOMPRESSED 3 + +/* 'LZX_MIN_WINDOW_SIZE' and 'LZX_MAX_WINDOW_SIZE' are the minimum and maximum + * sizes of the sliding window. */ +#define LZX_MIN_WINDOW_ORDER 15 +#define LZX_MAX_WINDOW_ORDER 21 +#define LZX_MIN_WINDOW_SIZE (1UL << LZX_MIN_WINDOW_ORDER) /* 32768 */ +#define LZX_MAX_WINDOW_SIZE (1UL << LZX_MAX_WINDOW_ORDER) /* 2097152 */ + +/* Maximum number of offset slots. (The actual number of offset slots depends + * on the window size.) */ +#define LZX_MAX_OFFSET_SLOTS 50 + +/* Maximum number of symbols in the main code. (The actual number of symbols in + * the main code depends on the window size.) */ +#define LZX_MAINCODE_MAX_NUM_SYMBOLS \ + (LZX_NUM_CHARS + (LZX_MAX_OFFSET_SLOTS * LZX_NUM_LEN_HEADERS)) + +/* Number of symbols in the length code. */ +#define LZX_LENCODE_NUM_SYMBOLS (LZX_NUM_LENS - LZX_NUM_PRIMARY_LENS) + +/* Number of symbols in the pre-code. */ +#define LZX_PRECODE_NUM_SYMBOLS 20 + +/* Number of bits in which each pre-code codeword length is represented. */ +#define LZX_PRECODE_ELEMENT_SIZE 4 + +/* Number of low-order bits of each match offset that are entropy-encoded in + * aligned offset blocks. */ +#define LZX_NUM_ALIGNED_OFFSET_BITS 3 + +/* Number of symbols in the aligned offset code. */ +#define LZX_ALIGNEDCODE_NUM_SYMBOLS (1 << LZX_NUM_ALIGNED_OFFSET_BITS) + +/* Mask for the match offset bits that are entropy-encoded in aligned offset + * blocks. */ +#define LZX_ALIGNED_OFFSET_BITMASK ((1 << LZX_NUM_ALIGNED_OFFSET_BITS) - 1) + +/* Number of bits in which each aligned offset codeword length is represented. */ +#define LZX_ALIGNEDCODE_ELEMENT_SIZE 3 + +/* The first offset slot which requires an aligned offset symbol in aligned + * offset blocks. */ +#define LZX_MIN_ALIGNED_OFFSET_SLOT 8 + +/* The offset slot base for LZX_MIN_ALIGNED_OFFSET_SLOT. */ +#define LZX_MIN_ALIGNED_OFFSET 14 + +/* The maximum number of extra offset bits in verbatim blocks. (One would need + * to subtract LZX_NUM_ALIGNED_OFFSET_BITS to get the number of extra offset + * bits in *aligned* blocks.) */ +#define LZX_MAX_NUM_EXTRA_BITS 17 + +/* Maximum lengths (in bits) for length-limited Huffman code construction. */ +#define LZX_MAX_MAIN_CODEWORD_LEN 16 +#define LZX_MAX_LEN_CODEWORD_LEN 16 +#define LZX_MAX_PRE_CODEWORD_LEN ((1 << LZX_PRECODE_ELEMENT_SIZE) - 1) +#define LZX_MAX_ALIGNED_CODEWORD_LEN ((1 << LZX_ALIGNEDCODE_ELEMENT_SIZE) - 1) + +/* For LZX-compressed blocks in WIM resources, this value is always used as the + * filesize parameter for the call instruction (0xe8 byte) preprocessing, even + * though the blocks themselves are not this size, and the size of the actual + * file resource in the WIM file is very likely to be something entirely + * different as well. 
*/ +#define LZX_WIM_MAGIC_FILESIZE 12000000 + +/* Assumed LZX block size when the encoded block size begins with a 0 bit. + * This is probably WIM-specific. */ +#define LZX_DEFAULT_BLOCK_SIZE 32768 + +#define LZX_E8_FILTER_TAIL_SIZE 10 + +/* Number of offsets in the recent (or "repeat") offsets queue. */ +#define LZX_NUM_RECENT_OFFSETS 3 + +/* An offset of n bytes is actually encoded as (n + LZX_OFFSET_ADJUSTMENT). */ +#define LZX_OFFSET_ADJUSTMENT (LZX_NUM_RECENT_OFFSETS - 1) + +#endif /* _LZX_CONSTANTS_H */ diff --git a/dlls/cabinet/liblzx_matchfinder_common.h b/dlls/cabinet/liblzx_matchfinder_common.h new file mode 100644 index 00000000000..37f16fc3fa1 --- /dev/null +++ b/dlls/cabinet/liblzx_matchfinder_common.h @@ -0,0 +1,131 @@ +/* + * matchfinder_common.h - common code for Lempel-Ziv matchfinding + * + * Copyright (C) 2025 Eric Lasota + * Based on wimlib. Copyright 2022 Eric Biggers + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, + * copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following + * conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_MATCHFINDER_COMMON_H +#define _LIBLZX_MATCHFINDER_COMMON_H + +#include "liblzx_bitops.h" +#include "liblzx_unaligned.h" + +/* + * Given a 32-bit value that was loaded with the platform's native endianness, + * return a 32-bit value whose high-order 8 bits are 0 and whose low-order 24 + * bits contain the first 3 bytes, arranged in octets in a platform-dependent + * order, at the memory location from which the input 32-bit value was loaded. + */ +static attrib_forceinline uint32_t +loaded_u32_to_u24(uint32_t v) +{ + if (CPU_IS_LITTLE_ENDIAN()) + return v & 0xFFFFFF; + else + return v >> 8; +} + +/* + * Load the next 3 bytes from @p into the 24 low-order bits of a 32-bit value. + * The order in which the 3 bytes will be arranged as octets in the 24 bits is + * platform-dependent. At least 4 bytes (not 3) must be available at @p. + */ +static attrib_forceinline uint32_t +load_u24_unaligned(const uint8_t *p) +{ +#if UNALIGNED_ACCESS_IS_FAST + return loaded_u32_to_u24(load_u32_unaligned(p)); +#else + if (CPU_IS_LITTLE_ENDIAN()) + return ((uint32_t)p[0] << 0) | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16); + else + return ((uint32_t)p[2] << 0) | ((uint32_t)p[1] << 8) | ((uint32_t)p[0] << 16); +#endif +} + +/* + * The hash function: given a sequence prefix held in the low-order bits of a + * 32-bit value, multiply by a carefully-chosen large constant. 
Discard any
+ * bits of the product that don't fit in a 32-bit value, but take the
+ * next-highest @num_bits bits of the product as the hash value, as those have
+ * the most randomness.
+ */
+static attrib_forceinline uint32_t
+lz_hash(uint32_t seq, unsigned num_bits)
+{
+        return (uint32_t)(seq * 0x1E35A7BD) >> (32 - num_bits);
+}
+
+/*
+ * Return the number of bytes at @matchptr that match the bytes at @strptr, up
+ * to a maximum of @max_len. Initially, @start_len bytes are matched.
+ */
+static attrib_forceinline unsigned
+lz_extend(const uint8_t * const strptr, const uint8_t * const matchptr,
+          const unsigned start_len, const unsigned max_len)
+{
+        unsigned len = start_len;
+        machine_word_t v_word;
+
+        if (UNALIGNED_ACCESS_IS_FAST) {
+
+                if (likely(max_len - len >= 4 * WORDBYTES)) {
+
+                #define COMPARE_WORD_STEP                               \
+                        v_word = load_word_unaligned(&matchptr[len]) ^  \
+                                 load_word_unaligned(&strptr[len]);     \
+                        if (v_word != 0)                                \
+                                goto word_differs;                      \
+                        len += WORDBYTES;                               \
+
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                        COMPARE_WORD_STEP
+                #undef COMPARE_WORD_STEP
+                }
+
+                while (len + WORDBYTES <= max_len) {
+                        v_word = load_word_unaligned(&matchptr[len]) ^
+                                 load_word_unaligned(&strptr[len]);
+                        if (v_word != 0)
+                                goto word_differs;
+                        len += WORDBYTES;
+                }
+        }
+
+        while (len < max_len && matchptr[len] == strptr[len])
+                len++;
+        return len;
+
+word_differs:
+        if (CPU_IS_LITTLE_ENDIAN())
+                len += (bsfw(v_word) >> 3);
+        else
+                len += (WORDBITS - 1 - bsrw(v_word)) >> 3;
+        return len;
+}
+
+#endif /* _LIBLZX_MATCHFINDER_COMMON_H */
diff --git a/dlls/cabinet/liblzx_minmax.h b/dlls/cabinet/liblzx_minmax.h
new file mode 100644
index 00000000000..d4d76130caa
--- /dev/null
+++ b/dlls/cabinet/liblzx_minmax.h
@@ -0,0 +1,122 @@
+/*
+ * minmax.h
+ *
+ * Minimum/maximum helpers that avoid multiple evaluation.
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
+ * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
+ * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _LIBLZX_MINMAX_H
+#define _LIBLZX_MINMAX_H
+
+#include "liblzx_compiler.h"
+#include "liblzx_types.h"
+
+/* Get the minimum of two variables, without multiple evaluation. */
+static attrib_forceinline double
+min_double(double a, double b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline float
+min_float(float a, float b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline unsigned
+min_uint(unsigned a, unsigned b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline uint32_t
+min_u32(uint32_t a, uint32_t b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline size_t
+min_size(size_t a, size_t b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline int
+min_int(int a, int b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline void *
+min_ptr(void *a, void *b)
+{
+        return (a < b) ? a : b;
+}
+
+static attrib_forceinline const void *
+min_constptr(const void *a, const void *b)
+{
+        return (a < b) ? a : b;
+}
+
+/* Get the maximum of two variables, without multiple evaluation. */
+static attrib_forceinline double
+max_double(double a, double b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline unsigned
+max_uint(unsigned a, unsigned b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline uint32_t
+max_u32(uint32_t a, uint32_t b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline uint64_t
+max_u64(uint64_t a, uint64_t b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline void *
+max_ptr(void *a, void *b)
+{
+        return (a > b) ? a : b;
+}
+
+static attrib_forceinline const void *
+max_constptr(const void *a, const void *b)
+{
+        return (a > b) ? a : b;
+}
+
+#endif /* _LIBLZX_MINMAX_H */
diff --git a/dlls/cabinet/liblzx_types.h b/dlls/cabinet/liblzx_types.h
new file mode 100644
index 00000000000..b63e4b31fc1
--- /dev/null
+++ b/dlls/cabinet/liblzx_types.h
@@ -0,0 +1,33 @@
+#ifndef _LIBLZX_TYPES_H
+#define _LIBLZX_TYPES_H
+
+#include <inttypes.h>
+#include <stdbool.h>
+#include <stddef.h>
+
+#include "liblzx_compiler.h"
+
+/* Unsigned little endian types of exact size */
+typedef uint16_t _bitwise_attr le16_t;
+typedef uint32_t _bitwise_attr le32_t;
+typedef uint64_t _bitwise_attr le64_t;
+
+/* Unsigned big endian types of exact size */
+typedef uint16_t _bitwise_attr be16_t;
+typedef uint32_t _bitwise_attr be32_t;
+typedef uint64_t _bitwise_attr be64_t;
+
+/* A pointer to 'utf16lechar' indicates a UTF-16LE encoded string */
+typedef le16_t utf16lechar;
+
+/*
+ * Type of a machine word. 'unsigned long' would be logical, but that is only
+ * 32 bits on x86_64 Windows. The same applies to 'uint_fast32_t'. So the best
+ * we can do without a bunch of #ifdefs appears to be 'size_t'.
+ */
+typedef size_t machine_word_t;
+
+#define WORDBYTES sizeof(machine_word_t)
+#define WORDBITS (8 * WORDBYTES)
+
+#endif /* _LIBLZX_TYPES_H */
diff --git a/dlls/cabinet/liblzx_unaligned.h b/dlls/cabinet/liblzx_unaligned.h
new file mode 100644
index 00000000000..23ee938fb62
--- /dev/null
+++ b/dlls/cabinet/liblzx_unaligned.h
@@ -0,0 +1,134 @@
+/*
+ * unaligned.h - inline functions for unaligned memory accesses
+ *
+ * Copyright (C) 2025 Eric Lasota
+ * Based on wimlib. Copyright 2022 Eric Biggers
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use,
+ * copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following
+ * conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES + * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT + * HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, + * WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR + * OTHER DEALINGS IN THE SOFTWARE. + */ + +#ifndef _LIBLZX_UNALIGNED_H +#define _LIBLZX_UNALIGNED_H + +#include <string.h> + +#include "liblzx_compiler.h" +#include "liblzx_endianness.h" +#include "liblzx_types.h" + +#define DEFINE_UNALIGNED_TYPE(name, type) \ +static attrib_forceinline type \ +load_##name##_unaligned(const void *p) \ +{ \ + type v; \ + memcpy(&v, p, sizeof(v)); \ + return v; \ +} \ + \ +static attrib_forceinline void \ +store_##name##_unaligned(type v, void *p) \ +{ \ + memcpy(p, &v, sizeof(v)); \ +} + +DEFINE_UNALIGNED_TYPE(u16, uint16_t); +DEFINE_UNALIGNED_TYPE(u32, uint32_t); +DEFINE_UNALIGNED_TYPE(u64, uint64_t); +DEFINE_UNALIGNED_TYPE(le16, le16_t); +DEFINE_UNALIGNED_TYPE(le32, le32_t); +DEFINE_UNALIGNED_TYPE(le64, le64_t); +DEFINE_UNALIGNED_TYPE(be16, be16_t); +DEFINE_UNALIGNED_TYPE(be32, be32_t); +DEFINE_UNALIGNED_TYPE(be64, be64_t); +DEFINE_UNALIGNED_TYPE(size_t, size_t); +DEFINE_UNALIGNED_TYPE(machine_word_t, machine_word_t); + +#define load_word_unaligned load_machine_word_t_unaligned +#define store_word_unaligned store_machine_word_t_unaligned + +static attrib_forceinline uint16_t +get_unaligned_le16(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le16_to_cpu(load_le16_unaligned(p)); + else + return ((uint16_t)p[1] << 8) | p[0]; +} + +static attrib_forceinline uint32_t +get_unaligned_le32(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return le32_to_cpu(load_le32_unaligned(p)); + else + return ((uint32_t)p[3] << 24) | ((uint32_t)p[2] << 16) | + ((uint32_t)p[1] << 8) | p[0]; +} + +static attrib_forceinline uint32_t +get_unaligned_be32(const uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) + return be32_to_cpu(load_be32_unaligned(p)); + else + return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) | + ((uint32_t)p[2] << 8) | p[3]; +} + +static attrib_forceinline void +put_unaligned_le16(uint16_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le16_unaligned(cpu_to_le16(v), p); + } else { + p[0] = (uint8_t)(v >> 0); + p[1] = (uint8_t)(v >> 8); + } +} + +static attrib_forceinline void +put_unaligned_le32(uint32_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_le32_unaligned(cpu_to_le32(v), p); + } else { + p[0] = (uint8_t)(v >> 0); + p[1] = (uint8_t)(v >> 8); + p[2] = (uint8_t)(v >> 16); + p[3] = (uint8_t)(v >> 24); + } +} + +static attrib_forceinline void +put_unaligned_be32(uint32_t v, uint8_t *p) +{ + if (UNALIGNED_ACCESS_IS_FAST) { + store_be32_unaligned(cpu_to_be32(v), p); + } else { + p[0] = (uint8_t)(v >> 24); + p[1] = (uint8_t)(v >> 16); + p[2] = (uint8_t)(v >> 8); + p[3] = (uint8_t)(v >> 0); + } +} + +#endif /* _LIBLZX_UNALIGNED_H */ diff --git a/dlls/cabinet/liblzx_util.h b/dlls/cabinet/liblzx_util.h new file mode 100644 index 00000000000..4d832ae0da6 --- /dev/null +++ b/dlls/cabinet/liblzx_util.h @@ -0,0 +1,20 @@ +/* + * util.h - utility functions and macros + */ +#ifndef _LIBLZX_UTIL_H +#define _LIBLZX_UTIL_H + +#include "liblzx_compiler.h" +#include "liblzx_types.h" + +/**************** + * General macros + *****************/ + +/* Calculate 'n / d', but 
round up instead of down. */ +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) + +/* Get the number of elements of an array type. */ +#define ARRAY_LEN(array) (sizeof(array) / sizeof((array)[0])) + +#endif /* _LIBLZX_UTIL_H */ diff --git a/dlls/cabinet/tests/extract.c b/dlls/cabinet/tests/extract.c index 2e37dc7cda6..748115d67fd 100644 --- a/dlls/cabinet/tests/extract.c +++ b/dlls/cabinet/tests/extract.c @@ -104,9 +104,11 @@ static void create_test_files(void)
createTestFile("a.txt"); createTestFile("b.txt"); + createTestFile("c.txt"); + createTestFile("d.txt"); CreateDirectoryA("testdir", NULL); - createTestFile("testdir\c.txt"); - createTestFile("testdir\d.txt"); + createTestFile("testdir\e.txt"); + createTestFile("testdir\f.txt"); CreateDirectoryA("dest", NULL); }
@@ -114,11 +116,13 @@ static void delete_test_files(void)
 {
     DeleteFileA("a.txt");
     DeleteFileA("b.txt");
-    DeleteFileA("testdir\\c.txt");
-    DeleteFileA("testdir\\d.txt");
+    DeleteFileA("c.txt");
+    DeleteFileA("d.txt");
+    DeleteFileA("testdir\\e.txt");
+    DeleteFileA("testdir\\f.txt");
     RemoveDirectoryA("testdir");
- DeleteFileA("extract.cab"); + //DeleteFileA("extract.cab"); }
/* the FCI callbacks */ @@ -269,7 +273,7 @@ static INT_PTR CDECL get_open_info(char *pszName, USHORT *pdate, USHORT *ptime, return (INT_PTR)handle; }
-static void add_file(HFCI hfci, char *file) +static void add_file(HFCI hfci, char *file, TCOMP typeCompress) { char path[MAX_PATH]; BOOL res; @@ -279,7 +283,7 @@ static void add_file(HFCI hfci, char *file) lstrcatA(path, file);
res = FCIAddFile(hfci, path, file, FALSE, get_next_cabinet, progress, - get_open_info, tcompTYPE_MSZIP); + get_open_info, typeCompress); ok(res, "Expected FCIAddFile to succeed\n"); }
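On the compression-type values used below: TCOMPfromLZXWindow() packs an LZX window
order (15..21, per LZX_MIN_WINDOW_ORDER/LZX_MAX_WINDOW_ORDER in
liblzx_lzx_constants.h) into a TCOMP value for FCIAddFile(). On the encoder side,
the constants file states that an explicit match offset n is transmitted as
n + LZX_OFFSET_ADJUSTMENT; the usual reading is that adjusted values 0..2 then name
entries of the three-slot recent-offsets queue. A small illustration of that mapping
(illustrative only, not code from the patch):

    /* Illustration of the offset adjustment defined in
     * liblzx_lzx_constants.h: values below LZX_NUM_RECENT_OFFSETS select a
     * recent offset; anything else is the real offset plus
     * LZX_OFFSET_ADJUSTMENT (2). */
    static uint32_t adjust_offset(uint32_t offset, const uint32_t recent[3])
    {
        unsigned i;

        for (i = 0; i < 3; i++) {   /* LZX_NUM_RECENT_OFFSETS */
            if (offset == recent[i])
                return i;           /* repeat-offset code */
        }
        return offset + 2;          /* LZX_OFFSET_ADJUSTMENT */
    }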
@@ -300,10 +304,12 @@ static void create_cab_file(void)
     CCAB cabParams;
     HFCI hfci;
     ERF erf;
-    static CHAR a_txt[] = "a.txt",
-                b_txt[] = "b.txt",
-                testdir_c_txt[] = "testdir\\c.txt",
-                testdir_d_txt[] = "testdir\\d.txt";
+    static CHAR a_txt[] = "a.txt",
+                b_txt[] = "b.txt",
+                c_txt[] = "c.txt",
+                d_txt[] = "d.txt",
+                testdir_e_txt[] = "testdir\\e.txt",
+                testdir_f_txt[] = "testdir\\f.txt";
     BOOL res;
set_cab_parameters(&cabParams); @@ -314,10 +320,12 @@ static void create_cab_file(void)
ok(hfci != NULL, "Failed to create an FCI context\n");
- add_file(hfci, a_txt); - add_file(hfci, b_txt); - add_file(hfci, testdir_c_txt); - add_file(hfci, testdir_d_txt); + add_file(hfci, a_txt, tcompTYPE_MSZIP); + add_file(hfci, b_txt, tcompTYPE_MSZIP); + add_file(hfci, c_txt, TCOMPfromLZXWindow(21)); + add_file(hfci, d_txt, TCOMPfromLZXWindow(21)); + add_file(hfci, testdir_e_txt, tcompTYPE_MSZIP); + add_file(hfci, testdir_f_txt, tcompTYPE_MSZIP);
     res = FCIFlushCabinet(hfci, FALSE, get_next_cabinet, progress);
     ok(res, "Failed to flush the cabinet\n");
@@ -380,26 +388,30 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == (EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES),
        "Expected EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
     ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
@@ -411,23 +423,25 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_FILLFILELIST,
        "Expected EXTRACT_FILLFILELIST, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", TRUE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", TRUE), "list entry wrong\n");
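An aside on the match finders backing the new compressor: liblzx_matchfinder_common.h
locates candidates by hashing the first 3 bytes at each position. load_u24_unaligned()
packs them into the low 24 bits of a word and lz_hash() multiplies by 0x1E35A7BD,
keeping the top num_bits bits of the 32-bit product as the table index. Restated
portably in isolation:

    /* Standalone restatement of load_u24_unaligned() + lz_hash() from
     * liblzx_matchfinder_common.h. The real code may pack the 3 bytes in
     * either endian order; only consistency within one run matters. */
    static uint32_t hash_3_bytes(const unsigned char *p, unsigned num_bits)
    {
        uint32_t seq = (uint32_t)p[0] |
                       ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16);
        return (seq * 0x1E35A7BD) >> (32 - num_bits);
    }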
@@ -436,27 +450,31 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
     ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
     ok(RemoveDirectoryA("dest"), "Expected dest to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
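A brief note on liblzx_minmax.h above: the typed inline helpers exist because the
conventional min/max macros evaluate each argument twice, which misfires when an
argument has side effects. For example (illustrative, using min_uint() from the
patch):

    #define MIN_MACRO(a, b) ((a) < (b) ? (a) : (b))

    static unsigned side_effect_demo(const unsigned *arr, unsigned limit)
    {
        unsigned i = 0;
        /* The macro expands arr[i++] twice, so i can advance by 2 here. */
        unsigned v = MIN_MACRO(arr[i++], limit);
        /* The inline helper evaluates each argument exactly once. */
        unsigned w = min_uint(arr[i++], limit);
        return v + w + i;
    }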
@@ -464,25 +482,29 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to not exist\n");
+    ok(!DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
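Likewise for liblzx_unaligned.h: DEFINE_UNALIGNED_TYPE generates loads and stores
through memcpy() of a known constant size, which is defined behavior at any
alignment and which optimizing compilers reduce to a single move instruction on
targets where unaligned access is cheap. The pattern in isolation:

    #include <stdint.h>
    #include <string.h>

    /* Same technique as DEFINE_UNALIGNED_TYPE in liblzx_unaligned.h:
     * memcpy() with a compile-time-constant size is the portable way to
     * read an integer from a possibly misaligned address. */
    static uint32_t load_u32_any_alignment(const void *p)
    {
        uint32_t v;
        memcpy(&v, p, sizeof(v));  /* folds to one load where the target allows */
        return v;
    }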
@@ -497,26 +519,30 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 4, "Expected 4, got %d\n", session.FileCount);
+    ok(session.FileCount == 6, "Expected 6, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_EXTRACTFILES,
        "Expected EXTRACT_EXTRACTFILES, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
-    ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
+    ok(!DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "c.txt", FALSE), "list entry wrong\n");
+    ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
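Finally, on the lzx_preprocess()/lzx_postprocess() pair used by lzx_compress_chunk():
LZX's x86 filter rewrites the 32-bit relative displacement of each call opcode (0xe8)
into an absolute target so that identical call sites compress identically, and it
leaves the last LZX_E8_FILTER_TAIL_SIZE (10) bytes untouched. A simplified sketch of
the forward direction; the exact range checks follow the LZX specification and are
only approximated here:

    /* Simplified sketch of LZX E8 preprocessing (not the patch's exact code):
     * rewrite each x86 call displacement as an absolute target. Uses
     * get/put_unaligned_le32() from liblzx_unaligned.h. */
    static void e8_forward_sketch(uint8_t *data, size_t size,
                                  uint32_t chunk_offset, uint32_t file_size)
    {
        size_t i;

        if (size <= 10)  /* LZX_E8_FILTER_TAIL_SIZE */
            return;

        for (i = 0; i < size - 10; i++) {
            if (data[i] == 0xe8) {
                uint32_t pos = chunk_offset + (uint32_t)i;
                int32_t rel = (int32_t)get_unaligned_le32(&data[i + 1]);

                /* Translate only displacements that land inside the
                 * nominal file. */
                if (rel >= -(int32_t)pos && rel < (int32_t)file_size)
                    put_unaligned_le32((uint32_t)rel + pos, &data[i + 1]);
                i += 4;  /* skip the 4 operand bytes just examined */
            }
        }
    }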
@@ -525,25 +551,25 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == EXTRACT_FILLFILELIST,
        "Expected EXTRACT_FILLFILELIST, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry wrong\n");
@@ -551,24 +577,26 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == 0, "Expected 0, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", TRUE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", TRUE), "list entry wrong\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
+    ok(check_list(&node, "testdir\\f.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", TRUE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", TRUE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", TRUE), "list entry wrong\n");
@@ -577,28 +605,32 @@ static void test_Extract(void)
     res = pExtract(&session, "extract.cab");
     node = session.FileList;
     ok(res == S_OK, "Expected S_OK, got %ld\n", res);
-    ok(session.FileSize == 40, "Expected 40, got %d\n", session.FileSize);
+    ok(session.FileSize == 52, "Expected 52, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_NONE,
        "Expected FDIERROR_NONE, got %d\n", session.Error.erfOper);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Error.fError == FALSE, "Expected FALSE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 8, "Expected 8, got %d\n", session.FileCount);
+    ok(session.FileCount == 12, "Expected 12, got %d\n", session.FileCount);
     ok(session.Operation == 0, "Expected 0, got %d\n", session.Operation);
     ok(!lstrcmpA(session.Destination, "dest"), "Expected dest, got %s\n", session.Destination);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\d.txt"),
-       "Expected dest\\testdir\\d.txt, got %s\n", session.CurrentFile);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\f.txt"),
+       "Expected dest\\testdir\\f.txt, got %s\n", session.CurrentFile);
     ok(!*session.Reserved, "Expected empty string, got %s\n", session.Reserved);
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to exist\n");
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to exist\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     node = session.FilterList;
-    ok(check_list(&node, "testdir\\d.txt", FALSE), "list entry wrong\n");
-    ok(check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\f.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "d.txt", FALSE), "list entry wrong\n");
+    ok(check_list(&node, "c.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(check_list(&node, "a.txt", FALSE), "list entry wrong\n");
     free_file_list(&session);
@@ -625,10 +657,10 @@ static void test_Extract(void)
     ok(!session.FilterList, "Expected empty filter list\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to not exist\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry should not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry should not exist\n");
     free_file_list(&session);
@@ -660,10 +692,10 @@ static void test_Extract(void)
     ok(getFileSize("dest\\a.txt") == 11, "Expected dest\\a.txt to be 11 bytes\n");
     ok(!DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to be read-only\n");
     ok(!DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to not exist\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry should not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to not exist\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry should not exist\n");
     ok(!check_list(&node, "a.txt", FALSE), "list entry should not exist\n");
     free_file_list(&session);
@@ -673,8 +705,8 @@ static void test_Extract(void)
     /* first file exists and is writable, third file exists but is read-only */
     createTestFile("dest\\a.txt");
-    createTestFile("dest\\testdir\\c.txt");
-    SetFileAttributesA("dest\\testdir\\c.txt", FILE_ATTRIBUTE_READONLY);
+    createTestFile("dest\\testdir\\e.txt");
+    SetFileAttributesA("dest\\testdir\\e.txt", FILE_ATTRIBUTE_READONLY);
     ZeroMemory(&session, sizeof(SESSION));
     lstrcpyA(session.Destination, "dest");
     session.Operation = EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES;
@@ -682,12 +714,12 @@ static void test_Extract(void)
     node = session.FileList;
     ok(res == HRESULT_FROM_WIN32(ERROR_ACCESS_DENIED) || res == E_FAIL,
        "Expected HRESULT_FROM_WIN32(ERROR_ACCESS_DENIED) or E_FAIL, got %08lx\n", res);
-    ok(session.FileSize == 26, "Expected 26, got %d\n", session.FileSize);
+    ok(session.FileSize == 38, "Expected 38, got %d\n", session.FileSize);
     ok(session.Error.erfOper == FDIERROR_USER_ABORT,
        "Expected FDIERROR_USER_ABORT, got %d\n", session.Error.erfOper);
     ok(session.Error.fError == TRUE, "Expected TRUE, got %d\n", session.Error.fError);
-    ok(session.FileCount == 3, "Expected 3, got %d\n", session.FileCount);
-    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\c.txt"),
+    ok(session.FileCount == 5, "Expected 5, got %d\n", session.FileCount);
+    ok(!lstrcmpA(session.CurrentFile, "dest\\testdir\\e.txt"),
        "Expected dest\\c.txt, got %s\n", session.CurrentFile);
     ok(session.Error.erfType == 0, "Expected 0, got %d\n", session.Error.erfType);
     ok(session.Operation == (EXTRACT_FILLFILELIST | EXTRACT_EXTRACTFILES),
@@ -698,19 +730,21 @@ static void test_Extract(void)
     ok(getFileSize("dest\\a.txt") == 6, "Expected dest\\a.txt to be 6 bytes\n");
     ok(DeleteFileA("dest\\a.txt"), "Expected dest\\a.txt to exist\n");
     ok(DeleteFileA("dest\\b.txt"), "Expected dest\\b.txt to exist\n");
-    ok(!DeleteFileA("dest\\testdir\\c.txt"), "Expected dest\\testdir\\c.txt to be read-only\n");
-    ok(!DeleteFileA("dest\\testdir\\d.txt"), "Expected dest\\testdir\\d.txt to not exist\n");
-    ok(!check_list(&node, "testdir\\d.txt", FALSE), "list entry should not exist\n");
-    ok(!check_list(&node, "testdir\\c.txt", FALSE), "list entry wrong\n");
+    ok(DeleteFileA("dest\\c.txt"), "Expected dest\\c.txt to exist\n");
+    ok(DeleteFileA("dest\\d.txt"), "Expected dest\\d.txt to exist\n");
+    ok(!DeleteFileA("dest\\testdir\\e.txt"), "Expected dest\\testdir\\e.txt to be read-only\n");
+    ok(!DeleteFileA("dest\\testdir\\f.txt"), "Expected dest\\testdir\\f.txt to not exist\n");
+    ok(!check_list(&node, "testdir\\f.txt", FALSE), "list entry should not exist\n");
+    ok(!check_list(&node, "testdir\\e.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "b.txt", FALSE), "list entry wrong\n");
     ok(!check_list(&node, "a.txt", TRUE), "list entry wrong\n");
     free_file_list(&session);
-    SetFileAttributesA("dest\\testdir\\c.txt", FILE_ATTRIBUTE_NORMAL);
-    DeleteFileA("dest\\testdir\\c.txt");
+    SetFileAttributesA("dest\\testdir\\e.txt", FILE_ATTRIBUTE_NORMAL);
+    DeleteFileA("dest\\testdir\\e.txt");
-    ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist\n");
-    ok(RemoveDirectoryA("dest"), "Expected dest to exist\n");
+    ok(RemoveDirectoryA("dest\\testdir"), "Expected dest\\testdir to exist and be empty\n");
+    ok(RemoveDirectoryA("dest"), "Expected dest to exist and be empty\n");
 }
START_TEST(extract)