| // SPDX-License-Identifier: GPL-2.0-only |
| /* |
| * Copyright (C) 2024-2026, SUSE LLC |
| * |
| * Authors: Enzo Matsumiya <ematsumiya@suse.de> |
| * |
| * Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec. |
| */ |
| #include <linux/slab.h> |
| #include <linux/sizes.h> |
| #include <linux/count_zeros.h> |
| #include <linux/unaligned.h> |
| |
| #include "lz77.h" |
| |
| /* |
| * Compression parameters. |
| * |
| * LZ77_MATCH_MAX_DIST: Farthest back a match can be from current position (can be 1 - 8K). |
| * LZ77_HASH_LOG: |
| * LZ77_HASH_SIZE: ilog2 hash size (recommended to be 13 - 18, default 15 (hash size |
| * 32k)). |
| * LZ77_RSTEP_SIZE: Number of bytes to read from input buffer for hashing and initial |
| * match check (default 4 bytes, this effectivelly makes this the min |
| * match len). |
| * LZ77_MSTEP_SIZE: Number of bytes to extend-compare a found match (default 8 bytes). |
| * LZ77_SKIP_TRIGGER: ilog2 value for adaptive skipping, i.e. to progressively skip input |
| * bytes when we can't find matches. Default is 4. |
| * Higher values (>0) will decrease compression time, but will result |
| * in worse compression ratio. Lower values will give better |
| * compression ratio (more matches found), but will increase time. |
| */ |
| #define LZ77_MATCH_MAX_DIST SZ_8K |
| #define LZ77_HASH_LOG 15 |
| #define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) |
| #define LZ77_RSTEP_SIZE sizeof(u32) |
| #define LZ77_MSTEP_SIZE sizeof(u64) |
| #define LZ77_SKIP_TRIGGER 4 |
| |
| #define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3) |
| #define LZ77_FLAG_MAX 32 |
| |
| static __always_inline u8 lz77_read8(const u8 *ptr) |
| { |
| return get_unaligned(ptr); |
| } |
| |
| static __always_inline u32 lz77_read32(const u32 *ptr) |
| { |
| return get_unaligned(ptr); |
| } |
| |
| static __always_inline u64 lz77_read64(const u64 *ptr) |
| { |
| return get_unaligned(ptr); |
| } |
| |
| static __always_inline void lz77_write8(u8 *ptr, u8 v) |
| { |
| put_unaligned(v, ptr); |
| } |
| |
| static __always_inline void lz77_write16(u16 *ptr, u16 v) |
| { |
| put_unaligned_le16(v, ptr); |
| } |
| |
| static __always_inline void lz77_write32(u32 *ptr, u32 v) |
| { |
| put_unaligned_le32(v, ptr); |
| } |
| |
| static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end) |
| { |
| const void *start = cur; |
| |
| /* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */ |
| do { |
| const u64 diff = lz77_read64(cur) ^ lz77_read64(match); |
| |
| if (!diff) { |
| cur += LZ77_MSTEP_SIZE; |
| match += LZ77_MSTEP_SIZE; |
| |
| continue; |
| } |
| |
| /* This computes the number of common bytes in @diff. */ |
| cur += count_trailing_zeros(diff) >> 3; |
| |
| return (cur - start); |
| } while (likely(cur + LZ77_MSTEP_SIZE <= end)); |
| |
| /* Fallback to byte-by-byte comparison for last <8 bytes. */ |
| while (cur < end && lz77_read8(cur) == lz77_read8(match)) { |
| cur++; |
| match++; |
| } |
| |
| return (cur - start); |
| } |
| |
| /** |
| * lz77_encode_match() - Match encoding. |
| * @dst: compressed buffer |
| * @nib: pointer to an address in @dst |
| * @dist: match distance |
| * @len: match length |
| * |
| * Assumes all args were previously checked. |
| * |
| * Return: @dst advanced to new position |
| * |
| * Ref: MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing" |
| */ |
| static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len) |
| { |
| len -= 3; |
| dist--; |
| dist <<= 3; |
| |
| if (len < 7) { |
| lz77_write16(dst, dist + len); |
| |
| return dst + sizeof(u16); |
| } |
| |
| dist |= 7; |
| lz77_write16(dst, dist); |
| dst += sizeof(u16); |
| len -= 7; |
| |
| if (!*nib) { |
| lz77_write8(dst, umin(len, 15)); |
| *nib = dst; |
| dst++; |
| } else { |
| u8 *b = *nib; |
| |
| lz77_write8(b, *b | umin(len, 15) << 4); |
| *nib = NULL; |
| } |
| |
| if (len < 15) |
| return dst; |
| |
| len -= 15; |
| if (len < 255) { |
| lz77_write8(dst, len); |
| |
| return dst + 1; |
| } |
| |
| lz77_write8(dst, 0xff); |
| dst++; |
| len += 7 + 15; |
| if (len <= 0xffff) { |
| lz77_write16(dst, len); |
| |
| return dst + sizeof(u16); |
| } |
| |
| lz77_write16(dst, 0); |
| dst += sizeof(u16); |
| lz77_write32(dst, len); |
| |
| return dst + sizeof(u32); |
| } |
| |
| /** |
| * lz77_encode_literals() - Literals encoding. |
| * @start: where to start copying literals (uncompressed buffer) |
| * @end: when to stop copying (uncompressed buffer) |
| * @dst: compressed buffer |
| * @f: pointer to current flag value |
| * @fc: pointer to current flag count |
| * @fp: pointer to current flag address |
| * |
| * Batch copy literals from @start to @dst, updating flag values accordingly. |
| * Assumes all args were previously checked. |
| * |
| * Return: @dst advanced to new position |
| * |
| * MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing" |
| */ |
| static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst, |
| long *f, u32 *fc, void **fp) |
| { |
| if (start >= end) |
| return dst; |
| |
| do { |
| const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc); |
| |
| memcpy(dst, start, len); |
| |
| dst += len; |
| start += len; |
| |
| *f <<= len; |
| *fc += len; |
| if (*fc == LZ77_FLAG_MAX) { |
| lz77_write32(*fp, *f); |
| *fc = 0; |
| *fp = dst; |
| dst += sizeof(u32); |
| } |
| } while (start < end); |
| |
| return dst; |
| } |
| |
| static __always_inline u32 lz77_hash(const u32 v) |
| { |
| return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG); |
| } |
| |
| noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen) |
| { |
| const void *srcp, *rlim, *end, *anchor; |
| u32 *htable, hash, flag_count = 0; |
| void *dstp, *nib, *flag_pos; |
| long flag = 0; |
| |
| /* This is probably a bug, so throw a warning. */ |
| if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen))) |
| return -EINVAL; |
| |
| srcp = anchor = src; |
| end = srcp + slen; /* absolute end */ |
| rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */ |
| dstp = dst; |
| flag_pos = dstp; |
| dstp += sizeof(u32); |
| nib = NULL; |
| |
| htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL); |
| if (!htable) |
| return -ENOMEM; |
| |
| LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE); |
| |
| /* |
| * Adjust @srcp so we don't get a false positive match on first iteration. |
| * Then prepare hash for first loop iteration (don't advance @srcp again). |
| */ |
| hash = lz77_hash(lz77_read32(srcp++)); |
| htable[hash] = 0; |
| hash = lz77_hash(lz77_read32(srcp)); |
| |
| /* |
| * Main loop. |
| * |
| * @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp. |
| * |
| * This code was crafted in a way to best utilise fetch-decode-execute CPU flow. |
| * Any attempt to optimize it, or even organize it, can lead to huge performance loss. |
| */ |
| do { |
| const void *match, *next = srcp; |
| u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER; |
| |
| /* Match finding (hot path -- don't change the read/check/write order). */ |
| do { |
| const u32 cur_hash = hash; |
| |
| srcp = next; |
| next += step; |
| |
| /* |
| * Adaptive skipping. |
| * |
| * Increment @step every (1 << LZ77_SKIP_TRIGGER, 16 in our case) bytes |
| * without a match. |
| * Reset to 1 when a match is found. |
| */ |
| step = (skip++ >> LZ77_SKIP_TRIGGER); |
| if (unlikely(next > rlim)) |
| goto out; |
| |
| hash = lz77_hash(lz77_read32(next)); |
| match = src + htable[cur_hash]; |
| htable[cur_hash] = srcp - src; |
| } while (likely(match + LZ77_MATCH_MAX_DIST < srcp) || |
| lz77_read32(match) != lz77_read32(srcp)); |
| |
| /* |
| * Match found. Warm/cold path; begin parsing @srcp and writing to @dstp: |
| * - flush literals |
| * - compute match length (*) |
| * - encode match |
| * |
| * (*) Current minimum match length is defined by the memory read size above, so |
| * here we already know that we have 4 matching bytes, but it's just faster to |
| * redundantly compute it again in lz77_match_len() than to adjust pointers/len. |
| */ |
| dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos); |
| len = lz77_match_len(match, srcp, end); |
| dstp = lz77_encode_match(dstp, &nib, srcp - match, len); |
| srcp += len; |
| anchor = srcp; |
| |
| LZ77_PREFETCH(srcp); |
| |
| flag = (flag << 1) | 1; |
| flag_count++; |
| if (flag_count == LZ77_FLAG_MAX) { |
| lz77_write32(flag_pos, flag); |
| flag_count = 0; |
| flag_pos = dstp; |
| dstp += sizeof(u32); |
| } |
| |
| if (unlikely(srcp > rlim)) |
| break; |
| |
| /* Prepare for next loop. */ |
| hash = lz77_hash(lz77_read32(srcp)); |
| } while (srcp < end); |
| out: |
| dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos); |
| |
| flag_count = LZ77_FLAG_MAX - flag_count; |
| flag <<= flag_count; |
| flag |= (1UL << flag_count) - 1; |
| lz77_write32(flag_pos, flag); |
| |
| *dlen = dstp - dst; |
| kvfree(htable); |
| |
| if (*dlen < slen) |
| return 0; |
| |
| return -EMSGSIZE; |
| } |