// SPDX-License-Identifier: GPL-2.0-only
/*
* Copyright (C) 2024-2026, SUSE LLC
*
* Authors: Enzo Matsumiya <ematsumiya@suse.de>
*
* Implementation of the LZ77 "plain" compression algorithm, as per MS-XCA spec.
*/
#include <linux/slab.h>
#include <linux/sizes.h>
#include <linux/count_zeros.h>
#include <linux/unaligned.h>
#include "lz77.h"

/*
 * Compression parameters.
 *
 * LZ77_MATCH_MAX_DIST: Farthest back a match can be from the current position (can be 1 - 8K).
 * LZ77_HASH_LOG:       ilog2 of the hash table size (recommended to be 13 - 18, default 15
 *                      (hash table size 32K)).
 * LZ77_HASH_SIZE:      Number of hash table entries (1 << LZ77_HASH_LOG).
 * LZ77_RSTEP_SIZE:     Number of bytes to read from the input buffer for hashing and the
 *                      initial match check (default 4 bytes; this effectively makes it the
 *                      minimum match length).
 * LZ77_MSTEP_SIZE:     Number of bytes to extend-compare a found match (default 8 bytes).
 * LZ77_SKIP_TRIGGER:   ilog2 value for adaptive skipping, i.e. to progressively skip input
 *                      bytes when we can't find matches.  Default is 4.
 *                      Higher values decrease compression time, but result in a worse
 *                      compression ratio; lower values give a better ratio (more matches
 *                      found), but increase time.
 */
#define LZ77_MATCH_MAX_DIST SZ_8K
#define LZ77_HASH_LOG 15
#define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG)
#define LZ77_RSTEP_SIZE sizeof(u32)
#define LZ77_MSTEP_SIZE sizeof(u64)
#define LZ77_SKIP_TRIGGER 4
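/* Prefetch for read (rw = 0) with high temporal locality (locality = 3). */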
#define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3)
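/* One flag bit per encoded symbol; a flag word is a u32, so it's flushed every 32 symbols. */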
#define LZ77_FLAG_MAX 32

static __always_inline u8 lz77_read8(const u8 *ptr)
{
	return get_unaligned(ptr);
}

static __always_inline u32 lz77_read32(const u32 *ptr)
{
	return get_unaligned(ptr);
}

static __always_inline u64 lz77_read64(const u64 *ptr)
{
	return get_unaligned(ptr);
}

static __always_inline void lz77_write8(u8 *ptr, u8 v)
{
	put_unaligned(v, ptr);
}

static __always_inline void lz77_write16(u16 *ptr, u16 v)
{
	put_unaligned_le16(v, ptr);
}

static __always_inline void lz77_write32(u32 *ptr, u32 v)
{
	put_unaligned_le32(v, ptr);
}

static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end)
{
const void *start = cur;
	/*
	 * Safe as a do/while: the main loop guarantees at least
	 * LZ77_MSTEP_SIZE readable bytes past @cur before calling here, so the
	 * first 8-byte read is in bounds.
	 */
do {
const u64 diff = lz77_read64(cur) ^ lz77_read64(match);
if (!diff) {
cur += LZ77_MSTEP_SIZE;
match += LZ77_MSTEP_SIZE;
continue;
}
		/*
		 * The number of trailing zero bits in @diff corresponds to the
		 * leading bytes (little-endian) that still match; >> 3
		 * converts bits to bytes.
		 */
cur += count_trailing_zeros(diff) >> 3;
return (cur - start);
} while (likely(cur + LZ77_MSTEP_SIZE <= end));
/* Fallback to byte-by-byte comparison for last <8 bytes. */
while (cur < end && lz77_read8(cur) == lz77_read8(match)) {
cur++;
match++;
}
return (cur - start);
}

/**
* lz77_encode_match() - Match encoding.
* @dst: compressed buffer
 * @nib: pointer to the location of the current shared extra-length nibble in @dst (or NULL if none is open)
* @dist: match distance
* @len: match length
*
* Assumes all args were previously checked.
*
* Return: @dst advanced to new position
*
* Ref: MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing"
*/
static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len)
{
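	/*
	 * Match token layout (MS-XCA): a 16-bit LE header holds (dist - 1) in
	 * the high 13 bits and min(len - 3, 7) in the low 3 bits.  Lengths
	 * that don't fit escape progressively: a 3-bit field of 7 spills into
	 * a shared nibble, a nibble of 15 into an extra byte, a byte of 0xff
	 * into a u16, and a u16 of 0 into a full u32.
	 */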
len -= 3;
dist--;
dist <<= 3;
if (len < 7) {
lz77_write16(dst, dist + len);
return dst + sizeof(u16);
}
dist |= 7;
lz77_write16(dst, dist);
dst += sizeof(u16);
len -= 7;
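	/*
	 * Extra-length nibbles are shared pairwise: the first match needing
	 * one claims a fresh byte (low nibble) and stashes its address in
	 * @nib; the next one fills the high nibble and clears @nib.
	 */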
if (!*nib) {
lz77_write8(dst, umin(len, 15));
*nib = dst;
dst++;
} else {
u8 *b = *nib;
lz77_write8(b, *b | umin(len, 15) << 4);
*nib = NULL;
}
if (len < 15)
return dst;
len -= 15;
if (len < 255) {
lz77_write8(dst, len);
return dst + 1;
}
lz77_write8(dst, 0xff);
dst++;
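	/* The u16/u32 fields encode the full (len - 3), so undo the earlier subtractions. */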
len += 7 + 15;
if (len <= 0xffff) {
lz77_write16(dst, len);
return dst + sizeof(u16);
}
lz77_write16(dst, 0);
dst += sizeof(u16);
lz77_write32(dst, len);
return dst + sizeof(u32);
}

/**
* lz77_encode_literals() - Literals encoding.
* @start: where to start copying literals (uncompressed buffer)
* @end: when to stop copying (uncompressed buffer)
* @dst: compressed buffer
* @f: pointer to current flag value
* @fc: pointer to current flag count
* @fp: pointer to current flag address
*
* Batch copy literals from @start to @dst, updating flag values accordingly.
* Assumes all args were previously checked.
*
* Return: @dst advanced to new position
*
 * Ref: MS-XCA 2.3.4 "Plain LZ77 Compression Algorithm Details" - "Processing"
*/
static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst,
long *f, u32 *fc, void **fp)
{
if (start >= end)
return dst;
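	/*
	 * Literals are copied verbatim; each contributes a 0 bit to the
	 * current flag word (matches contribute 1 bits in the main loop).
	 * Every LZ77_FLAG_MAX flags, the word is written back to its reserved
	 * slot and a new slot is reserved at @dst.
	 */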
do {
const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc);
memcpy(dst, start, len);
dst += len;
start += len;
*f <<= len;
*fc += len;
if (*fc == LZ77_FLAG_MAX) {
lz77_write32(*fp, *f);
*fc = 0;
*fp = dst;
dst += sizeof(u32);
}
} while (start < end);
return dst;
}
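
/*
 * Multiplicative hash of the 4 bytes read at the current position.  0x9E3779B9
 * is the 32-bit golden ratio constant and 0x85EBCA6B comes from the MurmurHash3
 * 32-bit finalizer; the shift keeps the top LZ77_HASH_LOG bits.
 */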
static __always_inline u32 lz77_hash(const u32 v)
{
return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG);
}

noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen)
{
const void *srcp, *rlim, *end, *anchor;
u32 *htable, hash, flag_count = 0;
void *dstp, *nib, *flag_pos;
long flag = 0;
	/* An undersized destination buffer is probably a caller bug, so warn about it. */
if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen)))
return -EINVAL;
srcp = anchor = src;
end = srcp + slen; /* absolute end */
rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */
dstp = dst;
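	/* Reserve the first 32-bit flag word at the start of the output. */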
flag_pos = dstp;
dstp += sizeof(u32);
nib = NULL;
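	/* @htable maps the hash of 4 input bytes to the offset (from @src) of their latest occurrence. */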
htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL);
if (!htable)
return -ENOMEM;
LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE);
/*
* Adjust @srcp so we don't get a false positive match on first iteration.
* Then prepare hash for first loop iteration (don't advance @srcp again).
*/
hash = lz77_hash(lz77_read32(srcp++));
htable[hash] = 0;
hash = lz77_hash(lz77_read32(srcp));
/*
* Main loop.
*
* @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp.
*
 * This code is laid out to best utilise the CPU's fetch-decode-execute
 * pipeline.  Attempts to optimize it, or even to reorganize it, can cause
 * large performance regressions.
*/
do {
const void *match, *next = srcp;
u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER;
/* Match finding (hot path -- don't change the read/check/write order). */
do {
const u32 cur_hash = hash;
srcp = next;
next += step;
/*
* Adaptive skipping.
*
* Increment @step every (1 << LZ77_SKIP_TRIGGER, 16 in our case) bytes
* without a match.
* Reset to 1 when a match is found.
*/
step = (skip++ >> LZ77_SKIP_TRIGGER);
if (unlikely(next > rlim))
goto out;
hash = lz77_hash(lz77_read32(next));
match = src + htable[cur_hash];
htable[cur_hash] = srcp - src;
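			/*
			 * Keep scanning while the candidate match is too far
			 * back to encode, or its first 4 bytes differ from
			 * the current position.
			 */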
} while (likely(match + LZ77_MATCH_MAX_DIST < srcp) ||
lz77_read32(match) != lz77_read32(srcp));
/*
* Match found. Warm/cold path; begin parsing @srcp and writing to @dstp:
* - flush literals
* - compute match length (*)
* - encode match
*
* (*) Current minimum match length is defined by the memory read size above, so
* here we already know that we have 4 matching bytes, but it's just faster to
* redundantly compute it again in lz77_match_len() than to adjust pointers/len.
*/
dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos);
len = lz77_match_len(match, srcp, end);
dstp = lz77_encode_match(dstp, &nib, srcp - match, len);
srcp += len;
anchor = srcp;
LZ77_PREFETCH(srcp);
flag = (flag << 1) | 1;
flag_count++;
if (flag_count == LZ77_FLAG_MAX) {
lz77_write32(flag_pos, flag);
flag_count = 0;
flag_pos = dstp;
dstp += sizeof(u32);
}
if (unlikely(srcp > rlim))
break;
/* Prepare for next loop. */
hash = lz77_hash(lz77_read32(srcp));
} while (srcp < end);
out:
dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos);
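	/*
	 * Flush the last, possibly partial, flag word: left-justify the
	 * accumulated flags and pad the unused low bits with 1s.
	 */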
flag_count = LZ77_FLAG_MAX - flag_count;
flag <<= flag_count;
flag |= (1UL << flag_count) - 1;
lz77_write32(flag_pos, flag);
*dlen = dstp - dst;
kvfree(htable);
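	/* Compressing is only worthwhile if the output ended up smaller than the input. */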
if (*dlen < slen)
return 0;
return -EMSGSIZE;
}