From 63f3f4372d1d8ec9deb2180561ed9e9f3edf1468 Mon Sep 17 00:00:00 2001 From: Andy Balholm Date: Sat, 30 Dec 2023 15:56:13 -0800 Subject: [PATCH] matchfinder.M4: add LimitedSearch option Using LimitedSearch, it only checks for overlapping matches in one place instead of checking at each byte. This gains about 50% in compression speed while only losing about 2% in compression ratio. --- brotli_test.go | 8 ++++++++ matchfinder/m4.go | 11 ++++++++++- 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/brotli_test.go b/brotli_test.go index 4dd8b54..4ad417b 100644 --- a/brotli_test.go +++ b/brotli_test.go @@ -657,3 +657,11 @@ func TestEncodeM4(t *testing.T) { func BenchmarkEncodeM4(b *testing.B) { benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20}, 1<<16) } + +func TestEncodeM4Limited(t *testing.T) { + test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, LimitedSearch: true}, 1<<16) +} + +func BenchmarkEncodeM4Limited(b *testing.B) { + benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, LimitedSearch: true}, 1<<16) +} diff --git a/matchfinder/m4.go b/matchfinder/m4.go index fb5c83d..d16f892 100644 --- a/matchfinder/m4.go +++ b/matchfinder/m4.go @@ -10,7 +10,7 @@ import ( // interface that uses a simple hash table to find matches, // but the advanced parsing technique from // https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html, -// except that it looks for matches at every input position. +// except that it normally looks for matches at every input position. type M4 struct { // MaxDistance is the maximum distance (in bytes) to look back for // a match. The default is 65535. @@ -28,6 +28,11 @@ type M4 struct { // The default is 17 (128K entries). TableBits int + // When LimitedSearch is true, it only looks for matches at certain + // points in the input rather than at every byte. 
+ // (This makes compression faster, but hurts the compression ratio.) + LimitedSearch bool + table []uint32 history []byte @@ -97,6 +102,10 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { candidate := int(q.table[h]) q.table[h] = uint32(i) + if q.LimitedSearch && i < matches[0].End && i != matches[0].End+2-q.HashLen { + continue + } + if candidate == 0 || i-candidate > q.MaxDistance || i-candidate == matches[0].Start-matches[0].Match { continue }