matchfinder.M4: add LimitedSearch option

When LimitedSearch is enabled, M4 checks for an overlapping match at only
one position within the current match instead of checking at every byte.
This gains about 50% in compression speed while only losing about
2% in compression ratio.
This commit is contained in:
Andy Balholm 2023-12-30 15:56:13 -08:00
parent 924a0eb0c6
commit 63f3f4372d
2 changed files with 18 additions and 1 deletions

View File

@ -657,3 +657,11 @@ func TestEncodeM4(t *testing.T) {
func BenchmarkEncodeM4(b *testing.B) { func BenchmarkEncodeM4(b *testing.B) {
benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20}, 1<<16) benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20}, 1<<16)
} }
func TestEncodeM4Limited(t *testing.T) {
test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 18, LimitedSearch: true}, 1<<16)
}
func BenchmarkEncodeM4Limited(b *testing.B) {
benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, LimitedSearch: true}, 1<<16)
}

View File

@ -10,7 +10,7 @@ import (
// interface that uses a simple hash table to find matches, // interface that uses a simple hash table to find matches,
// but the advanced parsing technique from // but the advanced parsing technique from
// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html, // https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html,
// except that it looks for matches at every input position. // except that it normally looks for matches at every input position.
type M4 struct { type M4 struct {
// MaxDistance is the maximum distance (in bytes) to look back for // MaxDistance is the maximum distance (in bytes) to look back for
// a match. The default is 65535. // a match. The default is 65535.
@ -28,6 +28,11 @@ type M4 struct {
// The default is 17 (128K entries). // The default is 17 (128K entries).
TableBits int TableBits int
// When LimitedSearch is true, it only looks for matches at certain
// points in the input rather than at every byte.
// (This makes compression faster, but hurts the compression ratio.)
LimitedSearch bool
table []uint32 table []uint32
history []byte history []byte
@ -97,6 +102,10 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match {
candidate := int(q.table[h]) candidate := int(q.table[h])
q.table[h] = uint32(i) q.table[h] = uint32(i)
if q.LimitedSearch && i < matches[0].End && i != matches[0].End+2-q.HashLen {
continue
}
if candidate == 0 || i-candidate > q.MaxDistance || i-candidate == matches[0].Start-matches[0].Match { if candidate == 0 || i-candidate > q.MaxDistance || i-candidate == matches[0].Start-matches[0].Match {
continue continue
} }