matchfinder: remove MultiHash

It was an interesting experiment, but it didn't do any better than M4.
2024-01-09 06:29:08 -08:00 · 2024-01-09 06:29:08 -08:00 · 1b6cf3696e
parent 265f3afc2a
commit 1b6cf3696e
2 changed files with 0 additions and 270 deletions
--- a/brotli_test.go
+++ b/brotli_test.go
@ -693,27 +693,3 @@ func BenchmarkEncodeM4Chain64(b *testing.B) {
 func BenchmarkEncodeM4Chain128(b *testing.B) {
 	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.M4{MaxDistance: 1 << 20, ChainLength: 128, HashLen: 5, DistanceBitCost: 57}, 1<<16)
 }
-
-func TestEncodeMultiHash6(t *testing.T) {
-	test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16)
-}
-
-func TestEncodeMultiHash6_8(t *testing.T) {
-	test(t, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 18, DistanceBitCost: 57, HashLengths: []int{6, 8}}, 1<<16)
-}
-
-func BenchmarkEncodeMultiHash6(b *testing.B) {
-	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{6}}, 1<<16)
-}
-
-func BenchmarkEncodeMultiHash5_8(b *testing.B) {
-	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 8}}, 1<<16)
-}
-
-func BenchmarkEncodeMultiHash5_7_9(b *testing.B) {
-	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 7, 9}}, 1<<16)
-}
-
-func BenchmarkEncodeMultiHash5_6_7_9(b *testing.B) {
-	benchmark(b, "testdata/Isaac.Newton-Opticks.txt", &matchfinder.MultiHash{MaxDistance: 1 << 20, DistanceBitCost: 57, HashLengths: []int{5, 6, 7, 9}}, 1<<16)
-}
--- a/matchfinder/multihash.go
+++ b/matchfinder/multihash.go
@ -1,246 +0,0 @@
-package matchfinder
-
-import (
-	"encoding/binary"
-	"math/bits"
-	"sort"
-)
-
-// MultiHash is an implementation of the MatchFinder
-// interface that uses multiple hashes of different lengths.
-type MultiHash struct {
-	// MaxDistance is the maximum distance (in bytes) to look back for
-	// a match. The default is 65535.
-	MaxDistance int
-
-	// MinLength is the length of the shortest match to return.
-	// The default is 4.
-	MinLength int
-
-	// HashLengths is a list of the hashes to use, with the number of
-	// bytes to use for each. For example, to use 4-byte, 7-byte, and
-	// 10-byte hashes, set HashLengths to []int{4, 7, 10}.
-	// The minimum length is 4.
-	HashLengths []int
-
-	// TableBits is the number of bits in the hash table indexes.
-	// The default is 17 (128K entries).
-	TableBits int
-
-	// DistanceBitCost is used when comparing two matches to see
-	// which is better. The comparison is primarily based on the length
-	// of the matches, but it can also take the distance into account,
-	// in terms of the number of bits needed to represent the distance.
-	// One byte of length is given a score of 256, so 32 (256/8) would
-	// be a reasonable first guess for the value of one bit.
-	// (The default is 0, which bases the comparison solely on length.)
-	DistanceBitCost int
-
-	tables [][]uint32
-
-	history []byte
-}
-
-func (q *MultiHash) Reset() {
-	for _, t := range q.tables {
-		for i := range t {
-			t[i] = 0
-		}
-	}
-	q.history = q.history[:0]
-}
-
-func (q *MultiHash) score(m absoluteMatch) int {
-	return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost
-}
-
-func (q *MultiHash) FindMatches(dst []Match, src []byte) []Match {
-	if q.MaxDistance == 0 {
-		q.MaxDistance = 65535
-	}
-	if q.MinLength == 0 {
-		q.MinLength = 4
-	}
-	if q.TableBits == 0 {
-		q.TableBits = 17
-	}
-	if len(q.tables) < len(q.HashLengths) {
-		q.tables = make([][]uint32, len(q.HashLengths))
-		for i := range q.tables {
-			q.tables[i] = make([]uint32, 1<<q.TableBits)
-		}
-	}
-	sort.Ints(q.HashLengths)
-	maxHashLen := q.HashLengths[len(q.HashLengths)-1]
-
-	e := matchEmitter{Dst: dst}
-
-	if len(q.history) > q.MaxDistance*2 {
-		// Trim down the history buffer.
-		delta := len(q.history) - q.MaxDistance
-		copy(q.history, q.history[delta:])
-		q.history = q.history[:q.MaxDistance]
-
-		for _, t := range q.tables {
-			for i, v := range t {
-				newV := int(v) - delta
-				if newV < 0 {
-					newV = 0
-				}
-				t[i] = uint32(newV)
-			}
-		}
-	}
-
-	// Append src to the history buffer.
-	e.NextEmit = len(q.history)
-	q.history = append(q.history, src...)
-	src = q.history
-
-	// matches stores the matches that have been found but not emitted,
-	// in reverse order. (matches[0] is the most recent one.)
-	var matches [3]absoluteMatch
-
-	candidates := make([]int, len(q.HashLengths))
-
-	for i := e.NextEmit; i < len(src)-maxHashLen; i++ {
-		if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
-			// We have found some matches, and we're far enough along that we probably
-			// won't find overlapping matches, so we might as well emit them.
-			if matches[1] != (absoluteMatch{}) {
-				e.trim(matches[1], matches[0].Start, q.MinLength)
-			}
-			e.emit(matches[0])
-			matches = [3]absoluteMatch{}
-		}
-
-		// Calculate and store the hashes.
-		h := uint32(0x811c9dc5) // FNV-32 offset basis
-		nb := 0
-		for j, hashLen := range q.HashLengths {
-			for nb < hashLen {
-				h ^= uint32(src[i+nb])
-				h *= 0x01000193 // FNV-32 prime
-				nb++
-			}
-			index := h >> (32 - q.TableBits)
-			candidates[j] = int(q.tables[j][index])
-			q.tables[j][index] = uint32(i)
-		}
-
-		// Look for a match.
-		var currentMatch absoluteMatch
-
-		if i < matches[0].End {
-			// If we're looking for an overlapping match, we only need to check the
-			// hash that ends 2 bytes after the end of the previous match.
-			for j, candidate := range candidates {
-				if i+q.HashLengths[j] != matches[0].End+2 {
-					continue
-				}
-				if candidate == 0 || i-candidate > q.MaxDistance {
-					break
-				}
-				if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) {
-					break
-				}
-				m := extendMatch2(src, i, candidate, e.NextEmit)
-				if m.End-m.Start >= q.HashLengths[j] {
-					currentMatch = m
-				}
-			}
-		} else {
-			for j, candidate := range candidates {
-				if candidate == 0 || i-candidate > q.MaxDistance {
-					break
-				}
-				if i-candidate == matches[0].Start-matches[0].Match {
-					// Don't bother to check for the same match we already have.
-					continue
-				}
-				if currentMatch.End-currentMatch.Start > q.HashLengths[j] {
-					// Don't bother with hashes that are shorter than the current match.
-					continue
-				}
-				if binary.LittleEndian.Uint32(src[candidate:]) != binary.LittleEndian.Uint32(src[i:]) {
-					break
-				}
-				m := extendMatch2(src, i, candidate, e.NextEmit)
-				if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
-					currentMatch = m
-				}
-			}
-		}
-
-		if currentMatch.End-currentMatch.Start < q.MinLength {
-			continue
-		}
-
-		overlapPenalty := 0
-		if matches[0] != (absoluteMatch{}) {
-			overlapPenalty = 275
-			if currentMatch.Start <= matches[1].End {
-				// This match would completely replace the previous match,
-				// so there is no penalty for overlap.
-				overlapPenalty = 0
-			}
-		}
-
-		if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
-			continue
-		}
-
-		matches = [3]absoluteMatch{
-			currentMatch,
-			matches[0],
-			matches[1],
-		}
-
-		if matches[2] == (absoluteMatch{}) {
-			continue
-		}
-
-		// We have three matches, so it's time to emit one and/or eliminate one.
-		switch {
-		case matches[0].Start < matches[2].End:
-			// The first and third matches overlap; discard the one in between.
-			matches = [3]absoluteMatch{
-				matches[0],
-				matches[2],
-				absoluteMatch{},
-			}
-
-		case matches[0].Start < matches[2].End+q.MinLength:
-			// The newest and oldest matches don't overlap, but there's no room for another
-			// match between them. Emit the oldest (matches[2]) and discard the middle one.
-			e.emit(matches[2])
-			matches = [3]absoluteMatch{
-				matches[0],
-				absoluteMatch{},
-				absoluteMatch{},
-			}
-
-		default:
-			// Emit the oldest match (matches[2]), shortening it if necessary to avoid overlap with matches[1].
-			e.trim(matches[2], matches[1].Start, q.MinLength)
-			matches[2] = absoluteMatch{}
-		}
-	}
-
-	// We've found all the matches now; emit the remaining ones.
-	if matches[1] != (absoluteMatch{}) {
-		e.trim(matches[1], matches[0].Start, q.MinLength)
-	}
-	if matches[0] != (absoluteMatch{}) {
-		e.emit(matches[0])
-	}
-
-	dst = e.Dst
-	if e.NextEmit < len(src) {
-		dst = append(dst, Match{
-			Unmatched: len(src) - e.NextEmit,
-		})
-	}
-
-	return dst
-}