From 924a0eb0c6cfee88f3847ef957f38e10cfbfde17 Mon Sep 17 00:00:00 2001 From: Andy Balholm Date: Thu, 28 Dec 2023 17:21:34 -0800 Subject: [PATCH] matchfinder.M4: more refactoring Factor out matchEmitter.trim, and make TableBits configurable. --- matchfinder/emitter.go | 11 ++++++++++ matchfinder/m4.go | 48 ++++++++++++++++++------------------------ 2 files changed, 31 insertions(+), 28 deletions(-) diff --git a/matchfinder/emitter.go b/matchfinder/emitter.go index 507d1ca..37ed8e1 100644 --- a/matchfinder/emitter.go +++ b/matchfinder/emitter.go @@ -32,3 +32,14 @@ func (e *matchEmitter) emit(m absoluteMatch) { }) e.NextEmit = m.End } + +// trim shortens m if it extends past maxEnd. Then if the length is at least +// minLength, the match is emitted. +func (e *matchEmitter) trim(m absoluteMatch, maxEnd int, minLength int) { + if m.End > maxEnd { + m.End = maxEnd + } + if m.End-m.Start >= minLength { + e.emit(m) + } +} diff --git a/matchfinder/m4.go b/matchfinder/m4.go index 6a356a2..fb5c83d 100644 --- a/matchfinder/m4.go +++ b/matchfinder/m4.go @@ -6,11 +6,6 @@ import ( "runtime" ) -const ( - ssapBits = 17 - ssapMask = (1 << ssapBits) - 1 -) - // M4 is an implementation of the MatchFinder // interface that uses a simple hash table to find matches, // but the advanced parsing technique from @@ -29,13 +24,19 @@ type M4 struct { // The maximum is 8 and the default is 6. HashLen int - table [1 << ssapBits]uint32 + // TableBits is the number of bits in the hash table indexes. + // The default is 17 (128K entries). + TableBits int + + table []uint32 history []byte } func (q *M4) Reset() { - q.table = [1 << ssapBits]uint32{} + for i := range q.table { + q.table[i] = 0 + } q.history = q.history[:0] } @@ -49,6 +50,12 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { if q.HashLen == 0 { q.HashLen = 6 } + if q.TableBits == 0 { + q.TableBits = 17 + } + if len(q.table) < 1< q.MaxDistance*2 { @@ -79,21 +86,16 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { // We have found some matches, and we're far enough along that we probably // won't find overlapping matches, so we might as well emit them. if matches[1] != (absoluteMatch{}) { - if matches[1].End > matches[0].Start { - matches[1].End = matches[0].Start - } - if matches[1].End-matches[1].Start >= q.MinLength { - e.emit(matches[1]) - } + e.trim(matches[1], matches[0].Start, q.MinLength) } e.emit(matches[0]) matches = [3]absoluteMatch{} } // Now look for a match. - h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - ssapBits) - candidate := int(q.table[h&ssapMask]) - q.table[h&ssapMask] = uint32(i) + h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits) + candidate := int(q.table[h]) + q.table[h] = uint32(i) if candidate == 0 || i-candidate > q.MaxDistance || i-candidate == matches[0].Start-matches[0].Match { continue @@ -151,24 +153,14 @@ func (q *M4) FindMatches(dst []Match, src []byte) []Match { default: // Emit the first match, shortening it if necessary to avoid overlap with the second. - if matches[2].End > matches[1].Start { - matches[2].End = matches[1].Start - } - if matches[2].End-matches[2].Start >= q.MinLength { - e.emit(matches[2]) - } + e.trim(matches[2], matches[1].Start, q.MinLength) matches[2] = absoluteMatch{} } } // We've found all the matches now; emit the remaining ones. if matches[1] != (absoluteMatch{}) { - if matches[1].End > matches[0].Start { - matches[1].End = matches[0].Start - } - if matches[1].End-matches[1].Start >= q.MinLength { - e.emit(matches[1]) - } + e.trim(matches[1], matches[0].Start, q.MinLength) } if matches[0] != (absoluteMatch{}) { e.emit(matches[0])