2023-12-29 03:09:32 +03:00
|
|
|
package matchfinder
|
|
|
|
|
|
|
|
import (
|
|
|
|
"encoding/binary"
|
|
|
|
"math/bits"
|
|
|
|
"runtime"
|
|
|
|
)
|
|
|
|
|
|
|
|
// M4 is a MatchFinder implementation built around a hash table of recent
// positions. It can optionally follow "match chains" of older positions
// that share a hash, and it picks between overlapping matches using the
// lazy-parsing approach described at
// https://fastcompression.blogspot.com/2011/12/advanced-parsing-strategies.html.
//
// Fields left at zero are given their defaults on the first call to
// FindMatches.
type M4 struct {
	// MaxDistance limits how far back (in bytes) a match may start.
	// Zero selects the default of 65535.
	MaxDistance int

	// MinLength is the shortest match length that will be returned.
	// Zero selects the default of 4.
	MinLength int

	// HashLen is how many bytes at each position are fed into the hash.
	// It may be at most 8; zero selects the default of 6.
	HashLen int

	// TableBits is the width, in bits, of a hash table index, so the
	// table has 1<<TableBits entries. Zero selects the default of 17
	// (128K entries).
	TableBits int

	// ChainLength is the number of older positions with the same hash
	// (the "match chain") that are examined in addition to the most
	// recent one. With the zero value no chain is kept or searched.
	ChainLength int

	// DistanceBitCost tunes how two candidate matches are compared.
	// Length always counts for 256 points per byte; on top of that a
	// match earns DistanceBitCost points for every bit its distance
	// does not need. Since a byte is worth 256, a cost of 32 (256/8)
	// per bit is a sensible starting point. The default of 0 compares
	// matches by length alone.
	DistanceBitCost int

	// table maps a hash value to the most recent history position that
	// produced it; 0 means "no entry".
	table []uint32

	// chain holds, for each history position, the distance back to the
	// previous position with the same hash (0 if none was recorded).
	chain []uint16

	// history is the data seen so far that matches may refer back to.
	history []byte
}
|
|
|
|
|
|
|
|
func (q *M4) Reset() {
|
2023-12-29 04:21:34 +03:00
|
|
|
for i := range q.table {
|
|
|
|
q.table[i] = 0
|
|
|
|
}
|
2023-12-29 03:09:32 +03:00
|
|
|
q.history = q.history[:0]
|
2024-01-02 03:13:22 +03:00
|
|
|
q.chain = q.chain[:0]
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
|
2024-01-09 16:40:40 +03:00
|
|
|
func (q *M4) score(m absoluteMatch) int {
|
|
|
|
return (m.End-m.Start)*256 + bits.LeadingZeros32(uint32(m.Start-m.Match))*q.DistanceBitCost
|
|
|
|
}
|
|
|
|
|
2023-12-29 03:09:32 +03:00
|
|
|
func (q *M4) FindMatches(dst []Match, src []byte) []Match {
|
|
|
|
if q.MaxDistance == 0 {
|
|
|
|
q.MaxDistance = 65535
|
|
|
|
}
|
|
|
|
if q.MinLength == 0 {
|
|
|
|
q.MinLength = 4
|
|
|
|
}
|
|
|
|
if q.HashLen == 0 {
|
|
|
|
q.HashLen = 6
|
|
|
|
}
|
2023-12-29 04:21:34 +03:00
|
|
|
if q.TableBits == 0 {
|
|
|
|
q.TableBits = 17
|
|
|
|
}
|
|
|
|
if len(q.table) < 1<<q.TableBits {
|
|
|
|
q.table = make([]uint32, 1<<q.TableBits)
|
|
|
|
}
|
2024-01-03 00:38:12 +03:00
|
|
|
|
2023-12-29 04:01:08 +03:00
|
|
|
e := matchEmitter{Dst: dst}
|
2023-12-29 03:09:32 +03:00
|
|
|
|
|
|
|
if len(q.history) > q.MaxDistance*2 {
|
|
|
|
// Trim down the history buffer.
|
|
|
|
delta := len(q.history) - q.MaxDistance
|
|
|
|
copy(q.history, q.history[delta:])
|
|
|
|
q.history = q.history[:q.MaxDistance]
|
2024-01-02 03:13:22 +03:00
|
|
|
if q.ChainLength > 0 {
|
|
|
|
q.chain = q.chain[:q.MaxDistance]
|
|
|
|
}
|
2023-12-29 03:09:32 +03:00
|
|
|
|
|
|
|
for i, v := range q.table {
|
|
|
|
newV := int(v) - delta
|
|
|
|
if newV < 0 {
|
|
|
|
newV = 0
|
|
|
|
}
|
|
|
|
q.table[i] = uint32(newV)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Append src to the history buffer.
|
2023-12-29 04:01:08 +03:00
|
|
|
e.NextEmit = len(q.history)
|
2023-12-29 03:09:32 +03:00
|
|
|
q.history = append(q.history, src...)
|
2024-01-02 03:13:22 +03:00
|
|
|
if q.ChainLength > 0 {
|
|
|
|
q.chain = append(q.chain, make([]uint16, len(src))...)
|
|
|
|
}
|
2023-12-29 03:09:32 +03:00
|
|
|
src = q.history
|
|
|
|
|
|
|
|
// matches stores the matches that have been found but not emitted,
|
|
|
|
// in reverse order. (matches[0] is the most recent one.)
|
2024-01-09 16:40:40 +03:00
|
|
|
var matches [3]absoluteMatch
|
2023-12-29 04:01:08 +03:00
|
|
|
for i := e.NextEmit; i < len(src)-7; i++ {
|
2024-01-09 16:40:40 +03:00
|
|
|
if matches[0] != (absoluteMatch{}) && i >= matches[0].End {
|
2023-12-29 03:09:32 +03:00
|
|
|
// We have found some matches, and we're far enough along that we probably
|
|
|
|
// won't find overlapping matches, so we might as well emit them.
|
2024-01-09 16:40:40 +03:00
|
|
|
if matches[1] != (absoluteMatch{}) {
|
2023-12-29 04:21:34 +03:00
|
|
|
e.trim(matches[1], matches[0].Start, q.MinLength)
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
2023-12-29 04:01:08 +03:00
|
|
|
e.emit(matches[0])
|
2024-01-09 16:40:40 +03:00
|
|
|
matches = [3]absoluteMatch{}
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
|
2024-01-02 03:13:22 +03:00
|
|
|
// Calculate and store the hash.
|
2023-12-29 04:21:34 +03:00
|
|
|
h := ((binary.LittleEndian.Uint64(src[i:]) & (1<<(8*q.HashLen) - 1)) * hashMul64) >> (64 - q.TableBits)
|
|
|
|
candidate := int(q.table[h])
|
|
|
|
q.table[h] = uint32(i)
|
2024-01-02 03:13:22 +03:00
|
|
|
if q.ChainLength > 0 && candidate != 0 {
|
|
|
|
delta := i - candidate
|
|
|
|
if delta < 1<<16 {
|
|
|
|
q.chain[i] = uint16(delta)
|
|
|
|
}
|
2023-12-31 02:56:13 +03:00
|
|
|
}
|
|
|
|
|
2024-01-02 03:13:22 +03:00
|
|
|
if i < matches[0].End && i != matches[0].End+2-q.HashLen {
|
2023-12-29 03:09:32 +03:00
|
|
|
continue
|
|
|
|
}
|
2024-01-02 03:13:22 +03:00
|
|
|
if candidate == 0 || i-candidate > q.MaxDistance {
|
2023-12-29 03:09:32 +03:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-01-02 03:13:22 +03:00
|
|
|
// Look for a match.
|
2024-01-09 16:40:40 +03:00
|
|
|
var currentMatch absoluteMatch
|
2024-01-02 03:13:22 +03:00
|
|
|
|
|
|
|
if i-candidate != matches[0].Start-matches[0].Match {
|
|
|
|
if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
|
|
|
|
m := extendMatch2(src, i, candidate, e.NextEmit)
|
|
|
|
if m.End-m.Start > q.MinLength {
|
|
|
|
currentMatch = m
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for j := 0; j < q.ChainLength; j++ {
|
|
|
|
delta := q.chain[candidate]
|
|
|
|
if delta == 0 {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
candidate -= int(delta)
|
|
|
|
if candidate <= 0 || i-candidate > q.MaxDistance {
|
|
|
|
break
|
|
|
|
}
|
|
|
|
if i-candidate != matches[0].Start-matches[0].Match {
|
|
|
|
if binary.LittleEndian.Uint32(src[candidate:]) == binary.LittleEndian.Uint32(src[i:]) {
|
|
|
|
m := extendMatch2(src, i, candidate, e.NextEmit)
|
2024-01-09 16:40:40 +03:00
|
|
|
if m.End-m.Start > q.MinLength && q.score(m) > q.score(currentMatch) {
|
2024-01-02 03:13:22 +03:00
|
|
|
currentMatch = m
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2024-01-09 17:03:56 +03:00
|
|
|
if currentMatch.End-currentMatch.Start < q.MinLength {
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
overlapPenalty := 0
|
|
|
|
if matches[0] != (absoluteMatch{}) {
|
|
|
|
overlapPenalty = 275
|
|
|
|
if currentMatch.Start <= matches[1].End {
|
|
|
|
// This match would completely replace the previous match,
|
|
|
|
// so there is no penalty for overlap.
|
|
|
|
overlapPenalty = 0
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if q.score(currentMatch) <= q.score(matches[0])+overlapPenalty {
|
2023-12-29 03:09:32 +03:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
2024-01-09 16:40:40 +03:00
|
|
|
matches = [3]absoluteMatch{
|
2024-01-02 03:13:22 +03:00
|
|
|
currentMatch,
|
2023-12-29 03:09:32 +03:00
|
|
|
matches[0],
|
|
|
|
matches[1],
|
|
|
|
}
|
|
|
|
|
2024-01-09 16:40:40 +03:00
|
|
|
if matches[2] == (absoluteMatch{}) {
|
2023-12-29 03:09:32 +03:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
// We have three matches, so it's time to emit one and/or eliminate one.
|
|
|
|
switch {
|
|
|
|
case matches[0].Start < matches[2].End:
|
|
|
|
// The first and third matches overlap; discard the one in between.
|
2024-01-09 16:40:40 +03:00
|
|
|
matches = [3]absoluteMatch{
|
2023-12-29 03:09:32 +03:00
|
|
|
matches[0],
|
|
|
|
matches[2],
|
2024-01-09 16:40:40 +03:00
|
|
|
absoluteMatch{},
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
case matches[0].Start < matches[2].End+q.MinLength:
|
|
|
|
// The first and third matches don't overlap, but there's no room for
|
|
|
|
// another match between them. Emit the first match and discard the second.
|
2023-12-29 04:01:08 +03:00
|
|
|
e.emit(matches[2])
|
2024-01-09 16:40:40 +03:00
|
|
|
matches = [3]absoluteMatch{
|
2023-12-29 03:09:32 +03:00
|
|
|
matches[0],
|
2024-01-09 16:40:40 +03:00
|
|
|
absoluteMatch{},
|
|
|
|
absoluteMatch{},
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
|
|
|
|
default:
|
|
|
|
// Emit the first match, shortening it if necessary to avoid overlap with the second.
|
2023-12-29 04:21:34 +03:00
|
|
|
e.trim(matches[2], matches[1].Start, q.MinLength)
|
2024-01-09 16:40:40 +03:00
|
|
|
matches[2] = absoluteMatch{}
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// We've found all the matches now; emit the remaining ones.
|
2024-01-09 16:40:40 +03:00
|
|
|
if matches[1] != (absoluteMatch{}) {
|
2023-12-29 04:21:34 +03:00
|
|
|
e.trim(matches[1], matches[0].Start, q.MinLength)
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
2024-01-09 16:40:40 +03:00
|
|
|
if matches[0] != (absoluteMatch{}) {
|
2023-12-29 04:01:08 +03:00
|
|
|
e.emit(matches[0])
|
2023-12-29 03:09:32 +03:00
|
|
|
}
|
|
|
|
|
2023-12-29 04:01:08 +03:00
|
|
|
dst = e.Dst
|
|
|
|
if e.NextEmit < len(src) {
|
2023-12-29 03:09:32 +03:00
|
|
|
dst = append(dst, Match{
|
2023-12-29 04:01:08 +03:00
|
|
|
Unmatched: len(src) - e.NextEmit,
|
2023-12-29 03:09:32 +03:00
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
return dst
|
|
|
|
}
|
|
|
|
|
|
|
|
// hashMul64 is the 64-bit multiplier used by FindMatches when hashing the
// bytes at a position: the masked input is multiplied by this constant and
// the top TableBits bits of the product become the hash table index.
const hashMul64 = 0x1E35A7BD1E35A7BD
|
|
|
|
|
|
|
|
// extendMatch compares the bytes starting at src[i] with the bytes starting
// at src[j] and returns the largest k such that k <= len(src) and
// src[i:i+k-j] equals src[j:k]. In other words, k is the position just past
// the end of the common run when measured from j.
//
// The caller must guarantee:
//
//	0 <= i && i < j && j <= len(src)
func extendMatch(src []byte, i, j int) int {
	n := len(src)

	if runtime.GOARCH == "amd64" {
		// Compare a 64-bit word at a time while more than 8 bytes remain
		// after j. The words are read little-endian, so the lowest set
		// bit of the XOR belongs to the first differing byte; dividing
		// that bit index by 8 gives the byte offset.
		for ; j+8 < n; i, j = i+8, j+8 {
			diff := binary.LittleEndian.Uint64(src[i:]) ^ binary.LittleEndian.Uint64(src[j:])
			if diff != 0 {
				return j + bits.TrailingZeros64(diff)/8
			}
		}
	} else if runtime.GOARCH == "386" {
		// Same idea with 32-bit words on a 32-bit CPU.
		for ; j+4 < n; i, j = i+4, j+4 {
			diff := binary.LittleEndian.Uint32(src[i:]) ^ binary.LittleEndian.Uint32(src[j:])
			if diff != 0 {
				return j + bits.TrailingZeros32(diff)/8
			}
		}
	}

	// Finish (or, on other architectures, do everything) one byte at a time.
	for j < n && src[i] == src[j] {
		i++
		j++
	}
	return j
}
|
2023-12-31 03:25:51 +03:00
|
|
|
|
|
|
|
// Given a 4-byte match at src[start] and src[candidate], extendMatch2 extends it
|
|
|
|
// upward as far as possible, and downward no farther than to min.
|
2024-01-09 16:40:40 +03:00
|
|
|
func extendMatch2(src []byte, start, candidate, min int) absoluteMatch {
|
2023-12-31 03:25:51 +03:00
|
|
|
end := extendMatch(src, candidate+4, start+4)
|
|
|
|
for start > min && candidate > 0 && src[start-1] == src[candidate-1] {
|
|
|
|
start--
|
|
|
|
candidate--
|
|
|
|
}
|
2024-01-09 16:40:40 +03:00
|
|
|
return absoluteMatch{
|
2023-12-31 03:25:51 +03:00
|
|
|
Start: start,
|
|
|
|
End: end,
|
|
|
|
Match: candidate,
|
|
|
|
}
|
|
|
|
}
|