Correctly handle non-ASCII runes in patterns (fixes #54)

When matching a row, we calculate an index into the string, and this
index was in runes. However, when slicing a string, Go uses byte
indexes. This change tracks both, using the rune count to determine the
correct length and the byte index to slice the string.
This commit is contained in:
Jakob Borg 2022-06-15 10:05:05 +02:00
parent e7a84e9525
commit 1d823af501
2 changed files with 19 additions and 6 deletions

View File

@ -162,6 +162,16 @@ func TestGlob(t *testing.T) {
glob(true, pattern_prefix_suffix, fixture_prefix_suffix_match),
glob(false, pattern_prefix_suffix, fixture_prefix_suffix_mismatch),
glob(true, "155ö", "155ö"),
glob(true, "1?5ö", "155ö"), // <-
glob(true, "1?ö5", "15ö5"),
glob(true, "155helloö", "155helloö"),
glob(true, "1?5helloö", "155helloö"), // <-
glob(true, "1?ö5hello", "15ö5hello"),
glob(true, "1?5heöllo", "155heöllo"),
glob(true, "1ö?5", "1ö55"), // <-
glob(true, "ö1?5", "ö155"),
} {
t.Run("", func(t *testing.T) {
g := MustCompile(test.pattern, test.delimiters...)

View File

@ -2,6 +2,7 @@ package match
import (
"fmt"
"unicode/utf8"
)
type Row struct {
@ -23,19 +24,21 @@ func (self Row) matchAll(s string) bool {
for _, m := range self.Matchers {
length := m.Len()
var next, i int
for next = range s[idx:] {
i++
if i == length {
var runeCount, byteIdx int
var r rune
for _, r = range s[idx:] {
runeCount++
byteIdx += utf8.RuneLen(r)
if runeCount == length {
break
}
}
if i < length || !m.Match(s[idx:idx+next+1]) {
if runeCount < length || !m.Match(s[idx:idx+byteIdx]) {
return false
}
idx += next + 1
idx += byteIdx
}
return true