diff --git a/glob_test.go b/glob_test.go index d6ac140..7cd52af 100644 --- a/glob_test.go +++ b/glob_test.go @@ -67,6 +67,7 @@ func TestGlob(t *testing.T) { glob(true, "", ""), glob(false, "", "b"), + glob(true, "*ä", "åä"), glob(true, "abc", "abc"), glob(true, "a*c", "abc"), glob(true, "a*c", "a12345c"), @@ -169,7 +170,7 @@ func TestQuoteMeta(t *testing.T) { }{ { in: `[foo*]`, - out: `\[foo\*]`, + out: `\[foo\*\]`, }, { in: `{foo*}`, diff --git a/lexer.go b/lexer.go index 4e704e4..9b756dc 100644 --- a/lexer.go +++ b/lexer.go @@ -3,7 +3,7 @@ package glob import ( "bytes" "fmt" - "strings" + "github.com/gobwas/glob/runes" "unicode/utf8" ) @@ -25,6 +25,7 @@ var specials = []byte{ char_single, char_escape, char_range_open, + char_range_close, char_terms_open, char_terms_close, } @@ -33,10 +34,6 @@ func special(c byte) bool { return bytes.IndexByte(specials, c) != -1 } -var eof rune = 0 - -type stateFn func(*lexer) stateFn - type itemType int const ( @@ -119,7 +116,7 @@ type item struct { } func (i item) String() string { - return fmt.Sprintf("%v<%s>", i.t, i.s) + return fmt.Sprintf("%v<%q>", i.t, i.s) } type stubLexer struct { @@ -136,375 +133,240 @@ func (s *stubLexer) nextItem() (ret item) { return } +type items []item + +func (i *items) shift() (ret item) { + ret = (*i)[0] + copy(*i, (*i)[1:]) + *i = (*i)[:len(*i)-1] + return +} + +func (i *items) push(v item) { + *i = append(*i, v) +} + +func (i *items) empty() bool { + return len(*i) == 0 +} + +var eof rune = 0 + type lexer struct { - input string - start int - pos int - width int - runes int - termScopes []int - termPhrases map[int]int - state stateFn - items chan item + data string + pos int + err error + + items items + termsLevel int + + lastRune rune + lastRuneSize int + hasRune bool } func newLexer(source string) *lexer { l := &lexer{ - input: source, - state: lexRaw, - items: make(chan item, len(source)+1), - termPhrases: make(map[int]int), + data: source, + items: items(make([]item, 0, 4)), } return l } -func (l *lexer) run() { - for state := lexRaw; state != nil; { - state = state(l) - } - close(l.items) -} - -func (l *lexer) nextItem() item { - for { - select { - case item := <-l.items: - return item - default: - if l.state == nil { - return item{t: item_eof} - } - - l.state = l.state(l) - } +func (l *lexer) peek() (r rune, w int) { + if l.pos == len(l.data) { + return eof, 0 } - panic("something went wrong") -} - -func (l *lexer) read() (r rune) { - if l.pos >= len(l.input) { - return eof + r, w = utf8.DecodeRuneInString(l.data[l.pos:]) + if r == utf8.RuneError { + l.errorf("could not read rune") + r = eof + w = 0 } - r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) - l.pos += l.width - l.runes++ - return } -func (l *lexer) unread() { - l.pos -= l.width - l.runes-- -} - -func (l *lexer) reset() { - l.pos = l.start - l.runes = 0 -} - -func (l *lexer) ignore() { - l.start = l.pos - l.runes = 0 -} - -func (l *lexer) lookahead() rune { - r := l.read() - if r != eof { - l.unread() +func (l *lexer) read() rune { + if l.hasRune { + l.hasRune = false + l.seek(l.lastRuneSize) + return l.lastRune } + + r, s := l.peek() + l.seek(s) + + l.lastRune = r + l.lastRuneSize = s + return r } -func (l *lexer) accept(valid string) bool { - if strings.IndexRune(valid, l.read()) != -1 { - return true +func (l *lexer) seek(w int) { + l.pos += w +} + +func (l *lexer) unread() { + if l.hasRune { + l.errorf("could not unread rune") + return } - l.unread() - return false + l.seek(-l.lastRuneSize) + l.hasRune = true } -func (l *lexer) acceptAll(valid string) { - for strings.IndexRune(valid, l.read()) != -1 { - } - l.unread() -} - -func (l *lexer) emitCurrent(t itemType) { - l.emit(t, l.input[l.start:l.pos]) -} - -func (l *lexer) emit(t itemType, s string) { - l.items <- item{t, s} - l.start = l.pos - l.runes = 0 - l.width = 0 -} - -func (l *lexer) errorf(format string, args ...interface{}) { - l.items <- item{item_error, fmt.Sprintf(format, args...)} +func (l *lexer) errorf(f string, v ...interface{}) { + l.err = fmt.Errorf(f, v...) } func (l *lexer) inTerms() bool { - return len(l.termScopes) > 0 + return l.termsLevel > 0 } -func lexRaw(l *lexer) stateFn { +func (l *lexer) termsEnter() { + l.termsLevel++ +} + +func (l *lexer) termsLeave() { + l.termsLevel-- +} + +func (l *lexer) nextItem() item { + if l.err != nil { + return item{item_error, l.err.Error()} + } + if !l.items.empty() { + return l.items.shift() + } + + l.fetchItem() + return l.nextItem() +} + +var inTextBreakers = []rune{char_single, char_any, char_range_open, char_terms_open} +var inTermsBreakers = append(inTextBreakers, char_terms_close, char_comma) + +func (l *lexer) fetchItem() { + r := l.read() + switch { + case r == eof: + l.items.push(item{item_eof, ""}) + + case r == char_terms_open: + l.termsEnter() + l.items.push(item{item_terms_open, string(r)}) + + case r == char_comma && l.inTerms(): + l.items.push(item{item_separator, string(r)}) + + case r == char_terms_close && l.inTerms(): + l.items.push(item{item_terms_close, string(r)}) + l.termsLeave() + + case r == char_range_open: + l.items.push(item{item_range_open, string(r)}) + l.fetchRange() + + case r == char_single: + l.items.push(item{item_single, string(r)}) + + case r == char_any: + if l.read() == char_any { + l.items.push(item{item_super, string(r) + string(r)}) + } else { + l.unread() + l.items.push(item{item_any, string(r)}) + } + + default: + l.unread() + + var breakers []rune + if l.inTerms() { + breakers = inTermsBreakers + } else { + breakers = inTextBreakers + } + l.fetchText(breakers) + } +} + +func (l *lexer) fetchRange() { + var wantHi bool + var wantClose bool + var seenNot bool for { - c := l.read() - if c == eof { + r := l.read() + if r == eof { + l.errorf("unexpected end of input") + return + } + + if wantClose { + if r != char_range_close { + l.errorf("expected close range character") + } else { + l.items.push(item{item_range_close, string(r)}) + } + return + } + + if wantHi { + l.items.push(item{item_range_hi, string(r)}) + wantClose = true + continue + } + + if !seenNot && r == char_range_not { + l.items.push(item{item_not, string(r)}) + seenNot = true + continue + } + + if n, w := l.peek(); n == char_range_between { + l.seek(w) + l.items.push(item{item_range_lo, string(r)}) + l.items.push(item{item_range_between, string(n)}) + wantHi = true + continue + } + + l.unread() // unread first peek and fetch as text + l.fetchText([]rune{char_range_close}) + wantClose = true + } +} + +func (l *lexer) fetchText(breakers []rune) { + var data []rune + var escaped bool + +reading: + for { + r := l.read() + if r == eof { break } - switch c { - case char_single: - l.unread() - return lexSingle - - case char_any: - var n stateFn - if l.lookahead() == char_any { - n = lexSuper - } else { - n = lexAny - } - - l.unread() - return n - - case char_range_open: - l.unread() - return lexRangeOpen - - case char_terms_open: - l.unread() - return lexTermsOpen - - case char_terms_close: - if l.inTerms() { // if we are in terms - l.unread() - return lexTermsClose - } - - case char_comma: - if l.inTerms() { // if we are in terms - l.unread() - return lexSeparator - } - } - - l.unread() - return lexText - } - - if l.pos > l.start { - l.emitCurrent(item_text) - } - - if len(l.termScopes) != 0 { - l.errorf("invalid pattern syntax: unclosed terms") - return nil - } - - l.emitCurrent(item_eof) - - return nil -} - -func lexText(l *lexer) stateFn { - var escaped bool - var data []rune - -scan: - for c := l.read(); c != eof; c = l.read() { - switch { - case c == char_escape: - escaped = true - continue - - case !escaped && c == char_comma && l.inTerms(): - l.unread() - break scan - - case !escaped && utf8.RuneLen(c) == 1 && special(byte(c)): - l.unread() - break scan - - default: - data = append(data, c) - } - - escaped = false - } - - if len(data) > 0 { - l.emit(item_text, string(data)) - } - - return lexRaw -} - -func lexInsideRange(l *lexer) stateFn { - for { - c := l.read() - if c == eof { - l.errorf("unclosed range construction") - return nil - } - - switch c { - case char_range_not: - // only first char makes sense - if l.pos-l.width == l.start { - l.emitCurrent(item_not) - } - - case char_range_between: - if l.runes != 2 { - l.errorf("unexpected length of lo char inside range") - return nil - } - - l.reset() - return lexRangeHiLo - - case char_range_close: - if l.runes == 1 { - l.errorf("range should contain at least single char") - return nil - } - - l.unread() - l.emitCurrent(item_text) - return lexRangeClose - } - } -} - -func lexRangeHiLo(l *lexer) stateFn { - start := l.start - - for { - c := l.read() - if c == eof { - l.errorf("unexpected end of input") - return nil - } - - switch c { - case char_range_between: - if l.runes != 1 { - l.errorf("unexpected length of range: single character expected before minus") - return nil - } - - l.emitCurrent(item_range_between) - - case char_range_close: - l.unread() - - if l.runes != 1 { - l.errorf("unexpected length of range: single character expected before close") - return nil - } - - l.emitCurrent(item_range_hi) - return lexRangeClose - - default: - if start != l.start { + if !escaped { + if r == char_escape { + escaped = true continue } - if l.runes != 1 { - l.errorf("unexpected length of range: single character expected at the begining") - return nil + if runes.IndexRune(breakers, r) != -1 { + l.unread() + break reading } - - l.emitCurrent(item_range_lo) } - } -} -func lexAny(l *lexer) stateFn { - l.pos += 1 - l.emitCurrent(item_any) - return lexRaw -} - -func lexSuper(l *lexer) stateFn { - l.pos += 2 - l.emitCurrent(item_super) - return lexRaw -} - -func lexSingle(l *lexer) stateFn { - l.pos += 1 - l.emitCurrent(item_single) - return lexRaw -} - -func lexSeparator(l *lexer) stateFn { - posOpen := l.termScopes[len(l.termScopes)-1] - - if l.pos-posOpen == 1 { - l.errorf("syntax error: empty term before separator") - return nil + escaped = false + data = append(data, r) } - l.termPhrases[posOpen] += 1 - l.pos += 1 - l.emitCurrent(item_separator) - return lexRaw -} - -func lexTermsOpen(l *lexer) stateFn { - l.termScopes = append(l.termScopes, l.pos) - l.pos += 1 - l.emitCurrent(item_terms_open) - - return lexRaw -} - -func lexTermsClose(l *lexer) stateFn { - if len(l.termScopes) == 0 { - l.errorf("unexpected closing of terms: there is no opened terms") - return nil + if len(data) > 0 { + l.items.push(item{item_text, string(data)}) } - - lastOpen := len(l.termScopes) - 1 - posOpen := l.termScopes[lastOpen] - - // if it is empty term - if posOpen == l.pos-1 { - l.errorf("term could not be empty") - return nil - } - - if l.termPhrases[posOpen] == 0 { - l.errorf("term must contain >1 phrases") - return nil - } - - // cleanup - l.termScopes = l.termScopes[:lastOpen] - delete(l.termPhrases, posOpen) - - l.pos += 1 - l.emitCurrent(item_terms_close) - - return lexRaw -} - -func lexRangeOpen(l *lexer) stateFn { - l.pos += 1 - l.emitCurrent(item_range_open) - return lexInsideRange -} - -func lexRangeClose(l *lexer) stateFn { - l.pos += 1 - l.emitCurrent(item_range_close) - return lexRaw }