Merge pull request #13 from gobwas/newlexer

reimplement lexer
2016-05-31 11:53:28 +03:00 · 2016-05-31 11:53:28 +03:00 · 0b50f259da
parent 49571a1557 70f1304bc3
commit 0b50f259da
2 changed files with 200 additions and 337 deletions
--- a/glob_test.go
+++ b/glob_test.go
@ -67,6 +67,7 @@ func TestGlob(t *testing.T) {
 		glob(true, "", ""),
 		glob(false, "", "b"),

+		glob(true, "*ä", "åä"),
 		glob(true, "abc", "abc"),
 		glob(true, "a*c", "abc"),
 		glob(true, "a*c", "a12345c"),
@ -169,7 +170,7 @@ func TestQuoteMeta(t *testing.T) {
 	}{
 		{
 			in:  `[foo*]`,
-			out: `\[foo\*]`,
+			out: `\[foo\*\]`,
 		},
 		{
 			in:  `{foo*}`,
--- a/lexer.go
+++ b/lexer.go
@ -3,7 +3,7 @@ package glob
 import (
 	"bytes"
 	"fmt"
-	"strings"
+	"github.com/gobwas/glob/runes"
 	"unicode/utf8"
 )

@ -25,6 +25,7 @@ var specials = []byte{
 	char_single,
 	char_escape,
 	char_range_open,
+	char_range_close,
 	char_terms_open,
 	char_terms_close,
 }
@ -33,10 +34,6 @@ func special(c byte) bool {
 	return bytes.IndexByte(specials, c) != -1
 }

-var eof rune = 0
-
-type stateFn func(*lexer) stateFn
-
 type itemType int

 const (
@ -119,7 +116,7 @@ type item struct {
 }

 func (i item) String() string {
-	return fmt.Sprintf("%v<%s>", i.t, i.s)
+	return fmt.Sprintf("%v<%q>", i.t, i.s)
 }

 type stubLexer struct {
@ -136,375 +133,240 @@ func (s *stubLexer) nextItem() (ret item) {
 	return
 }

+type items []item
+
+func (i *items) shift() (ret item) {
+	ret = (*i)[0]
+	copy(*i, (*i)[1:])
+	*i = (*i)[:len(*i)-1]
+	return
+}
+
+func (i *items) push(v item) {
+	*i = append(*i, v)
+}
+
+func (i *items) empty() bool {
+	return len(*i) == 0
+}
+
+var eof rune = 0
+
 type lexer struct {
-	input       string
-	start       int
-	pos         int
-	width       int
-	runes       int
-	termScopes  []int
-	termPhrases map[int]int
-	state       stateFn
-	items       chan item
+	data string
+	pos  int
+	err  error
+
+	items      items
+	termsLevel int
+
+	lastRune     rune
+	lastRuneSize int
+	hasRune      bool
 }

 func newLexer(source string) *lexer {
 	l := &lexer{
-		input:       source,
-		state:       lexRaw,
-		items:       make(chan item, len(source)+1),
-		termPhrases: make(map[int]int),
+		data:  source,
+		items: items(make([]item, 0, 4)),
 	}
 	return l
 }

-func (l *lexer) run() {
-	for state := lexRaw; state != nil; {
-		state = state(l)
-	}
-	close(l.items)
-}
-
-func (l *lexer) nextItem() item {
-	for {
-		select {
-		case item := <-l.items:
-			return item
-		default:
-			if l.state == nil {
-				return item{t: item_eof}
-			}
-
-			l.state = l.state(l)
-		}
+func (l *lexer) peek() (r rune, w int) {
+	if l.pos == len(l.data) {
+		return eof, 0
 	}

-	panic("something went wrong")
-}
-
-func (l *lexer) read() (r rune) {
-	if l.pos >= len(l.input) {
-		return eof
+	r, w = utf8.DecodeRuneInString(l.data[l.pos:])
+	if r == utf8.RuneError {
+		l.errorf("could not read rune")
+		r = eof
+		w = 0
 	}

-	r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
-	l.pos += l.width
-	l.runes++
-
 	return
 }

-func (l *lexer) unread() {
-	l.pos -= l.width
-	l.runes--
-}
-
-func (l *lexer) reset() {
-	l.pos = l.start
-	l.runes = 0
-}
-
-func (l *lexer) ignore() {
-	l.start = l.pos
-	l.runes = 0
-}
-
-func (l *lexer) lookahead() rune {
-	r := l.read()
-	if r != eof {
-		l.unread()
+func (l *lexer) read() rune {
+	if l.hasRune {
+		l.hasRune = false
+		l.seek(l.lastRuneSize)
+		return l.lastRune
 	}
+
+	r, s := l.peek()
+	l.seek(s)
+
+	l.lastRune = r
+	l.lastRuneSize = s
+
 	return r
 }

-func (l *lexer) accept(valid string) bool {
-	if strings.IndexRune(valid, l.read()) != -1 {
-		return true
+func (l *lexer) seek(w int) {
+	l.pos += w
+}
+
+func (l *lexer) unread() {
+	if l.hasRune {
+		l.errorf("could not unread rune")
+		return
 	}
-	l.unread()
-	return false
+	l.seek(-l.lastRuneSize)
+	l.hasRune = true
 }

-func (l *lexer) acceptAll(valid string) {
-	for strings.IndexRune(valid, l.read()) != -1 {
-	}
-	l.unread()
-}
-
-func (l *lexer) emitCurrent(t itemType) {
-	l.emit(t, l.input[l.start:l.pos])
-}
-
-func (l *lexer) emit(t itemType, s string) {
-	l.items <- item{t, s}
-	l.start = l.pos
-	l.runes = 0
-	l.width = 0
-}
-
-func (l *lexer) errorf(format string, args ...interface{}) {
-	l.items <- item{item_error, fmt.Sprintf(format, args...)}
+func (l *lexer) errorf(f string, v ...interface{}) {
+	l.err = fmt.Errorf(f, v...)
 }

 func (l *lexer) inTerms() bool {
-	return len(l.termScopes) > 0
+	return l.termsLevel > 0
 }

-func lexRaw(l *lexer) stateFn {
+func (l *lexer) termsEnter() {
+	l.termsLevel++
+}
+
+func (l *lexer) termsLeave() {
+	l.termsLevel--
+}
+
+func (l *lexer) nextItem() item {
+	if l.err != nil {
+		return item{item_error, l.err.Error()}
+	}
+	if !l.items.empty() {
+		return l.items.shift()
+	}
+
+	l.fetchItem()
+	return l.nextItem()
+}
+
+var inTextBreakers = []rune{char_single, char_any, char_range_open, char_terms_open}
+var inTermsBreakers = append(inTextBreakers, char_terms_close, char_comma)
+
+func (l *lexer) fetchItem() {
+	r := l.read()
+	switch {
+	case r == eof:
+		l.items.push(item{item_eof, ""})
+
+	case r == char_terms_open:
+		l.termsEnter()
+		l.items.push(item{item_terms_open, string(r)})
+
+	case r == char_comma && l.inTerms():
+		l.items.push(item{item_separator, string(r)})
+
+	case r == char_terms_close && l.inTerms():
+		l.items.push(item{item_terms_close, string(r)})
+		l.termsLeave()
+
+	case r == char_range_open:
+		l.items.push(item{item_range_open, string(r)})
+		l.fetchRange()
+
+	case r == char_single:
+		l.items.push(item{item_single, string(r)})
+
+	case r == char_any:
+		if l.read() == char_any {
+			l.items.push(item{item_super, string(r) + string(r)})
+		} else {
+			l.unread()
+			l.items.push(item{item_any, string(r)})
+		}
+
+	default:
+		l.unread()
+
+		var breakers []rune
+		if l.inTerms() {
+			breakers = inTermsBreakers
+		} else {
+			breakers = inTextBreakers
+		}
+		l.fetchText(breakers)
+	}
+}
+
+func (l *lexer) fetchRange() {
+	var wantHi bool
+	var wantClose bool
+	var seenNot bool
 	for {
-		c := l.read()
-		if c == eof {
+		r := l.read()
+		if r == eof {
+			l.errorf("unexpected end of input")
+			return
+		}
+
+		if wantClose {
+			if r != char_range_close {
+				l.errorf("expected close range character")
+			} else {
+				l.items.push(item{item_range_close, string(r)})
+			}
+			return
+		}
+
+		if wantHi {
+			l.items.push(item{item_range_hi, string(r)})
+			wantClose = true
+			continue
+		}
+
+		if !seenNot && r == char_range_not {
+			l.items.push(item{item_not, string(r)})
+			seenNot = true
+			continue
+		}
+
+		if n, w := l.peek(); n == char_range_between {
+			l.seek(w)
+			l.items.push(item{item_range_lo, string(r)})
+			l.items.push(item{item_range_between, string(n)})
+			wantHi = true
+			continue
+		}
+
+		l.unread() // unread first peek and fetch as text
+		l.fetchText([]rune{char_range_close})
+		wantClose = true
+	}
+}
+
+func (l *lexer) fetchText(breakers []rune) {
+	var data []rune
+	var escaped bool
+
+reading:
+	for {
+		r := l.read()
+		if r == eof {
 			break
 		}

-		switch c {
-		case char_single:
-			l.unread()
-			return lexSingle
-
-		case char_any:
-			var n stateFn
-			if l.lookahead() == char_any {
-				n = lexSuper
-			} else {
-				n = lexAny
-			}
-
-			l.unread()
-			return n
-
-		case char_range_open:
-			l.unread()
-			return lexRangeOpen
-
-		case char_terms_open:
-			l.unread()
-			return lexTermsOpen
-
-		case char_terms_close:
-			if l.inTerms() { // if we are in terms
-				l.unread()
-				return lexTermsClose
-			}
-
-		case char_comma:
-			if l.inTerms() { // if we are in terms
-				l.unread()
-				return lexSeparator
-			}
-		}
-
-		l.unread()
-		return lexText
-	}
-
-	if l.pos > l.start {
-		l.emitCurrent(item_text)
-	}
-
-	if len(l.termScopes) != 0 {
-		l.errorf("invalid pattern syntax: unclosed terms")
-		return nil
-	}
-
-	l.emitCurrent(item_eof)
-
-	return nil
-}
-
-func lexText(l *lexer) stateFn {
-	var escaped bool
-	var data []rune
-
-scan:
-	for c := l.read(); c != eof; c = l.read() {
-		switch {
-		case c == char_escape:
-			escaped = true
-			continue
-
-		case !escaped && c == char_comma && l.inTerms():
-			l.unread()
-			break scan
-
-		case !escaped && utf8.RuneLen(c) == 1 && special(byte(c)):
-			l.unread()
-			break scan
-
-		default:
-			data = append(data, c)
-		}
-
-		escaped = false
-	}
-
-	if len(data) > 0 {
-		l.emit(item_text, string(data))
-	}
-
-	return lexRaw
-}
-
-func lexInsideRange(l *lexer) stateFn {
-	for {
-		c := l.read()
-		if c == eof {
-			l.errorf("unclosed range construction")
-			return nil
-		}
-
-		switch c {
-		case char_range_not:
-			// only first char makes sense
-			if l.pos-l.width == l.start {
-				l.emitCurrent(item_not)
-			}
-
-		case char_range_between:
-			if l.runes != 2 {
-				l.errorf("unexpected length of lo char inside range")
-				return nil
-			}
-
-			l.reset()
-			return lexRangeHiLo
-
-		case char_range_close:
-			if l.runes == 1 {
-				l.errorf("range should contain at least single char")
-				return nil
-			}
-
-			l.unread()
-			l.emitCurrent(item_text)
-			return lexRangeClose
-		}
-	}
-}
-
-func lexRangeHiLo(l *lexer) stateFn {
-	start := l.start
-
-	for {
-		c := l.read()
-		if c == eof {
-			l.errorf("unexpected end of input")
-			return nil
-		}
-
-		switch c {
-		case char_range_between:
-			if l.runes != 1 {
-				l.errorf("unexpected length of range: single character expected before minus")
-				return nil
-			}
-
-			l.emitCurrent(item_range_between)
-
-		case char_range_close:
-			l.unread()
-
-			if l.runes != 1 {
-				l.errorf("unexpected length of range: single character expected before close")
-				return nil
-			}
-
-			l.emitCurrent(item_range_hi)
-			return lexRangeClose
-
-		default:
-			if start != l.start {
+		if !escaped {
+			if r == char_escape {
+				escaped = true
 				continue
 			}

-			if l.runes != 1 {
-				l.errorf("unexpected length of range: single character expected at the begining")
-				return nil
+			if runes.IndexRune(breakers, r) != -1 {
+				l.unread()
+				break reading
 			}
-
-			l.emitCurrent(item_range_lo)
 		}
-	}
-}

-func lexAny(l *lexer) stateFn {
-	l.pos += 1
-	l.emitCurrent(item_any)
-	return lexRaw
-}
-
-func lexSuper(l *lexer) stateFn {
-	l.pos += 2
-	l.emitCurrent(item_super)
-	return lexRaw
-}
-
-func lexSingle(l *lexer) stateFn {
-	l.pos += 1
-	l.emitCurrent(item_single)
-	return lexRaw
-}
-
-func lexSeparator(l *lexer) stateFn {
-	posOpen := l.termScopes[len(l.termScopes)-1]
-
-	if l.pos-posOpen == 1 {
-		l.errorf("syntax error: empty term before separator")
-		return nil
+		escaped = false
+		data = append(data, r)
 	}

-	l.termPhrases[posOpen] += 1
-	l.pos += 1
-	l.emitCurrent(item_separator)
-	return lexRaw
-}
-
-func lexTermsOpen(l *lexer) stateFn {
-	l.termScopes = append(l.termScopes, l.pos)
-	l.pos += 1
-	l.emitCurrent(item_terms_open)
-
-	return lexRaw
-}
-
-func lexTermsClose(l *lexer) stateFn {
-	if len(l.termScopes) == 0 {
-		l.errorf("unexpected closing of terms: there is no opened terms")
-		return nil
+	if len(data) > 0 {
+		l.items.push(item{item_text, string(data)})
 	}
-
-	lastOpen := len(l.termScopes) - 1
-	posOpen := l.termScopes[lastOpen]
-
-	// if it is empty term
-	if posOpen == l.pos-1 {
-		l.errorf("term could not be empty")
-		return nil
-	}
-
-	if l.termPhrases[posOpen] == 0 {
-		l.errorf("term must contain >1 phrases")
-		return nil
-	}
-
-	// cleanup
-	l.termScopes = l.termScopes[:lastOpen]
-	delete(l.termPhrases, posOpen)
-
-	l.pos += 1
-	l.emitCurrent(item_terms_close)
-
-	return lexRaw
-}
-
-func lexRangeOpen(l *lexer) stateFn {
-	l.pos += 1
-	l.emitCurrent(item_range_open)
-	return lexInsideRange
-}
-
-func lexRangeClose(l *lexer) stateFn {
-	l.pos += 1
-	l.emitCurrent(item_range_close)
-	return lexRaw
 }