From d400ac872c3681e06479b945048142087617be7c Mon Sep 17 00:00:00 2001 From: "s.kamardin" Date: Fri, 25 Dec 2015 19:40:36 +0300 Subject: [PATCH] lexer tests --- glob.go | 34 ++++++----- lexer.go | 142 +++++++++++++++++++++++++++++----------------- match/multiple.go | 15 +++-- parser.go | 127 +++++++++++++++++++++++++++++++++++++++++ parser_test.go | 13 +++++ 5 files changed, 254 insertions(+), 77 deletions(-) create mode 100644 parser.go create mode 100644 parser_test.go diff --git a/glob.go b/glob.go index 5995597..0dd2859 100644 --- a/glob.go +++ b/glob.go @@ -1,21 +1,21 @@ package glob import ( - "strings" - "github.com/gobwas/glob/match" "fmt" + "github.com/gobwas/glob/match" + "strings" ) const ( any = '*' - single = '?' + single = '?' escape = '\\' range_open = '[' range_close = ']' ) const ( - inside_range_not = '!' + inside_range_not = '!' inside_range_minus = '-' ) @@ -67,7 +67,6 @@ func New(pattern string, separators ...string) (Glob, error) { return &match.Composite{c}, nil } - // parse parsed given pattern into list of tokens func parse(str string, sep string, st state) ([]token, error) { if len(str) == 0 { @@ -99,14 +98,14 @@ func parse(str string, sep string, st state) ([]token, error) { return nil, fmt.Errorf("'%s' should be closed with '%s'", string(range_open), string(range_close)) } - r := str[i+1:closed] + r := str[i+1 : closed] g, err := parseRange(r) if err != nil { return nil, err } st.tokens = append(st.tokens, token{g, r}) - if closed == len(str) -1 { + if closed == len(str)-1 { return st.tokens, nil } @@ -116,11 +115,11 @@ func parse(str string, sep string, st state) ([]token, error) { st.escape = true case any: if len(str) > i+1 && str[i+1] == any { - st.tokens = append(st.tokens, token{match.Multiple{}, c}) + st.tokens = append(st.tokens, token{match.Any{}, c}) return parse(str[i+len(c)+1:], sep, st) } - st.tokens = append(st.tokens, token{match.Multiple{sep}, c}) + st.tokens = append(st.tokens, token{match.Any{sep}, c}) case single: st.tokens = append(st.tokens, token{match.Single{sep}, c}) } @@ -129,14 +128,13 @@ func parse(str string, sep string, st state) ([]token, error) { return parse(str[i+len(c):], sep, st) } - func parseRange(def string) (match.Matcher, error) { var ( - not bool - esc bool - minus bool + not bool + esc bool + minus bool minusIndex int - b []byte + b []byte ) for i, c := range []byte(def) { @@ -146,13 +144,13 @@ func parseRange(def string) (match.Matcher, error) { continue } - switch c{ + switch c { case inside_range_not: if i == 0 { not = true } case escape: - if i == len(def) - 1 { + if i == len(def)-1 { return nil, fmt.Errorf("there should be any character after '%s'", string(escape)) } @@ -171,7 +169,7 @@ func parseRange(def string) (match.Matcher, error) { def = string(b) - if minus { + if minus { r := []rune(def) if len(r) != 2 || minusIndex != 1 { return nil, fmt.Errorf("invalid range syntax: '%s' should be between two characters", string(inside_range_minus)) @@ -191,4 +189,4 @@ type token struct { type state struct { escape bool tokens []token -} \ No newline at end of file +} diff --git a/lexer.go b/lexer.go index 999fd78..9b92da5 100644 --- a/lexer.go +++ b/lexer.go @@ -6,7 +6,7 @@ import ( "unicode/utf8" ) -var eof int = 0 +var eof rune = 0 type stateFn func(*lexer) stateFn @@ -20,7 +20,10 @@ const ( item_single item_range_open item_range_not + item_range_lo item_range_minus + item_range_hi + item_range_chars item_range_close ) @@ -29,15 +32,29 @@ type item struct { s string } +func (i item) String() string { + return fmt.Sprintf("%v[%s]", i.t, i.s) +} + type lexer struct { input string start int pos int width int runes int + state stateFn items chan item } +func newLexer(source string) *lexer { + l := &lexer{ + input: source, + state: lexText, + items: make(chan item, 5), + } + return l +} + func (l *lexer) run() { for state := lexText; state != nil; { state = state(l) @@ -45,12 +62,12 @@ func (l *lexer) run() { close(l.items) } -func (l *lexer) read() (rune int) { +func (l *lexer) read() (r rune) { if l.pos >= len(l.input) { return eof } - rune, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) + r, l.width = utf8.DecodeRuneInString(l.input[l.pos:]) l.pos += l.width l.runes++ @@ -62,12 +79,18 @@ func (l *lexer) unread() { l.runes-- } +func (l *lexer) shift(i int) { + l.pos += i + l.start = l.pos + l.runes = 0 +} + func (l *lexer) ignore() { l.start = l.pos l.runes = 0 } -func (l *lexer) lookahead() int { +func (l *lexer) lookahead() rune { r := l.read() l.unread() return r @@ -101,42 +124,53 @@ func (l *lexer) flush(t itemType) { } func (l *lexer) errorf(format string, args ...interface{}) { - l.emit(item{item_error, fmt.Sprintf(format, args...)}) + l.items <- item{item_error, fmt.Sprintf(format, args...)} } -func lex(source string) *lexer { - l := &lexer{ - input: strings.NewReader(source), - items: make(chan item), +func (l *lexer) nextItem() item { + for { + select { + case item := <-l.items: + return item + default: + if l.state == nil { + return item{t: item_eof} + } + + l.state = l.state(l) + } } - go l.run() - - return l + panic("something went wrong") } func lexText(l *lexer) stateFn { for { - switch l.input[l.pos] { + c := l.read() + if c == eof { + break + } + + switch c { case escape: if l.read() == eof { l.errorf("unclosed '%s' character", string(escape)) return nil } case single: + l.unread() l.flush(item_text) return lexSingle case any: + l.unread() l.flush(item_text) return lexAny case range_open: + l.unread() l.flush(item_text) return lexRangeOpen } - if l.read() == eof { - break - } } if l.pos > l.start { @@ -148,16 +182,15 @@ func lexText(l *lexer) stateFn { return nil } -func lexRangeOpen(l *lexer) stateFn { - l.pos += 1 - l.emit(item_range_open) - return lexInsideRange -} - func lexInsideRange(l *lexer) stateFn { for { - switch l.input[l.pos] { + c := l.read() + if c == eof { + l.errorf("unclosed range construction") + return nil + } + switch c { case inside_range_not: // only first char makes sense if l.pos == l.start { @@ -165,36 +198,19 @@ func lexInsideRange(l *lexer) stateFn { } case inside_range_minus: - if len(l.runes) != 1 { - l.errorf("unexpected character '%s'", string(inside_range_minus)) + if l.pos-l.start != 2 { + l.errorf("unexpected length of lo char inside range") return nil } - l.emit(item_text) - - l.pos += 1 - l.emit(item_range_minus) - - switch l.input[l.pos] { - case eof, range_close: - l.errorf("unexpected end of range: character is expected") - return nil - default: - l.read() - l.emit(item_text) - } - - return lexText + l.shift(-2) + return lexRangeHiLo case range_close: - l.flush(item_text) + l.unread() + l.flush(item_range_chars) return lexRangeClose } - - if l.read() == eof { - l.errorf("unclosed range construction") - return nil - } } } @@ -205,13 +221,31 @@ func lexAny(l *lexer) stateFn { } func lexRangeHiLo(l *lexer) stateFn { + for { + c := l.read() + if c == eof { + l.errorf("unexpected end of input") + return nil + } - l.emit(item_text) - return lexRangeMinus + if l.pos-l.start != 1 { + l.errorf("unexpected length of char inside range") + return nil + } - l.pos += 1 - l.emit(item_range_minus) - return lexInsideRange + switch c { + case inside_range_minus: + l.emit(item_range_minus) + + case range_close: + l.unread() + l.flush(item_range_hi) + return lexRangeClose + + default: + l.flush(item_range_lo) + } + } } func lexSingle(l *lexer) stateFn { @@ -220,6 +254,12 @@ func lexSingle(l *lexer) stateFn { return lexText } +func lexRangeOpen(l *lexer) stateFn { + l.pos += 1 + l.emit(item_range_open) + return lexInsideRange +} + func lexRangeClose(l *lexer) stateFn { l.pos += 1 l.emit(item_range_close) diff --git a/match/multiple.go b/match/multiple.go index c7f4cf9..42e84f4 100644 --- a/match/multiple.go +++ b/match/multiple.go @@ -1,20 +1,19 @@ package match import ( - "strings" "fmt" + "strings" ) -// multiple represents * -type Multiple struct { +type Any struct { Separators string } -func (self Multiple) Match(s string) bool { +func (self Any) Match(s string) bool { return strings.IndexAny(s, self.Separators) == -1 } -func (self Multiple) Search(s string) (i, l int, ok bool) { +func (self Any) Search(s string) (i, l int, ok bool) { if self.Match(s) { return 0, len(s), true } @@ -22,7 +21,7 @@ func (self Multiple) Search(s string) (i, l int, ok bool) { return } -func (self Multiple) Kind() Kind { +func (self Any) Kind() Kind { if self.Separators == "" { return KindMultipleSuper } else { @@ -30,6 +29,6 @@ func (self Multiple) Kind() Kind { } } -func (self Multiple) String() string { +func (self Any) String() string { return fmt.Sprintf("[multiple:%s]", self.Separators) -} \ No newline at end of file +} diff --git a/parser.go b/parser.go new file mode 100644 index 0000000..761b8b4 --- /dev/null +++ b/parser.go @@ -0,0 +1,127 @@ +package glob + +import ( + "errors" + "fmt" + "github.com/gobwas/glob/match" +) + +func parseAll(source, separators string) ([]token, error) { + lexer := newLexer(source) + + var tokens []token + for parser := parserMain; parser != nil; { + var err error + tokens, parser, err = parser(lexer, separators) + if err != nil { + return nil, err + } + } + + return tokens, nil +} + +type parseFn func(*lexer, string) ([]token, parseFn, error) + +func parserMain(lexer *lexer, separators string) ([]token, parseFn, error) { + var ( + prev *token + tokens []token + ) + + for item := lexer.nextItem(); ; { + var t token + + if item.t == item_eof { + break + } + + switch item.t { + case item_eof: + return tokens, nil, nil + + case item_error: + return nil, nil, errors.New(item.s) + + case item_text: + t = token{match.Raw{item.s}, item.s} + + case item_any: + if prev != nil && prev.matcher.Kind() == match.KindMultipleSeparated { + // remove simple any and replace it with super_any + tokens = tokens[:len(tokens)-1] + t = token{match.Any{""}, item.s} + } else { + t = token{match.Any{separators}, item.s} + } + + case item_single: + t = token{match.Single{separators}, item.s} + + case item_range_open: + return tokens, parserRange, nil + } + + prev = &t + } + + return tokens, nil, nil +} + +func parserRange(lexer *lexer, separators string) ([]token, parseFn, error) { + var ( + not bool + lo rune + hi rune + chars string + ) + + for item := lexer.nextItem(); ; { + switch item.t { + case item_eof: + return nil, nil, errors.New("unexpected end") + + case item_error: + return nil, nil, errors.New(item.s) + + case item_range_not: + not = true + + case item_range_lo: + r := []rune(item.s) + if len(r) != 1 { + return nil, nil, fmt.Errorf("unexpected length of lo character") + } + + lo = r[0] + + case item_range_minus: + // + + case item_range_hi: + r := []rune(item.s) + if len(r) != 1 { + return nil, nil, fmt.Errorf("unexpected length of hi character") + } + + hi = r[0] + + case item_range_chars: + chars = item.s + + case item_range_close: + isRange := lo != 0 && hi != 0 + isChars := chars == "" + + if !(isChars != isRange) { + return nil, nil, fmt.Errorf("parse error: unexpected lo, hi, chars in range") + } + + if isRange { + return []token{token{match.Between{lo, hi, not}, ""}}, parserMain, nil + } else { + return []token{token{match.RangeList{chars, not}, ""}}, parserMain, nil + } + } + } +} diff --git a/parser_test.go b/parser_test.go new file mode 100644 index 0000000..31525a6 --- /dev/null +++ b/parser_test.go @@ -0,0 +1,13 @@ +package glob + +import ( + "fmt" + "testing" +) + +func TestParseString(t *testing.T) { + lexer := newLexer("hello") + fmt.Println(lexer.nextItem()) + fmt.Println(lexer.nextItem()) + fmt.Println(lexer.nextItem()) +}