fix lexer handling of comma in text

This commit is contained in:
gobwas 2016-05-12 00:17:33 +03:00
parent d877f63521
commit 82e8d7da03
2 changed files with 102 additions and 67 deletions

142
lexer.go
View File

@ -9,7 +9,7 @@ import (
const ( const (
char_any = '*' char_any = '*'
char_separator = ',' char_comma = ','
char_single = '?' char_single = '?'
char_escape = '\\' char_escape = '\\'
char_range_open = '[' char_range_open = '['
@ -138,7 +138,7 @@ type lexer struct {
func newLexer(source string) *lexer { func newLexer(source string) *lexer {
l := &lexer{ l := &lexer{
input: source, input: source,
state: lexText, state: lexRaw,
items: make(chan item, len(source)), items: make(chan item, len(source)),
termPhrases: make(map[int]int), termPhrases: make(map[int]int),
} }
@ -146,7 +146,7 @@ func newLexer(source string) *lexer {
} }
func (l *lexer) run() { func (l *lexer) run() {
for state := lexText; state != nil; { for state := lexRaw; state != nil; {
state = state(l) state = state(l)
} }
close(l.items) close(l.items)
@ -218,29 +218,26 @@ func (l *lexer) acceptAll(valid string) {
l.unread() l.unread()
} }
func (l *lexer) emit(t itemType) { func (l *lexer) emitCurrent(t itemType) {
if l.pos == len(l.input) { l.emit(t, l.input[l.start:l.pos])
l.items <- item{t, l.input[l.start:]} }
} else {
l.items <- item{t, l.input[l.start:l.pos]}
}
func (l *lexer) emit(t itemType, s string) {
l.items <- item{t, s}
l.start = l.pos l.start = l.pos
l.runes = 0 l.runes = 0
l.width = 0 l.width = 0
} }
func (l *lexer) emitMaybe(t itemType) {
if l.pos > l.start {
l.emit(t)
}
}
func (l *lexer) errorf(format string, args ...interface{}) { func (l *lexer) errorf(format string, args ...interface{}) {
l.items <- item{item_error, fmt.Sprintf(format, args...)} l.items <- item{item_error, fmt.Sprintf(format, args...)}
} }
func lexText(l *lexer) stateFn { func (l *lexer) inTerms() bool {
return len(l.termScopes) > 0
}
func lexRaw(l *lexer) stateFn {
for { for {
c := l.read() c := l.read()
if c == eof { if c == eof {
@ -248,21 +245,8 @@ func lexText(l *lexer) stateFn {
} }
switch c { switch c {
case char_escape:
l.unread()
l.emitMaybe(item_text)
l.read()
l.ignore()
if l.read() == eof {
l.errorf("unclosed '%s' character", string(char_escape))
return nil
}
case char_single: case char_single:
l.unread() l.unread()
l.emitMaybe(item_text)
return lexSingle return lexSingle
case char_any: case char_any:
@ -274,33 +258,35 @@ func lexText(l *lexer) stateFn {
} }
l.unread() l.unread()
l.emitMaybe(item_text)
return n return n
case char_range_open: case char_range_open:
l.unread() l.unread()
l.emitMaybe(item_text)
return lexRangeOpen return lexRangeOpen
case char_terms_open: case char_terms_open:
l.unread() l.unread()
l.emitMaybe(item_text)
return lexTermsOpen return lexTermsOpen
case char_terms_close: case char_terms_close:
l.unread() l.unread()
l.emitMaybe(item_text)
return lexTermsClose return lexTermsClose
case char_separator: case char_comma:
if l.inTerms() { // if we are not in terms
l.unread()
return lexSeparator
}
fallthrough
default:
l.unread() l.unread()
l.emitMaybe(item_text) return lexText
return lexSeparator
} }
} }
if l.pos > l.start { if l.pos > l.start {
l.emit(item_text) l.emitCurrent(item_text)
} }
if len(l.termScopes) != 0 { if len(l.termScopes) != 0 {
@ -308,11 +294,41 @@ func lexText(l *lexer) stateFn {
return nil return nil
} }
l.emit(item_eof) l.emitCurrent(item_eof)
return nil return nil
} }
// lexText scans a run of literal text and emits it as one item_text.
// A backslash escapes the following rune, which is then taken
// verbatim.  An unescaped comma ends the run only when inside a terms
// group; any other unescaped special single-byte character always
// ends the run.  Control returns to lexRaw afterwards.
func lexText(l *lexer) stateFn {
	var escaped bool
	var data []rune
scan:
	for c := l.read(); c != eof; c = l.read() {
		switch {
		case !escaped && c == char_escape:
			// Remember the escape; the next rune is taken literally.
			// The !escaped guard lets "\\" produce a literal backslash
			// instead of arming the escape twice and emitting nothing.
			escaped = true
			continue
		case !escaped && c == char_comma && l.inTerms():
			l.unread()
			break scan
		case !escaped && utf8.RuneLen(c) == 1 && special(byte(c)):
			l.unread()
			break scan
		default:
			data = append(data, c)
		}
		escaped = false
	}
	l.emit(item_text, string(data))
	return lexRaw
}
func lexInsideRange(l *lexer) stateFn { func lexInsideRange(l *lexer) stateFn {
for { for {
c := l.read() c := l.read()
@ -325,7 +341,7 @@ func lexInsideRange(l *lexer) stateFn {
case char_range_not: case char_range_not:
// only first char makes sense // only first char makes sense
if l.pos-l.width == l.start { if l.pos-l.width == l.start {
l.emit(item_not) l.emitCurrent(item_not)
} }
case char_range_between: case char_range_between:
@ -338,8 +354,13 @@ func lexInsideRange(l *lexer) stateFn {
return lexRangeHiLo return lexRangeHiLo
case char_range_close: case char_range_close:
if l.runes == 1 {
l.errorf("range should contain at least single char")
return nil
}
l.unread() l.unread()
l.emitMaybe(item_text) l.emitCurrent(item_text)
return lexRangeClose return lexRangeClose
} }
} }
@ -362,7 +383,7 @@ func lexRangeHiLo(l *lexer) stateFn {
return nil return nil
} }
l.emit(item_range_between) l.emitCurrent(item_range_between)
case char_range_close: case char_range_close:
l.unread() l.unread()
@ -372,7 +393,7 @@ func lexRangeHiLo(l *lexer) stateFn {
return nil return nil
} }
l.emit(item_range_hi) l.emitCurrent(item_range_hi)
return lexRangeClose return lexRangeClose
default: default:
@ -385,35 +406,30 @@ func lexRangeHiLo(l *lexer) stateFn {
return nil return nil
} }
l.emit(item_range_lo) l.emitCurrent(item_range_lo)
} }
} }
} }
func lexAny(l *lexer) stateFn { func lexAny(l *lexer) stateFn {
l.pos += 1 l.pos += 1
l.emit(item_any) l.emitCurrent(item_any)
return lexText return lexRaw
} }
func lexSuper(l *lexer) stateFn { func lexSuper(l *lexer) stateFn {
l.pos += 2 l.pos += 2
l.emit(item_super) l.emitCurrent(item_super)
return lexText return lexRaw
} }
func lexSingle(l *lexer) stateFn { func lexSingle(l *lexer) stateFn {
l.pos += 1 l.pos += 1
l.emit(item_single) l.emitCurrent(item_single)
return lexText return lexRaw
} }
func lexSeparator(l *lexer) stateFn { func lexSeparator(l *lexer) stateFn {
if len(l.termScopes) == 0 {
l.errorf("syntax error: separator not inside terms list")
return nil
}
posOpen := l.termScopes[len(l.termScopes)-1] posOpen := l.termScopes[len(l.termScopes)-1]
if l.pos-posOpen == 1 { if l.pos-posOpen == 1 {
@ -423,16 +439,16 @@ func lexSeparator(l *lexer) stateFn {
l.termPhrases[posOpen] += 1 l.termPhrases[posOpen] += 1
l.pos += 1 l.pos += 1
l.emit(item_separator) l.emitCurrent(item_separator)
return lexText return lexRaw
} }
func lexTermsOpen(l *lexer) stateFn { func lexTermsOpen(l *lexer) stateFn {
l.termScopes = append(l.termScopes, l.pos) l.termScopes = append(l.termScopes, l.pos)
l.pos += 1 l.pos += 1
l.emit(item_terms_open) l.emitCurrent(item_terms_open)
return lexText return lexRaw
} }
func lexTermsClose(l *lexer) stateFn { func lexTermsClose(l *lexer) stateFn {
@ -460,19 +476,19 @@ func lexTermsClose(l *lexer) stateFn {
delete(l.termPhrases, posOpen) delete(l.termPhrases, posOpen)
l.pos += 1 l.pos += 1
l.emit(item_terms_close) l.emitCurrent(item_terms_close)
return lexText return lexRaw
} }
func lexRangeOpen(l *lexer) stateFn { func lexRangeOpen(l *lexer) stateFn {
l.pos += 1 l.pos += 1
l.emit(item_range_open) l.emitCurrent(item_range_open)
return lexInsideRange return lexInsideRange
} }
func lexRangeClose(l *lexer) stateFn { func lexRangeClose(l *lexer) stateFn {
l.pos += 1 l.pos += 1
l.emit(item_range_close) l.emitCurrent(item_range_close)
return lexText return lexRaw
} }

View File

@ -16,6 +16,27 @@ func TestLexGood(t *testing.T) {
item{item_eof, ""}, item{item_eof, ""},
}, },
}, },
{
pattern: "hello,world",
items: []item{
item{item_text, "hello,world"},
item{item_eof, ""},
},
},
{
pattern: "hello\\,world",
items: []item{
item{item_text, "hello,world"},
item{item_eof, ""},
},
},
{
pattern: "hello\\{world",
items: []item{
item{item_text, "hello{world"},
item{item_eof, ""},
},
},
{ {
pattern: "hello?", pattern: "hello?",
items: []item{ items: []item{
@ -124,12 +145,10 @@ func TestLexGood(t *testing.T) {
for i, exp := range test.items { for i, exp := range test.items {
act := lexer.nextItem() act := lexer.nextItem()
if act.t != exp.t { if act.t != exp.t {
t.Errorf("#%d wrong %d-th item type: exp: %v; act: %v (%s vs %s)", id, i, exp.t, act.t, exp, act) t.Errorf("#%d %q: wrong %d-th item type: exp: %q; act: %q\n\t(%s vs %s)", id, test.pattern, i, exp.t, act.t, exp, act)
break
} }
if act.s != exp.s { if act.s != exp.s {
t.Errorf("#%d wrong %d-th item contents: exp: %q; act: %q (%s vs %s)", id, i, exp.s, act.s, exp, act) t.Errorf("#%d %q: wrong %d-th item contents: exp: %q; act: %q\n\t(%s vs %s)", id, test.pattern, i, exp.s, act.s, exp, act)
break
} }
} }
} }