utf-8 support

This commit is contained in:
gobwas 2015-12-26 12:14:30 +03:00
parent f313fe3570
commit dbda43cc7c
2 changed files with 23 additions and 24 deletions

View File

@ -122,9 +122,8 @@ func (l *lexer) unread() {
l.runes-- l.runes--
} }
func (l *lexer) shift(i int) { func (l *lexer) reset() {
l.pos += i l.pos = l.start
l.start = l.pos
l.runes = 0 l.runes = 0
} }
@ -160,7 +159,7 @@ func (l *lexer) emit(t itemType) {
l.width = 0 l.width = 0
} }
func (l *lexer) flush(t itemType) { func (l *lexer) emitMaybe(t itemType) {
if l.pos > l.start { if l.pos > l.start {
l.emit(t) l.emit(t)
} }
@ -202,15 +201,15 @@ func lexText(l *lexer) stateFn {
} }
case single: case single:
l.unread() l.unread()
l.flush(item_text) l.emitMaybe(item_text)
return lexSingle return lexSingle
case any: case any:
l.unread() l.unread()
l.flush(item_text) l.emitMaybe(item_text)
return lexAny return lexAny
case range_open: case range_open:
l.unread() l.unread()
l.flush(item_text) l.emitMaybe(item_text)
return lexRangeOpen return lexRangeOpen
} }
@ -236,22 +235,22 @@ func lexInsideRange(l *lexer) stateFn {
switch c { switch c {
case inside_range_not: case inside_range_not:
// only first char makes sense // only first char makes sense
if l.pos-1 == l.start { if l.pos-l.width == l.start {
l.emit(item_range_not) l.emit(item_range_not)
} }
case inside_range_minus: case inside_range_minus:
if l.pos-l.start != 2 { if l.runes != 2 {
l.errorf("unexpected length of lo char inside range") l.errorf("unexpected length of lo char inside range")
return nil return nil
} }
l.shift(-2) l.reset()
return lexRangeHiLo return lexRangeHiLo
case range_close: case range_close:
l.unread() l.unread()
l.flush(item_range_chars) l.emitMaybe(item_range_chars)
return lexRangeClose return lexRangeClose
} }
} }
@ -275,7 +274,7 @@ func lexRangeHiLo(l *lexer) stateFn {
switch c { switch c {
case inside_range_minus: case inside_range_minus:
if l.pos-l.start != 1 { if l.runes != 1 {
l.errorf("unexpected length of range: single character expected before minus") l.errorf("unexpected length of range: single character expected before minus")
return nil return nil
} }
@ -285,7 +284,7 @@ func lexRangeHiLo(l *lexer) stateFn {
case range_close: case range_close:
l.unread() l.unread()
if l.pos-l.start != 1 { if l.runes != 1 {
l.errorf("unexpected length of range: single character expected before close") l.errorf("unexpected length of range: single character expected before close")
return nil return nil
} }
@ -298,7 +297,7 @@ func lexRangeHiLo(l *lexer) stateFn {
continue continue
} }
if l.pos-l.start != 1 { if l.runes != 1 {
l.errorf("unexpected length of range: single character expected at the begining") l.errorf("unexpected length of range: single character expected at the begining")
return nil return nil
} }

View File

@ -42,43 +42,43 @@ func TestLexGood(t *testing.T) {
}, },
}, },
{ {
pattern: "[a-b]", pattern: "[日-語]",
items: []item{ items: []item{
item{item_range_open, "["}, item{item_range_open, "["},
item{item_range_lo, "a"}, item{item_range_lo, "日"},
item{item_range_minus, "-"}, item{item_range_minus, "-"},
item{item_range_hi, "b"}, item{item_range_hi, "語"},
item{item_range_close, "]"}, item{item_range_close, "]"},
item{item_eof, ""}, item{item_eof, ""},
}, },
}, },
{ {
pattern: "[!a-b]", pattern: "[!日-語]",
items: []item{ items: []item{
item{item_range_open, "["}, item{item_range_open, "["},
item{item_range_not, "!"}, item{item_range_not, "!"},
item{item_range_lo, "a"}, item{item_range_lo, "日"},
item{item_range_minus, "-"}, item{item_range_minus, "-"},
item{item_range_hi, "b"}, item{item_range_hi, "語"},
item{item_range_close, "]"}, item{item_range_close, "]"},
item{item_eof, ""}, item{item_eof, ""},
}, },
}, },
{ {
pattern: "[abc]", pattern: "[日本語]",
items: []item{ items: []item{
item{item_range_open, "["}, item{item_range_open, "["},
item{item_range_chars, "abc"}, item{item_range_chars, "日本語"},
item{item_range_close, "]"}, item{item_range_close, "]"},
item{item_eof, ""}, item{item_eof, ""},
}, },
}, },
{ {
pattern: "[!abc]", pattern: "[!日本語]",
items: []item{ items: []item{
item{item_range_open, "["}, item{item_range_open, "["},
item{item_range_not, "!"}, item{item_range_not, "!"},
item{item_range_chars, "abc"}, item{item_range_chars, "日本語"},
item{item_range_close, "]"}, item{item_range_close, "]"},
item{item_eof, ""}, item{item_eof, ""},
}, },