glob/lexer.go

509 lines
7.8 KiB
Go
Raw Normal View History

2015-12-24 22:30:20 +03:00
package glob
import (
2016-02-24 23:53:19 +03:00
"bytes"
2015-12-24 22:30:20 +03:00
"fmt"
"strings"
"unicode/utf8"
)
2016-01-08 20:14:31 +03:00
// Characters that carry special meaning in a glob pattern.
const (
	char_any           = '*'  // wildcard; two in a row are lexed as item_super
	char_comma         = ','  // separates phrases, but only inside {...} terms
	char_single        = '?'  // matches a single character
	char_escape        = '\\' // escapes the following character in plain text
	char_range_open    = '['  // starts a range
	char_range_close   = ']'  // ends a range
	char_terms_open    = '{'  // starts a terms group
	char_terms_close   = '}'  // ends a terms group
	char_range_not     = '!'  // negates a range when it is the first char inside it
	char_range_between = '-'  // separates the lo and hi chars of a range
)
2016-02-24 23:53:19 +03:00
// specials holds the single-byte characters that terminate a run of plain
// text unless they are escaped.
var specials = []byte{
	char_any,
	char_single,
	char_escape,
	char_range_open,
	char_range_close,
	char_terms_open,
	char_terms_close,
}

// special reports whether c is one of the bytes listed in specials.
func special(c byte) bool {
	idx := bytes.IndexByte(specials, c)
	return idx >= 0
}
2015-12-25 19:40:36 +03:00
// eof is the sentinel rune returned by lexer.read once the input is
// exhausted. Its value is zero, so a NUL rune decoded from the input is
// indistinguishable from end of input.
var eof rune
2015-12-24 22:30:20 +03:00
// stateFn is one state of the lexer: a function that works on the lexer and
// returns the state to run next, or nil to stop the state machine.
type stateFn func(*lexer) stateFn
// itemType identifies the kind of a lexed item.
type itemType int

// The kinds of items the lexer can produce. item_eof is the zero value.
const (
	item_eof itemType = iota
	item_error
	item_text
	item_char
	item_any
	item_super
	item_single
	item_not
	item_separator
	item_range_open
	item_range_close
	item_range_lo
	item_range_hi
	item_range_between
	item_terms_open
	item_terms_close
)

// String returns the lower-case name of the item type, or "undef" for a
// value that is not one of the declared constants.
func (i itemType) String() string {
	names := [...]string{
		item_eof:           "eof",
		item_error:         "error",
		item_text:          "text",
		item_char:          "char",
		item_any:           "any",
		item_super:         "super",
		item_single:        "single",
		item_not:           "not",
		item_separator:     "separator",
		item_range_open:    "range_open",
		item_range_close:   "range_close",
		item_range_lo:      "range_lo",
		item_range_hi:      "range_hi",
		item_range_between: "range_between",
		item_terms_open:    "terms_open",
		item_terms_close:   "terms_close",
	}
	if i < 0 || int(i) >= len(names) {
		return "undef"
	}
	return names[i]
}
2015-12-24 22:30:20 +03:00
type item struct {
t itemType
s string
}
2015-12-25 19:40:36 +03:00
func (i item) String() string {
2015-12-25 21:08:54 +03:00
return fmt.Sprintf("%v<%s>", i.t, i.s)
2015-12-25 19:40:36 +03:00
}
2016-05-12 10:46:16 +03:00
// stubLexer replays a fixed list of items instead of lexing real input.
type stubLexer struct {
	Items []item // items to hand out, in order
	pos   int    // index of the next item to return
}

// nextItem returns the next prepared item and advances the cursor. Once every
// item has been handed out it returns an item_eof item with empty text.
func (s *stubLexer) nextItem() item {
	if s.pos == len(s.Items) {
		return item{t: item_eof}
	}
	next := s.Items[s.pos]
	s.pos++
	return next
}
2015-12-24 22:30:20 +03:00
type lexer struct {
2016-01-08 20:14:31 +03:00
input string
start int
pos int
width int
runes int
termScopes []int
termPhrases map[int]int
state stateFn
items chan item
2015-12-24 22:30:20 +03:00
}
2015-12-25 19:40:36 +03:00
func newLexer(source string) *lexer {
l := &lexer{
2016-01-08 20:14:31 +03:00
input: source,
2016-05-12 00:17:33 +03:00
state: lexRaw,
2016-02-25 00:31:37 +03:00
items: make(chan item, len(source)),
2016-01-08 20:14:31 +03:00
termPhrases: make(map[int]int),
2015-12-25 19:40:36 +03:00
}
return l
}
2015-12-24 22:30:20 +03:00
func (l *lexer) run() {
2016-05-12 00:17:33 +03:00
for state := lexRaw; state != nil; {
2015-12-24 22:30:20 +03:00
state = state(l)
}
close(l.items)
}
2016-01-08 20:14:31 +03:00
// nextItem returns the next lexed item.
//
// If an item is already buffered it is returned immediately. Otherwise the
// current state function is run on the calling goroutine to produce more
// items. Once the state machine has finished (state is nil) and the buffer is
// drained, every further call returns an item_eof item.
func (l *lexer) nextItem() item {
	for {
		// Non-blocking receive: prefer anything already emitted.
		select {
		case next := <-l.items:
			return next
		default:
		}

		if l.state == nil {
			return item{t: item_eof}
		}
		l.state = l.state(l)
	}
}
2015-12-25 19:40:36 +03:00
func (l *lexer) read() (r rune) {
2015-12-24 22:30:20 +03:00
if l.pos >= len(l.input) {
return eof
}
2015-12-25 19:40:36 +03:00
r, l.width = utf8.DecodeRuneInString(l.input[l.pos:])
2015-12-24 22:30:20 +03:00
l.pos += l.width
l.runes++
return
}
// unread steps back by the width of the last rune read and decrements the
// rune counter. It is only meaningful directly after a successful read.
func (l *lexer) unread() {
	l.runes--
	l.pos = l.pos - l.width
}
2015-12-26 12:14:30 +03:00
func (l *lexer) reset() {
l.pos = l.start
2015-12-25 19:40:36 +03:00
l.runes = 0
}
2015-12-24 22:30:20 +03:00
// ignore discards the pending input by moving the item start up to the
// current position and clearing the rune counter.
func (l *lexer) ignore() {
	l.start, l.runes = l.pos, 0
}
2015-12-25 19:40:36 +03:00
func (l *lexer) lookahead() rune {
2015-12-24 22:30:20 +03:00
r := l.read()
2016-01-08 20:14:31 +03:00
if r != eof {
l.unread()
}
2015-12-24 22:30:20 +03:00
return r
}
// accept consumes the next rune if it occurs in valid and reports whether it
// did so. At end of input it returns false without touching the cursor:
// read does not advance there, so calling unread would rewind over a rune
// that was consumed earlier.
func (l *lexer) accept(valid string) bool {
	if l.pos >= len(l.input) {
		return false
	}
	if strings.IndexRune(valid, l.read()) != -1 {
		return true
	}
	l.unread()
	return false
}
// acceptAll consumes runes for as long as they occur in valid. It stops at
// the first rune that does not, which is left unread, or at end of input.
// End of input is detected by position so that no unread happens there:
// read does not advance at the end, so an unread would rewind over a rune
// that was legitimately consumed.
func (l *lexer) acceptAll(valid string) {
	for l.pos < len(l.input) {
		if strings.IndexRune(valid, l.read()) == -1 {
			l.unread()
			return
		}
	}
}
2016-05-12 00:17:33 +03:00
// emitCurrent emits an item of type t whose text is the pending input, i.e.
// the bytes between the item start and the current position.
func (l *lexer) emitCurrent(t itemType) {
	pending := l.input[l.start:l.pos]
	l.emit(t, pending)
}
2016-01-08 20:14:31 +03:00
2016-05-12 00:17:33 +03:00
func (l *lexer) emit(t itemType, s string) {
l.items <- item{t, s}
2015-12-24 22:30:20 +03:00
l.start = l.pos
l.runes = 0
l.width = 0
}
func (l *lexer) errorf(format string, args ...interface{}) {
2015-12-25 19:40:36 +03:00
l.items <- item{item_error, fmt.Sprintf(format, args...)}
2015-12-24 22:30:20 +03:00
}
2016-05-12 00:17:33 +03:00
// inTerms reports whether at least one '{' is currently open.
func (l *lexer) inTerms() bool {
	return len(l.termScopes) != 0
}
func lexRaw(l *lexer) stateFn {
2015-12-24 22:30:20 +03:00
for {
2015-12-25 19:40:36 +03:00
c := l.read()
if c == eof {
break
}
switch c {
2016-01-08 20:14:31 +03:00
case char_single:
2015-12-25 19:40:36 +03:00
l.unread()
2015-12-24 22:30:20 +03:00
return lexSingle
2016-01-08 20:14:31 +03:00
case char_any:
var n stateFn
if l.lookahead() == char_any {
n = lexSuper
} else {
n = lexAny
}
2015-12-25 19:40:36 +03:00
l.unread()
2016-01-08 20:14:31 +03:00
return n
case char_range_open:
2015-12-25 19:40:36 +03:00
l.unread()
2015-12-24 22:30:20 +03:00
return lexRangeOpen
2016-01-08 20:14:31 +03:00
case char_terms_open:
l.unread()
return lexTermsOpen
case char_terms_close:
l.unread()
return lexTermsClose
2016-05-12 00:17:33 +03:00
case char_comma:
if l.inTerms() { // if we are not in terms
l.unread()
return lexSeparator
}
fallthrough
default:
2016-01-08 20:14:31 +03:00
l.unread()
2016-05-12 00:17:33 +03:00
return lexText
2015-12-24 22:30:20 +03:00
}
}
if l.pos > l.start {
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_text)
2015-12-24 22:30:20 +03:00
}
2016-01-08 20:14:31 +03:00
if len(l.termScopes) != 0 {
l.errorf("invalid pattern syntax: unclosed terms")
return nil
}
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_eof)
2015-12-24 22:30:20 +03:00
return nil
}
2016-05-12 00:17:33 +03:00
// lexText collects a run of plain text and emits it as a single item_text.
//
// An unescaped backslash escapes the next rune: the backslash is dropped and
// the following rune is taken literally, whatever it is, including another
// backslash. The run ends, without consuming the terminator, at an unescaped
// special character or at an unescaped comma inside terms. It also ends at
// end of input. The emitted text is the unescaped content, not the raw slice
// of the input. Returns lexRaw.
func lexText(l *lexer) stateFn {
	var (
		escaped bool
		data    []rune
	)

scan:
	for c := l.read(); c != eof; c = l.read() {
		if !escaped {
			switch {
			case c == char_escape:
				// Only an unescaped backslash starts an escape; previously
				// an escaped backslash was swallowed and re-armed the flag.
				escaped = true
				continue scan
			case c == char_comma && l.inTerms():
				l.unread()
				break scan
			case utf8.RuneLen(c) == 1 && special(byte(c)):
				l.unread()
				break scan
			}
		}

		data = append(data, c)
		escaped = false
	}

	l.emit(item_text, string(data))
	return lexRaw
}
2015-12-24 22:30:20 +03:00
func lexInsideRange(l *lexer) stateFn {
for {
2015-12-25 19:40:36 +03:00
c := l.read()
if c == eof {
l.errorf("unclosed range construction")
return nil
}
2015-12-24 22:30:20 +03:00
2015-12-25 19:40:36 +03:00
switch c {
2016-01-08 20:14:31 +03:00
case char_range_not:
2015-12-24 22:30:20 +03:00
// only first char makes sense
2015-12-26 12:14:30 +03:00
if l.pos-l.width == l.start {
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_not)
2015-12-24 22:30:20 +03:00
}
2016-01-08 20:14:31 +03:00
case char_range_between:
2015-12-26 12:14:30 +03:00
if l.runes != 2 {
2015-12-25 19:40:36 +03:00
l.errorf("unexpected length of lo char inside range")
2015-12-24 22:30:20 +03:00
return nil
}
2015-12-26 12:14:30 +03:00
l.reset()
2015-12-25 19:40:36 +03:00
return lexRangeHiLo
2015-12-24 22:30:20 +03:00
2016-01-08 20:14:31 +03:00
case char_range_close:
2016-05-12 00:17:33 +03:00
if l.runes == 1 {
l.errorf("range should contain at least single char")
return nil
}
2015-12-25 19:40:36 +03:00
l.unread()
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_text)
2015-12-24 22:30:20 +03:00
return lexRangeClose
}
}
}
func lexRangeHiLo(l *lexer) stateFn {
2015-12-25 21:08:54 +03:00
start := l.start
2015-12-25 19:40:36 +03:00
for {
c := l.read()
if c == eof {
l.errorf("unexpected end of input")
return nil
}
switch c {
2016-01-08 20:14:31 +03:00
case char_range_between:
2015-12-26 12:14:30 +03:00
if l.runes != 1 {
2015-12-25 21:08:54 +03:00
l.errorf("unexpected length of range: single character expected before minus")
return nil
}
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_range_between)
2015-12-24 22:30:20 +03:00
2016-01-08 20:14:31 +03:00
case char_range_close:
2015-12-25 19:40:36 +03:00
l.unread()
2015-12-25 21:08:54 +03:00
2015-12-26 12:14:30 +03:00
if l.runes != 1 {
2015-12-25 21:08:54 +03:00
l.errorf("unexpected length of range: single character expected before close")
return nil
}
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_range_hi)
2015-12-25 19:40:36 +03:00
return lexRangeClose
default:
2015-12-25 21:08:54 +03:00
if start != l.start {
continue
}
2015-12-26 12:14:30 +03:00
if l.runes != 1 {
2015-12-25 21:08:54 +03:00
l.errorf("unexpected length of range: single character expected at the begining")
return nil
}
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_range_lo)
2015-12-25 19:40:36 +03:00
}
}
2015-12-24 22:30:20 +03:00
}
2016-01-08 20:14:31 +03:00
func lexAny(l *lexer) stateFn {
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_any)
return lexRaw
2016-01-08 20:14:31 +03:00
}
func lexSuper(l *lexer) stateFn {
l.pos += 2
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_super)
return lexRaw
2016-01-08 20:14:31 +03:00
}
2015-12-24 22:30:20 +03:00
func lexSingle(l *lexer) stateFn {
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_single)
return lexRaw
2015-12-24 22:30:20 +03:00
}
2016-01-08 20:14:31 +03:00
func lexSeparator(l *lexer) stateFn {
posOpen := l.termScopes[len(l.termScopes)-1]
if l.pos-posOpen == 1 {
l.errorf("syntax error: empty term before separator")
return nil
}
l.termPhrases[posOpen] += 1
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_separator)
return lexRaw
2016-01-08 20:14:31 +03:00
}
func lexTermsOpen(l *lexer) stateFn {
l.termScopes = append(l.termScopes, l.pos)
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_terms_open)
2016-01-08 20:14:31 +03:00
2016-05-12 00:17:33 +03:00
return lexRaw
2016-01-08 20:14:31 +03:00
}
func lexTermsClose(l *lexer) stateFn {
if len(l.termScopes) == 0 {
l.errorf("unexpected closing of terms: there is no opened terms")
return nil
}
lastOpen := len(l.termScopes) - 1
posOpen := l.termScopes[lastOpen]
// if it is empty term
if posOpen == l.pos-1 {
l.errorf("term could not be empty")
return nil
}
if l.termPhrases[posOpen] == 0 {
l.errorf("term must contain >1 phrases")
return nil
}
// cleanup
l.termScopes = l.termScopes[:lastOpen]
delete(l.termPhrases, posOpen)
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_terms_close)
2016-01-08 20:14:31 +03:00
2016-05-12 00:17:33 +03:00
return lexRaw
2016-01-08 20:14:31 +03:00
}
2015-12-25 19:40:36 +03:00
func lexRangeOpen(l *lexer) stateFn {
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_range_open)
2015-12-25 19:40:36 +03:00
return lexInsideRange
}
2015-12-24 22:30:20 +03:00
func lexRangeClose(l *lexer) stateFn {
l.pos += 1
2016-05-12 00:17:33 +03:00
l.emitCurrent(item_range_close)
return lexRaw
2015-12-24 22:30:20 +03:00
}