Fix stream decoder for unicode char

This commit is contained in:
Masaaki Goshima 2021-05-05 04:21:58 +09:00
parent 31951a151c
commit d4a64f8cde
2 changed files with 47 additions and 32 deletions

View File

@ -72,7 +72,11 @@ func (s *stream) readBuf() []byte {
s.buf = make([]byte, s.bufSize) s.buf = make([]byte, s.bufSize)
copy(s.buf, remainBuf) copy(s.buf, remainBuf)
} }
return s.buf[s.cursor:] remainLen := s.length - s.cursor
if remainLen > 0 {
remainLen-- // last char is nul
}
return s.buf[s.cursor+remainLen:]
} }
func (s *stream) read() bool { func (s *stream) read() bool {

View File

@ -91,6 +91,47 @@ func unicodeToRune(code []byte) rune {
return r return r
} }
// decodeUnicodeRune decodes the \uXXXX escape at the current stream position.
// s.cursor is expected to index the 'u' that follows the backslash, so the
// four hex digits occupy s.buf[s.cursor+1 : s.cursor+5].
//
// It returns the decoded rune together with the offset from s.cursor to the
// first byte after the consumed escape text: 5 for a single \uXXXX escape, or
// 11 when the escape is a surrogate that was combined with an immediately
// following \uXXXX escape into one code point. A surrogate that is not
// followed by another \u escape yields unicode.ReplacementChar.
//
// An error is returned when s.read reports that no more input is available
// before the four hex digits have been buffered.
func decodeUnicodeRune(s *stream) (rune, int64, error) {
	const defaultOffset = 5    // 'u' plus four hex digits
	const surrogateOffset = 11 // defaultOffset plus `\u` and four more hex digits

	// A single read may deliver fewer bytes than the escape needs, so keep
	// reading until the hex digits are buffered or the input is exhausted.
	for s.cursor+defaultOffset >= s.length {
		if !s.read() {
			return rune(0), 0, errInvalidCharacter(s.char(), "escaped string", s.totalOffset())
		}
	}

	r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+defaultOffset])
	if !utf16.IsSurrogate(r) {
		return r, defaultOffset, nil
	}

	// Try to buffer the second half of a surrogate pair. Running out of input
	// here is not an error: the lone surrogate is handled below.
	for s.cursor+surrogateOffset >= s.length {
		if !s.read() {
			break
		}
	}
	if s.cursor+surrogateOffset >= s.length ||
		s.buf[s.cursor+defaultOffset] != '\\' ||
		s.buf[s.cursor+defaultOffset+1] != 'u' {
		return unicode.ReplacementChar, defaultOffset, nil
	}

	r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset])
	if pair := utf16.DecodeRune(r, r2); pair != unicode.ReplacementChar {
		return pair, surrogateOffset, nil
	}
	// The second escape did not complete a valid pair: consume only the first
	// escape and leave the second one for the caller to decode next.
	return r, defaultOffset, nil
}
// decodeUnicode replaces the \uXXXX escape (or surrogate pair of escapes) at
// the current stream position with the UTF-8 encoding of the decoded rune.
// The replacement is spliced into s.buf, s.length is reduced by the number of
// bytes the splice removed, and s.cursor is left on the last byte written.
//
// It returns the error from decodeUnicodeRune unchanged when the escape cannot
// be read.
func decodeUnicode(s *stream) error {
	const backSlashAndULen = 2 // length of \u

	r, offset, err := decodeUnicodeRune(s)
	if err != nil {
		return err
	}
	encoded := []byte(string(r))
	encodedLen := int64(len(encoded))

	// The escape text spans [s.cursor-1, s.cursor+offset): the backslash sits
	// one byte before the cursor and offset counts from the 'u' onwards.
	escapeLen := offset + 1
	s.buf = append(append(s.buf[:s.cursor-1], encoded...), s.buf[s.cursor+offset:]...)

	// Shrink the valid-data length by exactly what was removed. The slice
	// length of s.buf is not a substitute: readBuf allocates the buffer with
	// make([]byte, s.bufSize), so len(s.buf) can exceed the data read so far.
	s.length -= escapeLen - encodedLen
	s.cursor = s.cursor - backSlashAndULen + encodedLen
	return nil
}
func decodeEscapeString(s *stream) error { func decodeEscapeString(s *stream) error {
s.cursor++ s.cursor++
RETRY: RETRY:
@ -112,37 +153,7 @@ RETRY:
case 't': case 't':
s.buf[s.cursor] = '\t' s.buf[s.cursor] = '\t'
case 'u': case 'u':
if s.cursor+5 >= s.length { return decodeUnicode(s)
if !s.read() {
return errInvalidCharacter(s.char(), "escaped string", s.totalOffset())
}
}
r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+5])
if utf16.IsSurrogate(r) {
if s.cursor+11 >= s.length || s.buf[s.cursor+5] != '\\' || s.buf[s.cursor+6] != 'u' {
r = unicode.ReplacementChar
unicode := []byte(string(r))
s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
s.cursor = s.cursor - 2 + int64(len(unicode))
return nil
}
r2 := unicodeToRune(s.buf[s.cursor+7 : s.cursor+11])
if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar {
// valid surrogate pair
unicode := []byte(string(r))
s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+11:]...)
s.cursor = s.cursor - 2 + int64(len(unicode))
} else {
unicode := []byte(string(r))
s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
s.cursor = s.cursor - 2 + int64(len(unicode))
}
} else {
unicode := []byte(string(r))
s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
s.cursor = s.cursor - 2 + int64(len(unicode))
}
return nil
case nul: case nul:
if !s.read() { if !s.read() {
return errInvalidCharacter(s.char(), "escaped string", s.totalOffset()) return errInvalidCharacter(s.char(), "escaped string", s.totalOffset())