Merge pull request #215 from goccy/feature/fix-stream-decoder-for-unicode

Fix stream decoder for unicode char
2021-05-06 03:10:40 +09:00 · 2021-05-06 03:10:40 +09:00 · 71b8e54730
parent 31951a151c 3b4e3255e3
commit 71b8e54730
2 changed files with 48 additions and 33 deletions
--- a/decode_stream.go
+++ b/decode_stream.go
@ -61,8 +61,8 @@ func (s *stream) statForRetry() ([]byte, int64, unsafe.Pointer) {
 func (s *stream) reset() {
 	s.offset += s.cursor
 	s.buf = s.buf[s.cursor:]
+	s.length -= s.cursor
 	s.cursor = 0
-	s.length = int64(len(s.buf))
 }

 func (s *stream) readBuf() []byte {
@ -72,7 +72,11 @@ func (s *stream) readBuf() []byte {
 		s.buf = make([]byte, s.bufSize)
 		copy(s.buf, remainBuf)
 	}
-	return s.buf[s.cursor:]
+	remainLen := s.length - s.cursor
+	if remainLen > 0 {
+		remainLen-- // last char is nul
+	}
+	return s.buf[s.cursor+remainLen:]
 }

 func (s *stream) read() bool {
--- a/decode_string.go
+++ b/decode_string.go
@ -91,6 +91,47 @@ func unicodeToRune(code []byte) rune {
 	return r
 }

+func decodeUnicodeRune(s *stream) (rune, int64, error) {
+	const defaultOffset = 5
+	const surrogateOffset = 11
+
+	if s.cursor+defaultOffset >= s.length {
+		if !s.read() {
+			return rune(0), 0, errInvalidCharacter(s.char(), "escaped string", s.totalOffset())
+		}
+	}
+
+	r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+defaultOffset])
+	if utf16.IsSurrogate(r) {
+		if s.cursor+surrogateOffset >= s.length {
+			s.read()
+		}
+		if s.cursor+surrogateOffset >= s.length || s.buf[s.cursor+defaultOffset] != '\\' || s.buf[s.cursor+defaultOffset+1] != 'u' {
+			return unicode.ReplacementChar, defaultOffset, nil
+		}
+		r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset])
+		if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar {
+			return r, surrogateOffset, nil
+		}
+	}
+	return r, defaultOffset, nil
+}
+
+func decodeUnicode(s *stream) error {
+	const backSlashAndULen = 2 // length of \u
+
+	r, offset, err := decodeUnicodeRune(s)
+	if err != nil {
+		return err
+	}
+	unicode := []byte(string(r))
+	unicodeLen := int64(len(unicode))
+	s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+offset:]...)
+	s.length = int64(len(s.buf))
+	s.cursor = s.cursor - backSlashAndULen + unicodeLen
+	return nil
+}
+
 func decodeEscapeString(s *stream) error {
 	s.cursor++
 RETRY:
@ -112,37 +153,7 @@ RETRY:
 	case 't':
 		s.buf[s.cursor] = '\t'
 	case 'u':
-		if s.cursor+5 >= s.length {
-			if !s.read() {
-				return errInvalidCharacter(s.char(), "escaped string", s.totalOffset())
-			}
-		}
-		r := unicodeToRune(s.buf[s.cursor+1 : s.cursor+5])
-		if utf16.IsSurrogate(r) {
-			if s.cursor+11 >= s.length || s.buf[s.cursor+5] != '\\' || s.buf[s.cursor+6] != 'u' {
-				r = unicode.ReplacementChar
-				unicode := []byte(string(r))
-				s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
-				s.cursor = s.cursor - 2 + int64(len(unicode))
-				return nil
-			}
-			r2 := unicodeToRune(s.buf[s.cursor+7 : s.cursor+11])
-			if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar {
-				// valid surrogate pair
-				unicode := []byte(string(r))
-				s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+11:]...)
-				s.cursor = s.cursor - 2 + int64(len(unicode))
-			} else {
-				unicode := []byte(string(r))
-				s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
-				s.cursor = s.cursor - 2 + int64(len(unicode))
-			}
-		} else {
-			unicode := []byte(string(r))
-			s.buf = append(append(s.buf[:s.cursor-1], unicode...), s.buf[s.cursor+5:]...)
-			s.cursor = s.cursor - 2 + int64(len(unicode))
-		}
-		return nil
+		return decodeUnicode(s)
 	case nul:
 		if !s.read() {
 			return errInvalidCharacter(s.char(), "escaped string", s.totalOffset())