From c128c8c9159b4f0c73548a9fae669e0f59ad2776 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Wed, 2 Jun 2021 17:23:32 +0900 Subject: [PATCH 1/3] Fix decoding fields containing escaped characters --- decode_struct.go | 120 +++++++++++++++++++++++++++++++++++------------ decode_test.go | 15 ++++++ 2 files changed, 105 insertions(+), 30 deletions(-) diff --git a/decode_struct.go b/decode_struct.go index f857bfc..d459fb8 100644 --- a/decode_struct.go +++ b/decode_struct.go @@ -6,6 +6,8 @@ import ( "math/bits" "sort" "strings" + "unicode" + "unicode/utf16" "unsafe" ) @@ -136,6 +138,52 @@ func (d *structDecoder) tryOptimize() { } } +// decode from '\uXXXX' +func decodeKeyCharByUnicodeRune(buf []byte, cursor int64) ([]byte, int64) { + const defaultOffset = 4 + const surrogateOffset = 6 + + r := unicodeToRune(buf[cursor : cursor+defaultOffset]) + if utf16.IsSurrogate(r) { + cursor += defaultOffset + if cursor+surrogateOffset >= int64(len(buf)) || buf[cursor] != '\\' || buf[cursor+1] != 'u' { + return []byte(string(unicode.ReplacementChar)), cursor + defaultOffset - 1 + } + cursor += 2 + r2 := unicodeToRune(buf[cursor : cursor+defaultOffset]) + if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar { + return []byte(string(r)), cursor + defaultOffset - 1 + } + } + return []byte(string(r)), cursor + defaultOffset - 1 +} + +func decodeKeyCharByEscapedChar(buf []byte, cursor int64) ([]byte, int64) { + c := buf[cursor] + cursor++ + switch c { + case '"': + return []byte{'"'}, cursor + case '\\': + return []byte{'\\'}, cursor + case '/': + return []byte{'/'}, cursor + case 'b': + return []byte{'\b'}, cursor + case 'f': + return []byte{'\f'}, cursor + case 'n': + return []byte{'\n'}, cursor + case 'r': + return []byte{'\r'}, cursor + case 't': + return []byte{'\t'}, cursor + case 'u': + return decodeKeyCharByUnicodeRune(buf, cursor) + } + return nil, cursor +} + func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldSet, error) { var ( field *structFieldSet @@ -174,24 +222,21 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, return cursor, field, nil case nul: return 0, nil, errUnexpectedEndOfJSON("string", cursor) + case '\\': + cursor++ + chars, nextCursor := decodeKeyCharByEscapedChar(buf, cursor) + for _, c := range chars { + curBit &= bitmap[keyIdx][largeToSmallTable[c]] + if curBit == 0 { + return decodeKeyNotFound(b, cursor, field) + } + keyIdx++ + } + cursor = nextCursor default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - for { - cursor++ - switch char(b, cursor) { - case '"': - cursor++ - return cursor, field, nil - case '\\': - cursor++ - if char(b, cursor) == nul { - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - case nul: - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - } + return decodeKeyNotFound(b, cursor, field) } keyIdx++ } @@ -203,6 +248,24 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, } } +func decodeKeyNotFound(b unsafe.Pointer, cursor int64, field *structFieldSet) (int64, *structFieldSet, error) { + for { + cursor++ + switch char(b, cursor) { + case '"': + cursor++ + return cursor, field, nil + case '\\': + cursor++ + if char(b, cursor) == nul { + return 0, nil, errUnexpectedEndOfJSON("string", cursor) + } + case nul: + return 0, nil, errUnexpectedEndOfJSON("string", cursor) + } + } +} + func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldSet, error) { var ( field *structFieldSet @@ -241,24 +304,21 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, return cursor, field, nil case nul: return 0, nil, errUnexpectedEndOfJSON("string", cursor) + case '\\': + cursor++ + chars, nextCursor := decodeKeyCharByEscapedChar(buf, cursor) + for _, c := range chars { + curBit &= bitmap[keyIdx][largeToSmallTable[c]] + if curBit == 0 { + return decodeKeyNotFound(b, cursor, field) + } + keyIdx++ + } + cursor = nextCursor default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - for { - cursor++ - switch char(b, cursor) { - case '"': - cursor++ - return cursor, field, nil - case '\\': - cursor++ - if char(b, cursor) == nul { - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - case nul: - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - } + return decodeKeyNotFound(b, cursor, field) } keyIdx++ } diff --git a/decode_test.go b/decode_test.go index 7dab500..8298a1f 100644 --- a/decode_test.go +++ b/decode_test.go @@ -3594,3 +3594,18 @@ func TestIssue218(t *testing.T) { }) } } + +func TestDecodeEscapedCharField(t *testing.T) { + b := []byte(`{"\u6D88\u606F":"\u6D88\u606F"}`) + t.Run("unmarshal", func(t *testing.T) { + v := struct { + Msg string `json:"消息"` + }{} + if err := json.Unmarshal(b, &v); err != nil { + t.Fatal(err) + } + if !bytes.Equal([]byte(v.Msg), []byte("消息")) { + t.Fatal("failed to decode unicode char") + } + }) +} From 14c828aad77095f5eb13ce04f4feee6f3bd3bda1 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Wed, 2 Jun 2021 19:01:41 +0900 Subject: [PATCH 2/3] Add test case for stream decoder --- decode_test.go | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/decode_test.go b/decode_test.go index 8298a1f..f3c2622 100644 --- a/decode_test.go +++ b/decode_test.go @@ -3608,4 +3608,15 @@ func TestDecodeEscapedCharField(t *testing.T) { t.Fatal("failed to decode unicode char") } }) + t.Run("stream", func(t *testing.T) { + v := struct { + Msg string `json:"消息"` + }{} + if err := json.NewDecoder(bytes.NewBuffer(b)).Decode(&v); err != nil { + t.Fatal(err) + } + if !bytes.Equal([]byte(v.Msg), []byte("消息")) { + t.Fatal("failed to decode unicode char") + } + }) } From 917f71bbcf46d6c66cf45604638e38cd849c7726 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Thu, 3 Jun 2021 15:00:31 +0900 Subject: [PATCH 3/3] Fix decoding fields containing escaped characters for stream decoder --- decode_struct.go | 260 ++++++++++++++++++++++++++++++----------------- 1 file changed, 166 insertions(+), 94 deletions(-) diff --git a/decode_struct.go b/decode_struct.go index d459fb8..053b8a1 100644 --- a/decode_struct.go +++ b/decode_struct.go @@ -186,7 +186,6 @@ func decodeKeyCharByEscapedChar(buf []byte, cursor int64) ([]byte, int64) { func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldSet, error) { var ( - field *structFieldSet curBit uint8 = math.MaxUint8 ) b := (*sliceHeader)(unsafe.Pointer(&buf)).data @@ -200,7 +199,7 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, switch c { case '"': cursor++ - return cursor, field, nil + return cursor, nil, nil case nul: return 0, nil, errUnexpectedEndOfJSON("string", cursor) } @@ -212,7 +211,7 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, switch c { case '"': fieldSetIndex := bits.TrailingZeros8(curBit) - field = d.sortedFieldSets[fieldSetIndex] + field := d.sortedFieldSets[fieldSetIndex] keyLen := cursor - start cursor++ if keyLen < field.keyLen { @@ -228,7 +227,7 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, for _, c := range chars { curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - return decodeKeyNotFound(b, cursor, field) + return decodeKeyNotFound(b, cursor) } keyIdx++ } @@ -236,7 +235,7 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - return decodeKeyNotFound(b, cursor, field) + return decodeKeyNotFound(b, cursor) } keyIdx++ } @@ -248,27 +247,8 @@ func decodeKeyByBitmapUint8(d *structDecoder, buf []byte, cursor int64) (int64, } } -func decodeKeyNotFound(b unsafe.Pointer, cursor int64, field *structFieldSet) (int64, *structFieldSet, error) { - for { - cursor++ - switch char(b, cursor) { - case '"': - cursor++ - return cursor, field, nil - case '\\': - cursor++ - if char(b, cursor) == nul { - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - case nul: - return 0, nil, errUnexpectedEndOfJSON("string", cursor) - } - } -} - func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldSet, error) { var ( - field *structFieldSet curBit uint16 = math.MaxUint16 ) b := (*sliceHeader)(unsafe.Pointer(&buf)).data @@ -282,7 +262,7 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, switch c { case '"': cursor++ - return cursor, field, nil + return cursor, nil, nil case nul: return 0, nil, errUnexpectedEndOfJSON("string", cursor) } @@ -294,7 +274,7 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, switch c { case '"': fieldSetIndex := bits.TrailingZeros16(curBit) - field = d.sortedFieldSets[fieldSetIndex] + field := d.sortedFieldSets[fieldSetIndex] keyLen := cursor - start cursor++ if keyLen < field.keyLen { @@ -310,7 +290,7 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, for _, c := range chars { curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - return decodeKeyNotFound(b, cursor, field) + return decodeKeyNotFound(b, cursor) } keyIdx++ } @@ -318,7 +298,7 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - return decodeKeyNotFound(b, cursor, field) + return decodeKeyNotFound(b, cursor) } keyIdx++ } @@ -330,6 +310,24 @@ func decodeKeyByBitmapUint16(d *structDecoder, buf []byte, cursor int64) (int64, } } +func decodeKeyNotFound(b unsafe.Pointer, cursor int64) (int64, *structFieldSet, error) { + for { + cursor++ + switch char(b, cursor) { + case '"': + cursor++ + return cursor, nil, nil + case '\\': + cursor++ + if char(b, cursor) == nul { + return 0, nil, errUnexpectedEndOfJSON("string", cursor) + } + case nul: + return 0, nil, errUnexpectedEndOfJSON("string", cursor) + } + } +} + func decodeKey(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldSet, error) { key, c, err := d.stringDecoder.decodeByte(buf, cursor) if err != nil { @@ -346,10 +344,9 @@ func decodeKey(d *structDecoder, buf []byte, cursor int64) (int64, *structFieldS func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, string, error) { var ( - field *structFieldSet curBit uint8 = math.MaxUint8 ) - buf, cursor, p := s.stat() + _, cursor, p := s.stat() for { switch char(p, cursor) { case ' ', '\n', '\t', '\r': @@ -357,7 +354,7 @@ func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() continue } return nil, "", errNotAtBeginningOfValue(s.totalOffset()) @@ -369,11 +366,11 @@ func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, case '"': cursor++ s.cursor = cursor - return field, "", nil + return nil, "", nil case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() goto FIRST_CHAR } return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) @@ -385,7 +382,7 @@ func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, switch c { case '"': fieldSetIndex := bits.TrailingZeros8(curBit) - field = d.sortedFieldSets[fieldSetIndex] + field := d.sortedFieldSets[fieldSetIndex] keyLen := cursor - start cursor++ s.cursor = cursor @@ -397,39 +394,30 @@ func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() continue } return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) + case '\\': + s.cursor = cursor + 1 // skip '\' char + chars, err := decodeKeyCharByEscapeCharStream(s) + if err != nil { + return nil, "", err + } + cursor = s.cursor + for _, c := range chars { + curBit &= bitmap[keyIdx][largeToSmallTable[c]] + if curBit == 0 { + s.cursor = cursor + return decodeKeyNotFoundStream(s, start) + } + keyIdx++ + } default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - for { - cursor++ - switch char(p, cursor) { - case '"': - b := buf[start:cursor] - key := *(*string)(unsafe.Pointer(&b)) - cursor++ - s.cursor = cursor - return field, key, nil - case '\\': - cursor++ - if char(p, cursor) == nul { - s.cursor = cursor - if !s.read() { - return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) - } - buf, cursor, p = s.statForRetry() - } - case nul: - s.cursor = cursor - if !s.read() { - return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) - } - buf, cursor, p = s.statForRetry() - } - } + s.cursor = cursor + return decodeKeyNotFoundStream(s, start) } keyIdx++ } @@ -443,10 +431,9 @@ func decodeKeyByBitmapUint8Stream(d *structDecoder, s *stream) (*structFieldSet, func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet, string, error) { var ( - field *structFieldSet curBit uint16 = math.MaxUint16 ) - buf, cursor, p := s.stat() + _, cursor, p := s.stat() for { switch char(p, cursor) { case ' ', '\n', '\t', '\r': @@ -454,7 +441,7 @@ func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() continue } return nil, "", errNotAtBeginningOfValue(s.totalOffset()) @@ -466,11 +453,11 @@ func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet case '"': cursor++ s.cursor = cursor - return field, "", nil + return nil, "", nil case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() goto FIRST_CHAR } return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) @@ -482,7 +469,7 @@ func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet switch c { case '"': fieldSetIndex := bits.TrailingZeros16(curBit) - field = d.sortedFieldSets[fieldSetIndex] + field := d.sortedFieldSets[fieldSetIndex] keyLen := cursor - start cursor++ s.cursor = cursor @@ -494,39 +481,30 @@ func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet case nul: s.cursor = cursor if s.read() { - buf, cursor, p = s.stat() + _, cursor, p = s.stat() continue } return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) + case '\\': + s.cursor = cursor + 1 // skip '\' char + chars, err := decodeKeyCharByEscapeCharStream(s) + if err != nil { + return nil, "", err + } + cursor = s.cursor + for _, c := range chars { + curBit &= bitmap[keyIdx][largeToSmallTable[c]] + if curBit == 0 { + s.cursor = cursor + return decodeKeyNotFoundStream(s, start) + } + keyIdx++ + } default: curBit &= bitmap[keyIdx][largeToSmallTable[c]] if curBit == 0 { - for { - cursor++ - switch char(p, cursor) { - case '"': - b := buf[start:cursor] - key := *(*string)(unsafe.Pointer(&b)) - cursor++ - s.cursor = cursor - return field, key, nil - case '\\': - cursor++ - if char(p, cursor) == nul { - s.cursor = cursor - if !s.read() { - return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) - } - buf, cursor, p = s.statForRetry() - } - case nul: - s.cursor = cursor - if !s.read() { - return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) - } - buf, cursor, p = s.statForRetry() - } - } + s.cursor = cursor + return decodeKeyNotFoundStream(s, start) } keyIdx++ } @@ -538,6 +516,100 @@ func decodeKeyByBitmapUint16Stream(d *structDecoder, s *stream) (*structFieldSet } } +// decode from '\uXXXX' +func decodeKeyCharByUnicodeRuneStream(s *stream) ([]byte, error) { + const defaultOffset = 4 + const surrogateOffset = 6 + + if s.cursor+defaultOffset >= s.length { + if !s.read() { + return nil, errInvalidCharacter(s.char(), "escaped unicode char", s.totalOffset()) + } + } + + r := unicodeToRune(s.buf[s.cursor : s.cursor+defaultOffset]) + if utf16.IsSurrogate(r) { + s.cursor += defaultOffset + if s.cursor+surrogateOffset >= s.length { + s.read() + } + if s.cursor+surrogateOffset >= s.length || s.buf[s.cursor] != '\\' || s.buf[s.cursor+1] != 'u' { + s.cursor += defaultOffset - 1 + return []byte(string(unicode.ReplacementChar)), nil + } + r2 := unicodeToRune(s.buf[s.cursor+defaultOffset+2 : s.cursor+surrogateOffset]) + if r := utf16.DecodeRune(r, r2); r != unicode.ReplacementChar { + s.cursor += defaultOffset - 1 + return []byte(string(r)), nil + } + } + s.cursor += defaultOffset - 1 + return []byte(string(r)), nil +} + +func decodeKeyCharByEscapeCharStream(s *stream) ([]byte, error) { + c := s.buf[s.cursor] + s.cursor++ +RETRY: + switch c { + case '"': + return []byte{'"'}, nil + case '\\': + return []byte{'\\'}, nil + case '/': + return []byte{'/'}, nil + case 'b': + return []byte{'\b'}, nil + case 'f': + return []byte{'\f'}, nil + case 'n': + return []byte{'\n'}, nil + case 'r': + return []byte{'\r'}, nil + case 't': + return []byte{'\t'}, nil + case 'u': + return decodeKeyCharByUnicodeRuneStream(s) + case nul: + if !s.read() { + return nil, errInvalidCharacter(s.char(), "escaped char", s.totalOffset()) + } + goto RETRY + default: + return nil, errUnexpectedEndOfJSON("struct field", s.totalOffset()) + } +} + +func decodeKeyNotFoundStream(s *stream, start int64) (*structFieldSet, string, error) { + buf, cursor, p := s.stat() + for { + cursor++ + switch char(p, cursor) { + case '"': + b := buf[start:cursor] + key := *(*string)(unsafe.Pointer(&b)) + cursor++ + s.cursor = cursor + return nil, key, nil + case '\\': + cursor++ + if char(p, cursor) == nul { + s.cursor = cursor + if !s.read() { + return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) + } + buf, cursor, p = s.statForRetry() + } + case nul: + s.cursor = cursor + if !s.read() { + return nil, "", errUnexpectedEndOfJSON("string", s.totalOffset()) + } + buf, cursor, p = s.statForRetry() + } + } +} + func decodeKeyStream(d *structDecoder, s *stream) (*structFieldSet, string, error) { key, err := d.stringDecoder.decodeStreamByte(s) if err != nil {