From 4bd7d2399f4eb8f9e1a1cd2e258700529f0dd73a Mon Sep 17 00:00:00 2001 From: Nao Yonashiro Date: Sat, 12 Feb 2022 17:25:52 +0900 Subject: [PATCH 1/2] feat: improve performance when a payload contains escape sequence --- internal/decoder/string.go | 66 ++++++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/internal/decoder/string.go b/internal/decoder/string.go index 65b1004..dea8647 100644 --- a/internal/decoder/string.go +++ b/internal/decoder/string.go @@ -1,6 +1,7 @@ package decoder import ( + "bytes" "reflect" "unicode" "unicode/utf16" @@ -308,49 +309,30 @@ func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, err cursor++ start := cursor b := (*sliceHeader)(unsafe.Pointer(&buf)).data + escaped := 0 for { switch char(b, cursor) { case '\\': + escaped++ cursor++ switch char(b, cursor) { - case '"': - buf[cursor] = '"' - buf = append(buf[:cursor-1], buf[cursor:]...) - case '\\': - buf[cursor] = '\\' - buf = append(buf[:cursor-1], buf[cursor:]...) - case '/': - buf[cursor] = '/' - buf = append(buf[:cursor-1], buf[cursor:]...) - case 'b': - buf[cursor] = '\b' - buf = append(buf[:cursor-1], buf[cursor:]...) - case 'f': - buf[cursor] = '\f' - buf = append(buf[:cursor-1], buf[cursor:]...) - case 'n': - buf[cursor] = '\n' - buf = append(buf[:cursor-1], buf[cursor:]...) - case 'r': - buf[cursor] = '\r' - buf = append(buf[:cursor-1], buf[cursor:]...) - case 't': - buf[cursor] = '\t' - buf = append(buf[:cursor-1], buf[cursor:]...) + case '"', '\\', '/', 'b', 'f', 'n', 'r', 't': + cursor++ case 'u': buflen := int64(len(buf)) if cursor+5 >= buflen { return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor) } - code := unicodeToRune(buf[cursor+1 : cursor+5]) - unicode := []byte(string(code)) - buf = append(append(buf[:cursor-1], unicode...), buf[cursor+5:]...) + cursor += 5 default: return nil, 0, errors.ErrUnexpectedEndOfJSON("escaped string", cursor) } continue case '"': literal := buf[start:cursor] + if escaped > 0 { + literal = literal[:unescapeString(literal, escaped)] + } cursor++ return literal, cursor, nil case nul: @@ -369,3 +351,33 @@ func (d *stringDecoder) decodeByte(buf []byte, cursor int64) ([]byte, int64, err } } } + +var unescapeMap = [256]byte{ + '"': '"', + '\\': '\\', + '/': '/', + 'b': '\b', + 'f': '\f', + 'n': '\n', + 'r': '\r', + 't': '\t', +} + +func unescapeString(buf []byte, escaped int) int { + cursor := 0 + for i := 0; i < escaped; i++ { + cursor += bytes.IndexByte(buf[cursor:], '\\') + c := buf[cursor+1] + if c == 'u' { + code := unicodeToRune(buf[cursor+2 : cursor+6]) + unicode := []byte(string(code)) + buf = append(append(buf[:cursor], unicode...), buf[cursor+6:]...) + cursor += len(unicode) + } else { + buf[cursor+1] = unescapeMap[c] + buf = append(buf[:cursor], buf[cursor+1:]...) + cursor++ + } + } + return len(buf) +} From 62b28d102edcdb70a1b46fd246e16c11f88dcc71 Mon Sep 17 00:00:00 2001 From: Nao Yonashiro Date: Sat, 12 Feb 2022 17:55:10 +0900 Subject: [PATCH 2/2] test: add benchmark --- benchmarks/decode_test.go | 10 ++++++++++ benchmarks/large_payload.go | 3 +++ 2 files changed, 13 insertions(+) diff --git a/benchmarks/decode_test.go b/benchmarks/decode_test.go index 0626219..5f2a4c2 100644 --- a/benchmarks/decode_test.go +++ b/benchmarks/decode_test.go @@ -477,3 +477,13 @@ func Benchmark_Decode_LargeStruct_Stream_GoJsonFirstWinMode(b *testing.B) { } } } + +func Benchmark_Decode_LargeSlice_EscapedString_GoJson(b *testing.B) { + b.ReportAllocs() + for i := 0; i < b.N; i++ { + var v []string + if err := gojson.Unmarshal(LargeSliceEscapedString, &v); err != nil { + b.Fatal(err) + } + } +} diff --git a/benchmarks/large_payload.go b/benchmarks/large_payload.go index f6d1c1a..42d8621 100644 --- a/benchmarks/large_payload.go +++ b/benchmarks/large_payload.go @@ -2,6 +2,7 @@ package benchmark import ( "strconv" + "strings" "github.com/francoispqt/gojay" ) @@ -208,3 +209,5 @@ func NewLargePayloadEasyJson() *LargePayloadEasyJson { }, } } + +var LargeSliceEscapedString = []byte("[" + strings.Repeat(",\"simple plain text\\r\\n\"", 10000)[1:] + "]")