From 118663d59f975457ba51169010e6630947e40aba Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Tue, 28 Dec 2021 03:22:39 +0900 Subject: [PATCH] Update SIMD codes --- internal/cmd/generator/simd/string.c | 110 ++++++- internal/cmd/generator/simd/string_avx.go | 12 + internal/cmd/generator/vm.go.tmpl | 7 +- internal/encoder/encoder.go | 12 +- internal/encoder/string.go | 38 +-- internal/encoder/string_avx.go | 12 + internal/encoder/string_avx.s | 352 +++++++++++++++++++++- internal/encoder/vm/vm.go | 7 +- internal/encoder/vm_color/vm.go | 7 +- internal/encoder/vm_color_indent/vm.go | 7 +- internal/encoder/vm_indent/vm.go | 7 +- 11 files changed, 517 insertions(+), 54 deletions(-) diff --git a/internal/cmd/generator/simd/string.c b/internal/cmd/generator/simd/string.c index 2db3f95..ebbb10b 100644 --- a/internal/cmd/generator/simd/string.c +++ b/internal/cmd/generator/simd/string.c @@ -3,7 +3,7 @@ #include #include -uint64_t findEscapeIndex64(char *buf, int len) { +uint64_t findHTMLEscapeIndex64(char *buf, int len) { static const uint64_t lsb = 0x0101010101010101; static const uint64_t msb = 0x8080808080808080; @@ -29,7 +29,7 @@ uint64_t findEscapeIndex64(char *buf, int len) { return 8 * chunkLen; } -uint64_t findEscapeIndex128(char *buf, int len) { +uint64_t findHTMLEscapeIndex128(char *buf, int len) { static const uint64_t lsb = 0x0101010101010101; static const uint64_t msb = 0x8080808080808080; @@ -71,12 +71,12 @@ uint64_t findEscapeIndex128(char *buf, int len) { } int idx = 16 * chunkLen; if (len - idx >= 8) { - return idx + findEscapeIndex64(sp, len - idx); + return idx + findHTMLEscapeIndex64(sp, len - idx); } return idx; } -uint64_t findEscapeIndex256(char *buf, int len) { +uint64_t findHTMLEscapeIndex256(char *buf, int len) { static const uint64_t lsb = 0x0101010101010101; static const uint64_t msb = 0x8080808080808080; @@ -118,6 +118,108 @@ uint64_t findEscapeIndex256(char *buf, int len) { } int idx = 32 * chunkLen; int remainLen = len - idx; + if (remainLen >= 16) { + return idx + findHTMLEscapeIndex128(sp, remainLen); + } else if (remainLen >= 8) { + return idx + findHTMLEscapeIndex64(sp, remainLen); + } + return idx; +} + +uint64_t findEscapeIndex64(char *buf, int len) { + static const uint64_t lsb = 0x0101010101010101; + static const uint64_t msb = 0x8080808080808080; + + static const uint64_t space = lsb * 0x20; + static const uint64_t quote = lsb * '"'; + static const uint64_t escape = lsb * '\\'; + + char *sp = buf; + size_t chunkLen = len / 8; + int chunkIdx = 0; + for (; chunkIdx < chunkLen; chunkIdx++) { + uint64_t n = *(uint64_t *)sp; + uint64_t mask = n | (n - space) | ((n ^ quote) - lsb) | ((n ^ escape) - lsb); + uint64_t masked = mask & msb; + if (masked != 0) { + return __builtin_ctz(masked) / 8; + } + sp += 8; + } + return 8 * chunkLen; +} + +uint64_t findEscapeIndex128(char *buf, int len) { + static const uint64_t lsb = 0x0101010101010101; + static const uint64_t msb = 0x8080808080808080; + + static const __m64 space = (__m64)(lsb * 0x20); + static const __m64 quote = (__m64)(lsb * '"'); + static const __m64 escape = (__m64)(lsb * '\\'); + + __m128i zeroV = _mm_setzero_si128(); + __m128i msbV = _mm_set_epi64((__m64)(msb), (__m64)(msb)); + __m128i lsbV = _mm_set_epi64((__m64)(lsb), (__m64)(lsb)); + __m128i spaceV = _mm_set_epi64(space, space); + __m128i quoteV = _mm_set_epi64(quote, quote); + __m128i escapeV = _mm_set_epi64(escape, escape); + + char *sp = buf; + size_t chunkLen = len / 16; + int chunkIdx = 0; + for (; chunkIdx < chunkLen; chunkIdx++) { + __m128i n = _mm_loadu_si128((const void *)sp); + __m128i spaceN = _mm_sub_epi64(n, spaceV); + __m128i quoteN = _mm_sub_epi64(_mm_xor_si128(n, quoteV), lsbV); + __m128i escapeN = _mm_sub_epi64(_mm_xor_si128(n, escapeV), lsbV); + + __m128i mask = _mm_or_si128(_mm_or_si128(_mm_or_si128(n, spaceN), quoteN), escapeN); + int movemask = _mm_movemask_epi8(_mm_and_si128(mask, msbV)); + if (movemask != 0) { + return __builtin_ctz(movemask); + } + sp += 16; + } + int idx = 16 * chunkLen; + if (len - idx >= 8) { + return idx + findEscapeIndex64(sp, len - idx); + } + return idx; +} + +uint64_t findEscapeIndex256(char *buf, int len) { + static const uint64_t lsb = 0x0101010101010101; + static const uint64_t msb = 0x8080808080808080; + + static const __m64 space = (__m64)(lsb * 0x20); + static const __m64 quote = (__m64)(lsb * '"'); + static const __m64 escape = (__m64)(lsb * '\\'); + + __m256i zeroV = _mm256_setzero_si256(); + __m256i msbV = _mm256_set1_epi64x(msb); + __m256i lsbV = _mm256_set1_epi64x(lsb); + __m256i spaceV = _mm256_set1_epi64x(space); + __m256i quoteV = _mm256_set1_epi64x(quote); + __m256i escapeV = _mm256_set1_epi64x(escape); + + char *sp = buf; + size_t chunkLen = len / 32; + int chunkIdx = 0; + for (; chunkIdx < chunkLen; chunkIdx++) { + __m256i n = _mm256_loadu_si256((const void *)sp); + __m256i spaceN = _mm256_sub_epi64(n, spaceV); + __m256i quoteN = _mm256_sub_epi64(_mm256_xor_si256(n, quoteV), lsbV); + __m256i escapeN = _mm256_sub_epi64(_mm256_xor_si256(n, escapeV), lsbV); + + __m256i mask = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(n, spaceN), quoteN), escapeN); + int movemask = _mm256_movemask_epi8(_mm256_and_si256(mask, msbV)); + if (movemask != 0) { + return __builtin_ctz(movemask); + } + sp += 32; + } + int idx = 32 * chunkLen; + int remainLen = len - idx; if (remainLen >= 16) { return idx + findEscapeIndex128(sp, remainLen); } else if (remainLen >= 8) { diff --git a/internal/cmd/generator/simd/string_avx.go b/internal/cmd/generator/simd/string_avx.go index 75ce996..4fd3b0e 100644 --- a/internal/cmd/generator/simd/string_avx.go +++ b/internal/cmd/generator/simd/string_avx.go @@ -2,6 +2,18 @@ package encoder import "unsafe" +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex64(buf unsafe.Pointer, len int) (ret int) + +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex128(buf unsafe.Pointer, len int) (ret int) + +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex256(buf unsafe.Pointer, len int) (ret int) + //go:nosplit //go:noescape func _findEscapeIndex64(buf unsafe.Pointer, len int) (ret int) diff --git a/internal/cmd/generator/vm.go.tmpl b/internal/cmd/generator/vm.go.tmpl index 4be6b80..5a3f636 100644 --- a/internal/cmd/generator/vm.go.tmpl +++ b/internal/cmd/generator/vm.go.tmpl @@ -402,12 +402,13 @@ func Run(ctx *encoder.RuntimeContext, b []byte, codeSet *encoder.OpcodeSet) ([]b code = code.End.Next break } - b = appendStructHead(ctx, b) - mapCtx := encoder.NewMapContext(mlen) + unorderedMap := (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 + mapCtx := encoder.NewMapContext(mlen, unorderedMap) mapiterinit(code.Type, uptr, &mapCtx.Iter) store(ctxptr, code.Idx, uintptr(unsafe.Pointer(mapCtx))) ctx.KeepRefs = append(ctx.KeepRefs, unsafe.Pointer(mapCtx)) - if (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 { + b = appendStructHead(ctx, b) + if unorderedMap { b = appendMapKeyIndent(ctx, code.Next, b) } else { mapCtx.Start = len(b) diff --git a/internal/encoder/encoder.go b/internal/encoder/encoder.go index 79a3f64..714f060 100644 --- a/internal/encoder/encoder.go +++ b/internal/encoder/encoder.go @@ -259,12 +259,14 @@ var mapContextPool = sync.Pool{ }, } -func NewMapContext(mapLen int) *MapContext { +func NewMapContext(mapLen int, unorderedMap bool) *MapContext { ctx := mapContextPool.Get().(*MapContext) - if len(ctx.Slice.Items) < mapLen { - ctx.Slice.Items = make([]MapItem, mapLen) - } else { - ctx.Slice.Items = ctx.Slice.Items[:mapLen] + if !unorderedMap { + if len(ctx.Slice.Items) < mapLen { + ctx.Slice.Items = make([]MapItem, mapLen) + } else { + ctx.Slice.Items = ctx.Slice.Items[:mapLen] + } } ctx.Buf = ctx.Buf[:0] ctx.Iter = mapIter{} diff --git a/internal/encoder/string.go b/internal/encoder/string.go index c146b06..115b4ab 100644 --- a/internal/encoder/string.go +++ b/internal/encoder/string.go @@ -1,7 +1,6 @@ package encoder import ( - "math/bits" "reflect" "unsafe" @@ -374,11 +373,11 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { switch valLen { case 1, 2, 3, 4, 5, 6, 7: case 8, 9, 10, 11, 12, 13, 14, 15: - j = _findEscapeIndex64((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) + j = _findHTMLEscapeIndex64((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) case 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31: - j = _findEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) + j = _findHTMLEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) default: - j = _findEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) + j = _findHTMLEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) } for j < valLen { c := s[j] @@ -479,30 +478,15 @@ func appendString(buf []byte, s string) []byte { var ( i, j int ) - if valLen >= 8 { - chunks := stringToUint64Slice(s) - for _, n := range chunks { - // combine masks before checking for the MSB of each byte. We include - // `n` in the mask to check whether any of the *input* byte MSBs were - // set (i.e. the byte was outside the ASCII range). - mask := n | (n - (lsb * 0x20)) | - ((n ^ (lsb * '"')) - lsb) | - ((n ^ (lsb * '\\')) - lsb) - if (mask & msb) != 0 { - j = bits.TrailingZeros64(mask&msb) / 8 - goto ESCAPE_END - } - } - valLen := len(s) - for i := len(chunks) * 8; i < valLen; i++ { - if needEscape[s[i]] { - j = i - goto ESCAPE_END - } - } - return append(append(buf, s...), '"') + switch valLen { + case 1, 2, 3, 4, 5, 6, 7: + case 8, 9, 10, 11, 12, 13, 14, 15: + j = _findEscapeIndex64((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) + case 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31: + j = _findEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) + default: + j = _findEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) } -ESCAPE_END: for j < valLen { c := s[j] diff --git a/internal/encoder/string_avx.go b/internal/encoder/string_avx.go index 75ce996..4fd3b0e 100644 --- a/internal/encoder/string_avx.go +++ b/internal/encoder/string_avx.go @@ -2,6 +2,18 @@ package encoder import "unsafe" +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex64(buf unsafe.Pointer, len int) (ret int) + +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex128(buf unsafe.Pointer, len int) (ret int) + +//go:nosplit +//go:noescape +func _findHTMLEscapeIndex256(buf unsafe.Pointer, len int) (ret int) + //go:nosplit //go:noescape func _findEscapeIndex64(buf unsafe.Pointer, len int) (ret int) diff --git a/internal/encoder/string_avx.s b/internal/encoder/string_avx.s index f9bad0c..e125b04 100644 --- a/internal/encoder/string_avx.s +++ b/internal/encoder/string_avx.s @@ -1,7 +1,7 @@ //+build !noasm !appengine // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT -TEXT ·_findEscapeIndex64(SB), $0-24 +TEXT ·_findHTMLEscapeIndex64(SB), $0-24 MOVQ buf+0(FP), DI MOVQ len+8(FP), SI @@ -84,7 +84,7 @@ DATA LCDATA1<>+0x070(SB)/8, $0x8080808080808080 DATA LCDATA1<>+0x078(SB)/8, $0x8080808080808080 GLOBL LCDATA1<>(SB), 8, $128 -TEXT ·_findEscapeIndex128(SB), $16-24 +TEXT ·_findHTMLEscapeIndex128(SB), $16-24 MOVQ buf+0(FP), DI MOVQ len+8(FP), SI @@ -237,7 +237,7 @@ DATA LCDATA2<>+0x0d0(SB)/8, $0x8080808080808080 DATA LCDATA2<>+0x0d8(SB)/8, $0x8080808080808080 GLOBL LCDATA2<>(SB), 8, $224 -TEXT ·_findEscapeIndex256(SB), $16-24 +TEXT ·_findHTMLEscapeIndex256(SB), $16-24 MOVQ buf+0(FP), DI MOVQ len+8(FP), SI @@ -470,3 +470,349 @@ LBB2_25: VZEROUPPER MOVQ AX, ret+16(FP) RET + +TEXT ·_findEscapeIndex64(SB), $0-24 + + MOVQ buf+0(FP), DI + MOVQ len+8(FP), SI + + WORD $0xf089 // mov eax, esi + WORD $0xf8c1; BYTE $0x1f // sar eax, 31 + WORD $0xe8c1; BYTE $0x1d // shr eax, 29 + WORD $0xf001 // add eax, esi + WORD $0xf8c1; BYTE $0x03 // sar eax, 3 + WORD $0x9848 // cdqe + WORD $0xc683; BYTE $0x07 // add esi, 7 + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JB LBB3_5 + QUAD $0xfefefefefeffbe49; WORD $0xfefe // mov r14, -72340172838076673 + WORD $0xd231 // xor edx, edx + QUAD $0xdfdfdfdfdfe0b849; WORD $0xdfdf // mov r8, -2314885530818453536 + QUAD $0x222222222222b949; WORD $0x2222 // mov r9, 2459565876494606882 + QUAD $0x5c5c5c5c5c5cba49; WORD $0x5c5c // mov r10, 6655295901103053916 + QUAD $0x808080808080bb49; WORD $0x8080 // mov r11, -9187201950435737472 + +LBB3_2: + LONG $0xd7348b48 // mov rsi, qword [rdi + 8*rdx] + LONG $0x061c8d4a // lea rbx, [rsi + r8] + WORD $0x0948; BYTE $0xf3 // or rbx, rsi + WORD $0x8948; BYTE $0xf1 // mov rcx, rsi + WORD $0x314c; BYTE $0xc9 // xor rcx, r9 + WORD $0x014c; BYTE $0xf1 // add rcx, r14 + WORD $0x0948; BYTE $0xd9 // or rcx, rbx + WORD $0x314c; BYTE $0xd6 // xor rsi, r10 + WORD $0x014c; BYTE $0xf6 // add rsi, r14 + WORD $0x0948; BYTE $0xce // or rsi, rcx + WORD $0x214c; BYTE $0xde // and rsi, r11 + JNE LBB3_3 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + JB LBB3_2 + +LBB3_5: + LONG $0x03e0c148 // shl rax, 3 + JMP LBB3_6 + +LBB3_3: + WORD $0xbc0f; BYTE $0xc6 // bsf eax, esi + WORD $0xe8c1; BYTE $0x03 // shr eax, 3 + +LBB3_6: + MOVQ AX, ret+16(FP) + RET + +DATA LCDATA3<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA3<>+0x008(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA3<>+0x010(SB)/8, $0x2222222222222222 +DATA LCDATA3<>+0x018(SB)/8, $0x2222222222222222 +DATA LCDATA3<>+0x020(SB)/8, $0xfefefefefefefeff +DATA LCDATA3<>+0x028(SB)/8, $0xfefefefefefefeff +DATA LCDATA3<>+0x030(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA3<>+0x038(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA3<>+0x040(SB)/8, $0x8080808080808080 +DATA LCDATA3<>+0x048(SB)/8, $0x8080808080808080 +GLOBL LCDATA3<>(SB), 8, $80 + +TEXT ·_findEscapeIndex128(SB), $0-24 + + MOVQ buf+0(FP), DI + MOVQ len+8(FP), SI + LEAQ LCDATA3<>(SB), BP + + WORD $0xf089 // mov eax, esi + WORD $0xf8c1; BYTE $0x1f // sar eax, 31 + WORD $0xe8c1; BYTE $0x1c // shr eax, 28 + WORD $0xf001 // add eax, esi + WORD $0xf8c1; BYTE $0x04 // sar eax, 4 + WORD $0x4e8d; BYTE $0x0f // lea ecx, [rsi + 15] + WORD $0xf983; BYTE $0x1f // cmp ecx, 31 + JB LBB4_5 + WORD $0x6348; BYTE $0xc8 // movsxd rcx, eax + WORD $0xd231 // xor edx, edx + LONG $0x456f79c5; BYTE $0x00 // vmovdqa xmm8, oword 0[rbp] /* [rip + .LCPI4_0] */ + LONG $0x4d6ff9c5; BYTE $0x10 // vmovdqa xmm1, oword 16[rbp] /* [rip + .LCPI4_1] */ + LONG $0x556ff9c5; BYTE $0x20 // vmovdqa xmm2, oword 32[rbp] /* [rip + .LCPI4_2] */ + LONG $0x5d6ff9c5; BYTE $0x30 // vmovdqa xmm3, oword 48[rbp] /* [rip + .LCPI4_3] */ + LONG $0x656ff9c5; BYTE $0x40 // vmovdqa xmm4, oword 64[rbp] /* [rip + .LCPI4_4] */ + +LBB4_2: + LONG $0x2f6ffac5 // vmovdqu xmm5, oword [rdi] + LONG $0xd451c1c4; BYTE $0xf0 // vpaddq xmm6, xmm5, xmm8 + LONG $0xf9efd1c5 // vpxor xmm7, xmm5, xmm1 + LONG $0xfad4c1c5 // vpaddq xmm7, xmm7, xmm2 + LONG $0xc3efd1c5 // vpxor xmm0, xmm5, xmm3 + LONG $0xc2d4f9c5 // vpaddq xmm0, xmm0, xmm2 + LONG $0xedebc9c5 // vpor xmm5, xmm6, xmm5 + LONG $0xefebd1c5 // vpor xmm5, xmm5, xmm7 + LONG $0xc0ebd1c5 // vpor xmm0, xmm5, xmm0 + LONG $0xc4dbf9c5 // vpand xmm0, xmm0, xmm4 + LONG $0xd8d7f9c5 // vpmovmskb ebx, xmm0 + WORD $0xdb85 // test ebx, ebx + JNE LBB4_3 + LONG $0x10c78348 // add rdi, 16 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xca // cmp rdx, rcx + JB LBB4_2 + +LBB4_5: + WORD $0xe0c1; BYTE $0x04 // shl eax, 4 + WORD $0xc629 // sub esi, eax + WORD $0x9848 // cdqe + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB4_12 + QUAD $0xfefefefefeffb849; WORD $0xfefe // mov r8, -72340172838076673 + WORD $0x8941; BYTE $0xf7 // mov r15d, esi + LONG $0x03efc141 // shr r15d, 3 + WORD $0xc931 // xor ecx, ecx + QUAD $0xdfdfdfdfdfe0b949; WORD $0xdfdf // mov r9, -2314885530818453536 + QUAD $0x222222222222ba49; WORD $0x2222 // mov r10, 2459565876494606882 + QUAD $0x5c5c5c5c5c5cbb49; WORD $0x5c5c // mov r11, 6655295901103053916 + QUAD $0x808080808080be49; WORD $0x8080 // mov r14, -9187201950435737472 + +LBB4_7: + LONG $0xcf1c8b48 // mov rbx, qword [rdi + 8*rcx] + LONG $0x0b248d4e // lea r12, [rbx + r9] + WORD $0x0949; BYTE $0xdc // or r12, rbx + WORD $0x8948; BYTE $0xda // mov rdx, rbx + WORD $0x314c; BYTE $0xd2 // xor rdx, r10 + WORD $0x014c; BYTE $0xc2 // add rdx, r8 + WORD $0x094c; BYTE $0xe2 // or rdx, r12 + WORD $0x314c; BYTE $0xdb // xor rbx, r11 + WORD $0x014c; BYTE $0xc3 // add rbx, r8 + WORD $0x0948; BYTE $0xd3 // or rbx, rdx + WORD $0x214c; BYTE $0xf3 // and rbx, r14 + JNE LBB4_8 + LONG $0x01c18348 // add rcx, 1 + WORD $0x394c; BYTE $0xf9 // cmp rcx, r15 + JB LBB4_7 + WORD $0xe683; BYTE $0xf8 // and esi, -8 + JMP LBB4_11 + +LBB4_3: + WORD $0xbc0f; BYTE $0xc3 // bsf eax, ebx + JMP LBB4_12 + +LBB4_8: + WORD $0xbc0f; BYTE $0xf3 // bsf esi, ebx + WORD $0xeec1; BYTE $0x03 // shr esi, 3 + +LBB4_11: + WORD $0xf189 // mov ecx, esi + WORD $0x0148; BYTE $0xc8 // add rax, rcx + +LBB4_12: + MOVQ AX, ret+16(FP) + RET + +DATA LCDATA4<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA4<>+0x008(SB)/8, $0x2222222222222222 +DATA LCDATA4<>+0x010(SB)/8, $0xfefefefefefefeff +DATA LCDATA4<>+0x018(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA4<>+0x020(SB)/8, $0x8080808080808080 +DATA LCDATA4<>+0x028(SB)/8, $0x8080808080808080 +DATA LCDATA4<>+0x030(SB)/8, $0x8080808080808080 +DATA LCDATA4<>+0x038(SB)/8, $0x8080808080808080 +DATA LCDATA4<>+0x040(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA4<>+0x048(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA4<>+0x050(SB)/8, $0x2222222222222222 +DATA LCDATA4<>+0x058(SB)/8, $0x2222222222222222 +DATA LCDATA4<>+0x060(SB)/8, $0xfefefefefefefeff +DATA LCDATA4<>+0x068(SB)/8, $0xfefefefefefefeff +DATA LCDATA4<>+0x070(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA4<>+0x078(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA4<>+0x080(SB)/8, $0x8080808080808080 +DATA LCDATA4<>+0x088(SB)/8, $0x8080808080808080 +GLOBL LCDATA4<>(SB), 8, $144 + +TEXT ·_findEscapeIndex256(SB), $0-24 + + MOVQ buf+0(FP), DI + MOVQ len+8(FP), SI + LEAQ LCDATA4<>(SB), BP + + WORD $0xf089 // mov eax, esi + WORD $0xf8c1; BYTE $0x1f // sar eax, 31 + WORD $0xe8c1; BYTE $0x1b // shr eax, 27 + WORD $0xf001 // add eax, esi + WORD $0xf8c1; BYTE $0x05 // sar eax, 5 + WORD $0x4e8d; BYTE $0x1f // lea ecx, [rsi + 31] + WORD $0xf983; BYTE $0x3f // cmp ecx, 63 + JB LBB5_4 + WORD $0x6348; BYTE $0xc8 // movsxd rcx, eax + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI5_0] */ + LONG $0x597de2c4; WORD $0x084d // vpbroadcastq ymm1, qword 8[rbp] /* [rip + .LCPI5_1] */ + LONG $0x597de2c4; WORD $0x1055 // vpbroadcastq ymm2, qword 16[rbp] /* [rip + .LCPI5_2] */ + WORD $0xd231 // xor edx, edx + LONG $0x597de2c4; WORD $0x185d // vpbroadcastq ymm3, qword 24[rbp] /* [rip + .LCPI5_3] */ + LONG $0x656ffdc5; BYTE $0x20 // vmovdqa ymm4, yword 32[rbp] /* [rip + .LCPI5_4] */ + +LBB5_2: + LONG $0x2f6ffec5 // vmovdqu ymm5, yword [rdi] + LONG $0xf0d4d5c5 // vpaddq ymm6, ymm5, ymm0 + LONG $0xf9efd5c5 // vpxor ymm7, ymm5, ymm1 + LONG $0xfad4c5c5 // vpaddq ymm7, ymm7, ymm2 + LONG $0xc3ef55c5 // vpxor ymm8, ymm5, ymm3 + LONG $0xc2d43dc5 // vpaddq ymm8, ymm8, ymm2 + LONG $0xedebcdc5 // vpor ymm5, ymm6, ymm5 + LONG $0xefebd5c5 // vpor ymm5, ymm5, ymm7 + LONG $0xeb55c1c4; BYTE $0xe8 // vpor ymm5, ymm5, ymm8 + LONG $0xecdbd5c5 // vpand ymm5, ymm5, ymm4 + LONG $0xddd7fdc5 // vpmovmskb ebx, ymm5 + WORD $0xdb85 // test ebx, ebx + JNE LBB5_18 + LONG $0x20c78348 // add rdi, 32 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xca // cmp rdx, rcx + JB LBB5_2 + +LBB5_4: + WORD $0xe0c1; BYTE $0x05 // shl eax, 5 + WORD $0xc629 // sub esi, eax + WORD $0xfe83; BYTE $0x10 // cmp esi, 16 + JL LBB5_13 + WORD $0x634c; BYTE $0xc0 // movsxd r8, eax + WORD $0xf089 // mov eax, esi + WORD $0xe8c1; BYTE $0x04 // shr eax, 4 + WORD $0xc931 // xor ecx, ecx + LONG $0x456f79c5; BYTE $0x40 // vmovdqa xmm8, oword 64[rbp] /* [rip + .LCPI5_5] */ + LONG $0x4d6ff9c5; BYTE $0x50 // vmovdqa xmm1, oword 80[rbp] /* [rip + .LCPI5_6] */ + LONG $0x556ff9c5; BYTE $0x60 // vmovdqa xmm2, oword 96[rbp] /* [rip + .LCPI5_7] */ + LONG $0x5d6ff9c5; BYTE $0x70 // vmovdqa xmm3, oword 112[rbp] /* [rip + .LCPI5_8] */ + QUAD $0x00000080a56ff9c5 // vmovdqa xmm4, oword 128[rbp] /* [rip + .LCPI5_9] */ + +LBB5_6: + LONG $0x2f6ffac5 // vmovdqu xmm5, oword [rdi] + LONG $0xd451c1c4; BYTE $0xf0 // vpaddq xmm6, xmm5, xmm8 + LONG $0xf9efd1c5 // vpxor xmm7, xmm5, xmm1 + LONG $0xfad4c1c5 // vpaddq xmm7, xmm7, xmm2 + LONG $0xc3efd1c5 // vpxor xmm0, xmm5, xmm3 + LONG $0xc2d4f9c5 // vpaddq xmm0, xmm0, xmm2 + LONG $0xedebc9c5 // vpor xmm5, xmm6, xmm5 + LONG $0xefebd1c5 // vpor xmm5, xmm5, xmm7 + LONG $0xc0ebd1c5 // vpor xmm0, xmm5, xmm0 + LONG $0xc4dbf9c5 // vpand xmm0, xmm0, xmm4 + LONG $0xd0d7f9c5 // vpmovmskb edx, xmm0 + WORD $0xd285 // test edx, edx + JNE LBB5_19 + LONG $0x10c78348 // add rdi, 16 + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xc1 // cmp rcx, rax + JB LBB5_6 + WORD $0xf089 // mov eax, esi + WORD $0xe083; BYTE $0xf0 // and eax, -16 + WORD $0xc629 // sub esi, eax + WORD $0x9848 // cdqe + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB5_24 + QUAD $0xfefefefefeffbf49; WORD $0xfefe // mov r15, -72340172838076673 + QUAD $0xdfdfdfdfdfe0b949; WORD $0xdfdf // mov r9, -2314885530818453536 + QUAD $0x808080808080ba49; WORD $0x8080 // mov r10, -9187201950435737472 + QUAD $0x5c5c5c5c5c5cbb49; WORD $0x5c5c // mov r11, 6655295901103053916 + QUAD $0x222222222222be49; WORD $0x2222 // mov r14, 2459565876494606882 + WORD $0x8941; BYTE $0xf4 // mov r12d, esi + LONG $0x03ecc141 // shr r12d, 3 + WORD $0xc931 // xor ecx, ecx + +LBB5_10: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + LONG $0x0a2c8d4e // lea r13, [rdx + r9] + WORD $0x0949; BYTE $0xd5 // or r13, rdx + WORD $0x8948; BYTE $0xd3 // mov rbx, rdx + WORD $0x314c; BYTE $0xf3 // xor rbx, r14 + WORD $0x014c; BYTE $0xfb // add rbx, r15 + WORD $0x094c; BYTE $0xeb // or rbx, r13 + WORD $0x314c; BYTE $0xda // xor rdx, r11 + WORD $0x014c; BYTE $0xfa // add rdx, r15 + WORD $0x0948; BYTE $0xda // or rdx, rbx + WORD $0x214c; BYTE $0xd2 // and rdx, r10 + JNE LBB5_22 + LONG $0x01c18348 // add rcx, 1 + WORD $0x394c; BYTE $0xe1 // cmp rcx, r12 + JB LBB5_10 + WORD $0xe683; BYTE $0xf8 // and esi, -8 + JMP LBB5_23 + +LBB5_13: + WORD $0x9848 // cdqe + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB5_25 + QUAD $0xfefefefefeffbe49; WORD $0xfefe // mov r14, -72340172838076673 + QUAD $0xdfdfdfdfdfe0b849; WORD $0xdfdf // mov r8, -2314885530818453536 + QUAD $0x808080808080b949; WORD $0x8080 // mov r9, -9187201950435737472 + QUAD $0x5c5c5c5c5c5cba49; WORD $0x5c5c // mov r10, 6655295901103053916 + QUAD $0x222222222222bb49; WORD $0x2222 // mov r11, 2459565876494606882 + WORD $0xeec1; BYTE $0x03 // shr esi, 3 + WORD $0xe683; BYTE $0x1f // and esi, 31 + WORD $0xdb31 // xor ebx, ebx + +LBB5_15: + LONG $0xdf148b48 // mov rdx, qword [rdi + 8*rbx] + LONG $0x023c8d4e // lea r15, [rdx + r8] + WORD $0x0949; BYTE $0xd7 // or r15, rdx + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + WORD $0x314c; BYTE $0xd9 // xor rcx, r11 + WORD $0x014c; BYTE $0xf1 // add rcx, r14 + WORD $0x094c; BYTE $0xf9 // or rcx, r15 + WORD $0x314c; BYTE $0xd2 // xor rdx, r10 + WORD $0x014c; BYTE $0xf2 // add rdx, r14 + WORD $0x0948; BYTE $0xca // or rdx, rcx + WORD $0x214c; BYTE $0xca // and rdx, r9 + JNE LBB5_20 + LONG $0x01c38348 // add rbx, 1 + WORD $0x3948; BYTE $0xf3 // cmp rbx, rsi + JB LBB5_15 + WORD $0xe6c1; BYTE $0x03 // shl esi, 3 + JMP LBB5_21 + +LBB5_18: + WORD $0xbc0f; BYTE $0xc3 // bsf eax, ebx + JMP LBB5_25 + +LBB5_19: + WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx + JMP LBB5_24 + +LBB5_20: + WORD $0xbc0f; BYTE $0xf2 // bsf esi, edx + WORD $0xeec1; BYTE $0x03 // shr esi, 3 + +LBB5_21: + WORD $0xf189 // mov ecx, esi + WORD $0x0148; BYTE $0xc8 // add rax, rcx + JMP LBB5_25 + +LBB5_22: + WORD $0xbc0f; BYTE $0xf2 // bsf esi, edx + WORD $0xeec1; BYTE $0x03 // shr esi, 3 + +LBB5_23: + WORD $0xf189 // mov ecx, esi + WORD $0x0148; BYTE $0xc8 // add rax, rcx + +LBB5_24: + WORD $0x014c; BYTE $0xc0 // add rax, r8 + +LBB5_25: + VZEROUPPER + MOVQ AX, ret+16(FP) + RET diff --git a/internal/encoder/vm/vm.go b/internal/encoder/vm/vm.go index 4be6b80..5a3f636 100644 --- a/internal/encoder/vm/vm.go +++ b/internal/encoder/vm/vm.go @@ -402,12 +402,13 @@ func Run(ctx *encoder.RuntimeContext, b []byte, codeSet *encoder.OpcodeSet) ([]b code = code.End.Next break } - b = appendStructHead(ctx, b) - mapCtx := encoder.NewMapContext(mlen) + unorderedMap := (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 + mapCtx := encoder.NewMapContext(mlen, unorderedMap) mapiterinit(code.Type, uptr, &mapCtx.Iter) store(ctxptr, code.Idx, uintptr(unsafe.Pointer(mapCtx))) ctx.KeepRefs = append(ctx.KeepRefs, unsafe.Pointer(mapCtx)) - if (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 { + b = appendStructHead(ctx, b) + if unorderedMap { b = appendMapKeyIndent(ctx, code.Next, b) } else { mapCtx.Start = len(b) diff --git a/internal/encoder/vm_color/vm.go b/internal/encoder/vm_color/vm.go index b13abe8..f3f99b6 100644 --- a/internal/encoder/vm_color/vm.go +++ b/internal/encoder/vm_color/vm.go @@ -402,12 +402,13 @@ func Run(ctx *encoder.RuntimeContext, b []byte, codeSet *encoder.OpcodeSet) ([]b code = code.End.Next break } - b = appendStructHead(ctx, b) - mapCtx := encoder.NewMapContext(mlen) + unorderedMap := (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 + mapCtx := encoder.NewMapContext(mlen, unorderedMap) mapiterinit(code.Type, uptr, &mapCtx.Iter) store(ctxptr, code.Idx, uintptr(unsafe.Pointer(mapCtx))) ctx.KeepRefs = append(ctx.KeepRefs, unsafe.Pointer(mapCtx)) - if (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 { + b = appendStructHead(ctx, b) + if unorderedMap { b = appendMapKeyIndent(ctx, code.Next, b) } else { mapCtx.Start = len(b) diff --git a/internal/encoder/vm_color_indent/vm.go b/internal/encoder/vm_color_indent/vm.go index a45aa54..f172236 100644 --- a/internal/encoder/vm_color_indent/vm.go +++ b/internal/encoder/vm_color_indent/vm.go @@ -402,12 +402,13 @@ func Run(ctx *encoder.RuntimeContext, b []byte, codeSet *encoder.OpcodeSet) ([]b code = code.End.Next break } - b = appendStructHead(ctx, b) - mapCtx := encoder.NewMapContext(mlen) + unorderedMap := (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 + mapCtx := encoder.NewMapContext(mlen, unorderedMap) mapiterinit(code.Type, uptr, &mapCtx.Iter) store(ctxptr, code.Idx, uintptr(unsafe.Pointer(mapCtx))) ctx.KeepRefs = append(ctx.KeepRefs, unsafe.Pointer(mapCtx)) - if (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 { + b = appendStructHead(ctx, b) + if unorderedMap { b = appendMapKeyIndent(ctx, code.Next, b) } else { mapCtx.Start = len(b) diff --git a/internal/encoder/vm_indent/vm.go b/internal/encoder/vm_indent/vm.go index d1e0b45..1123387 100644 --- a/internal/encoder/vm_indent/vm.go +++ b/internal/encoder/vm_indent/vm.go @@ -402,12 +402,13 @@ func Run(ctx *encoder.RuntimeContext, b []byte, codeSet *encoder.OpcodeSet) ([]b code = code.End.Next break } - b = appendStructHead(ctx, b) - mapCtx := encoder.NewMapContext(mlen) + unorderedMap := (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 + mapCtx := encoder.NewMapContext(mlen, unorderedMap) mapiterinit(code.Type, uptr, &mapCtx.Iter) store(ctxptr, code.Idx, uintptr(unsafe.Pointer(mapCtx))) ctx.KeepRefs = append(ctx.KeepRefs, unsafe.Pointer(mapCtx)) - if (ctx.Option.Flag & encoder.UnorderedMapOption) != 0 { + b = appendStructHead(ctx, b) + if unorderedMap { b = appendMapKeyIndent(ctx, code.Next, b) } else { mapCtx.Start = len(b)