Generate SIMD files for encoding

This commit is contained in:
Masaaki Goshima 2021-12-28 01:13:18 +09:00
parent 2d98d47d0f
commit 38b316a540
No known key found for this signature in database
GPG Key ID: 6A53785055537153
14 changed files with 385 additions and 2 deletions

View File

@ -36,4 +36,4 @@ golangci-lint: | $(BIN_DIR)
.PHONY: generate .PHONY: generate
generate: generate:
go generate ./internal/... cd internal/cmd/generator && go generate .

1
internal/cmd/generator/.gitignore vendored Normal file
View File

@ -0,0 +1 @@
*.s

View File

@ -0,0 +1,31 @@
FROM golang:1.16
RUN apt-get update; \
apt-get install -y build-essential clang yasm
WORKDIR /work
COPY ./go.* ./
RUN go install \
github.com/minio/asm2plan9s \
github.com/minio/c2goasm \
github.com/klauspost/asmfmt/cmd/asmfmt
COPY ./simd/string.h ./string.h
COPY ./simd/string.c ./string.c
RUN clang -S \
-O2 \
-mavx2 \
-masm=intel \
-mno-red-zone \
-mstackrealign \
-mllvm \
-inline-threshold=1000 \
-fno-asynchronous-unwind-tables \
-fno-exceptions \
-fno-rtti \
-c string.c
COPY ./simd/string_avx.go ./string_avx.go
RUN c2goasm -a -f ./string.s ./string_avx.s

View File

@ -0,0 +1,8 @@
.PHONY: asm
asm:
clang -S -O2 -mavx2 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -c ./simd/string.c
.PHONY: generate
generate:
docker build -f Dockerfile.simd -t go-json-simd .
docker run --rm -v "$(CURDIR):/tmp" go-json-simd bash -c "cp /work/string_avx.s /tmp/string_avx.s"

View File

@ -0,0 +1,9 @@
module github.com/goccy/go-json/internal/cmd/generator
go 1.17
require (
github.com/klauspost/asmfmt v1.3.1
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3
)

View File

@ -0,0 +1,6 @@
github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw=
github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE=

View File

@ -8,6 +8,9 @@ import (
"go/printer" "go/printer"
"go/token" "go/token"
"io/ioutil" "io/ioutil"
"log"
"os"
"os/exec"
"path/filepath" "path/filepath"
"runtime" "runtime"
"strings" "strings"
@ -302,6 +305,32 @@ func generateVM() error {
return nil return nil
} }
func generateSIMDSources() error {
root := repoRoot()
genCmd := exec.Command("make", "generate")
genCmd.Stdout = os.Stdout
genCmd.Stderr = os.Stderr
if err := genCmd.Run(); err != nil {
return err
}
for _, srcName := range []string{
"string_avx.s",
"string_avx.go",
} {
srcFile := filepath.Join(root, "internal", "cmd", "generator", "simd", srcName)
dstFile := filepath.Join(root, "internal", "encoder", srcName)
log.Printf("copy %s to %s", srcFile, dstFile)
src, err := os.ReadFile(srcFile)
if err != nil {
return err
}
if err := os.WriteFile(dstFile, src, 0o600); err != nil {
return err
}
}
return nil
}
func repoRoot() string { func repoRoot() string {
_, file, _, _ := runtime.Caller(0) _, file, _, _ := runtime.Caller(0)
relativePathFromRepoRoot := filepath.Join("internal", "cmd", "generator") relativePathFromRepoRoot := filepath.Join("internal", "cmd", "generator")
@ -310,6 +339,9 @@ func repoRoot() string {
//go:generate go run main.go //go:generate go run main.go
func main() { func main() {
if err := generateSIMDSources(); err != nil {
panic(err)
}
if err := generateVM(); err != nil { if err := generateVM(); err != nil {
panic(err) panic(err)
} }

View File

@ -0,0 +1,88 @@
#include "string.h"
#include <immintrin.h>
uint64_t findEscapeIndex128(char *buf, int len) {
static const uint64_t lsb = 0x0101010101010101;
static const uint64_t msb = 0x8080808080808080;
static const __m64 space = (__m64)(lsb * 0x20);
static const __m64 quote = (__m64)(lsb * '"');
static const __m64 escape = (__m64)(lsb * '\\');
static const __m64 lt = (__m64)(lsb * '<');
static const __m64 gt = (__m64)(lsb * '>');
static const __m64 amp = (__m64)(lsb * '&');
__m128i zeroV = _mm_setzero_si128();
__m128i msbV = _mm_set_epi64((__m64)(msb), (__m64)(msb));
__m128i lsbV = _mm_set_epi64((__m64)(lsb), (__m64)(lsb));
__m128i spaceV = _mm_set_epi64(space, space);
__m128i quoteV = _mm_set_epi64(quote, quote);
__m128i escapeV = _mm_set_epi64(escape, escape);
__m128i ltV = _mm_set_epi64(lt, lt);
__m128i gtV = _mm_set_epi64(gt, gt);
__m128i ampV = _mm_set_epi64(amp, amp);
char *sp = buf;
size_t chunkLen = len / 16;
int chunkIdx = 0;
for (; chunkIdx < chunkLen; chunkIdx++) {
__m128i n = _mm_loadu_si128((const void *)sp);
__m128i spaceN = _mm_sub_epi64(n, spaceV);
__m128i quoteN = _mm_sub_epi64(_mm_xor_si128(n, quoteV), lsbV);
__m128i escapeN = _mm_sub_epi64(_mm_xor_si128(n, escapeV), lsbV);
__m128i ltN = _mm_sub_epi64(_mm_xor_si128(n, ltV), lsbV);
__m128i gtN = _mm_sub_epi64(_mm_xor_si128(n, gtV), lsbV);
__m128i ampN = _mm_sub_epi64(_mm_xor_si128(n, ampV), lsbV);
__m128i mask = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(n, spaceN), quoteN), escapeN), ltN), gtN), ampN);
int movemask = _mm_movemask_epi8(_mm_and_si128(mask, msbV));
if (movemask != 0) {
return __builtin_ctz(movemask);
}
sp += 16;
}
return 16 * chunkLen;
}
uint64_t findEscapeIndex256(char *buf, int len) {
static const uint64_t lsb = 0x0101010101010101;
static const uint64_t msb = 0x8080808080808080;
static const __m64 space = (__m64)(lsb * 0x20);
static const __m64 quote = (__m64)(lsb * '"');
static const __m64 escape = (__m64)(lsb * '\\');
static const __m64 lt = (__m64)(lsb * '<');
static const __m64 gt = (__m64)(lsb * '>');
static const __m64 amp = (__m64)(lsb * '&');
__m256i zeroV = _mm256_setzero_si256();
__m256i msbV = _mm256_set1_epi64x(msb);
__m256i lsbV = _mm256_set1_epi64x(lsb);
__m256i spaceV = _mm256_set1_epi64x(space);
__m256i quoteV = _mm256_set1_epi64x(quote);
__m256i escapeV = _mm256_set1_epi64x(escape);
__m256i ltV = _mm256_set1_epi64x(lt);
__m256i gtV = _mm256_set1_epi64x(gt);
__m256i ampV = _mm256_set1_epi64x(amp);
char *sp = buf;
size_t chunkLen = len / 32;
int chunkIdx = 0;
for (; chunkIdx < chunkLen; chunkIdx++) {
__m256i n = _mm256_loadu_si256((const void *)sp);
__m256i spaceN = _mm256_sub_epi64(n, spaceV);
__m256i quoteN = _mm256_sub_epi64(_mm256_xor_si256(n, quoteV), lsbV);
__m256i escapeN = _mm256_sub_epi64(_mm256_xor_si256(n, escapeV), lsbV);
__m256i ltN = _mm256_sub_epi64(_mm256_xor_si256(n, ltV), lsbV);
__m256i gtN = _mm256_sub_epi64(_mm256_xor_si256(n, gtV), lsbV);
__m256i ampN = _mm256_sub_epi64(_mm256_xor_si256(n, ampV), lsbV);
__m256i mask = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(n, spaceN), quoteN), escapeN), ltN), gtN), ampN);
int movemask = _mm256_movemask_epi8(_mm256_and_si256(mask, msbV));
if (movemask != 0) {
return __builtin_ctz(movemask);
}
sp += 32;
}
return 32 * chunkLen;
}

View File

@ -0,0 +1,8 @@
#include <stdio.h>
#include <stdint.h>
#include <string.h>
typedef struct GoString {
char *buf;
size_t len;
} GoString;

View File

@ -0,0 +1,11 @@
package encoder
import "unsafe"
//go:nosplit
//go:noescape
func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int)
//go:nosplit
//go:noescape
func _findEscapeIndex256(buf unsafe.Pointer, len int) (ret int)

View File

@ -0,0 +1,7 @@
package main
import (
_ "github.com/klauspost/asmfmt/cmd/asmfmt"
_ "github.com/minio/asm2plan9s"
_ "github.com/minio/c2goasm"
)

View File

@ -4,6 +4,8 @@ import (
"math/bits" "math/bits"
"reflect" "reflect"
"unsafe" "unsafe"
"github.com/goccy/go-json/internal/runtime"
) )
const ( const (
@ -369,7 +371,9 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
var ( var (
i, j int i, j int
) )
if valLen >= 8 { switch valLen {
case 1, 2, 3, 4, 5, 6, 7:
case 8, 9, 10, 11, 12, 13, 14, 15:
chunks := stringToUint64Slice(s) chunks := stringToUint64Slice(s)
for _, n := range chunks { for _, n := range chunks {
// combine masks before checking for the MSB of each byte. We include // combine masks before checking for the MSB of each byte. We include
@ -394,6 +398,10 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
} }
// no found any escape characters. // no found any escape characters.
return append(append(buf, s...), '"') return append(append(buf, s...), '"')
case 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31:
j = _findEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s))
default:
j = _findEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s))
} }
ESCAPE_END: ESCAPE_END:
for j < valLen { for j < valLen {

View File

@ -0,0 +1,11 @@
package encoder
import "unsafe"
//go:nosplit
//go:noescape
func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int)
//go:nosplit
//go:noescape
func _findEscapeIndex256(buf unsafe.Pointer, len int) (ret int)

View File

@ -0,0 +1,163 @@
//+build !noasm !appengine
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
DATA LCDATA1<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0
DATA LCDATA1<>+0x008(SB)/8, $0xdfdfdfdfdfdfdfe0
DATA LCDATA1<>+0x010(SB)/8, $0x2222222222222222
DATA LCDATA1<>+0x018(SB)/8, $0x2222222222222222
DATA LCDATA1<>+0x020(SB)/8, $0xfefefefefefefeff
DATA LCDATA1<>+0x028(SB)/8, $0xfefefefefefefeff
DATA LCDATA1<>+0x030(SB)/8, $0x5c5c5c5c5c5c5c5c
DATA LCDATA1<>+0x038(SB)/8, $0x5c5c5c5c5c5c5c5c
DATA LCDATA1<>+0x040(SB)/8, $0x3c3c3c3c3c3c3c3c
DATA LCDATA1<>+0x048(SB)/8, $0x3c3c3c3c3c3c3c3c
DATA LCDATA1<>+0x050(SB)/8, $0x3e3e3e3e3e3e3e3e
DATA LCDATA1<>+0x058(SB)/8, $0x3e3e3e3e3e3e3e3e
DATA LCDATA1<>+0x060(SB)/8, $0x2626262626262626
DATA LCDATA1<>+0x068(SB)/8, $0x2626262626262626
DATA LCDATA1<>+0x070(SB)/8, $0x8080808080808080
DATA LCDATA1<>+0x078(SB)/8, $0x8080808080808080
GLOBL LCDATA1<>(SB), 8, $128
TEXT ·_findEscapeIndex128(SB), $0-24
MOVQ buf+0(FP), DI
MOVQ len+8(FP), SI
LEAQ LCDATA1<>(SB), BP
WORD $0xf089 // mov eax, esi
WORD $0xf8c1; BYTE $0x1f // sar eax, 31
WORD $0xe8c1; BYTE $0x1c // shr eax, 28
WORD $0xf001 // add eax, esi
WORD $0xf8c1; BYTE $0x04 // sar eax, 4
WORD $0x9848 // cdqe
WORD $0xc683; BYTE $0x0f // add esi, 15
WORD $0xfe83; BYTE $0x1f // cmp esi, 31
JB LBB0_5
WORD $0xc931 // xor ecx, ecx
LONG $0x456f79c5; BYTE $0x00 // vmovdqa xmm8, oword 0[rbp] /* [rip + .LCPI0_0] */
LONG $0x4d6f79c5; BYTE $0x10 // vmovdqa xmm9, oword 16[rbp] /* [rip + .LCPI0_1] */
LONG $0x556ff9c5; BYTE $0x20 // vmovdqa xmm2, oword 32[rbp] /* [rip + .LCPI0_2] */
LONG $0x556f79c5; BYTE $0x30 // vmovdqa xmm10, oword 48[rbp] /* [rip + .LCPI0_3] */
LONG $0x5d6f79c5; BYTE $0x40 // vmovdqa xmm11, oword 64[rbp] /* [rip + .LCPI0_4] */
LONG $0x656f79c5; BYTE $0x50 // vmovdqa xmm12, oword 80[rbp] /* [rip + .LCPI0_5] */
LONG $0x6d6f79c5; BYTE $0x60 // vmovdqa xmm13, oword 96[rbp] /* [rip + .LCPI0_6] */
LONG $0x7d6ff9c5; BYTE $0x70 // vmovdqa xmm7, oword 112[rbp] /* [rip + .LCPI0_7] */
LBB0_2:
LONG $0x076ffac5 // vmovdqu xmm0, oword [rdi]
LONG $0xd479c1c4; BYTE $0xc8 // vpaddq xmm1, xmm0, xmm8
LONG $0xef79c1c4; BYTE $0xd9 // vpxor xmm3, xmm0, xmm9
LONG $0xdad4e1c5 // vpaddq xmm3, xmm3, xmm2
LONG $0xef79c1c4; BYTE $0xe2 // vpxor xmm4, xmm0, xmm10
LONG $0xe2d4d9c5 // vpaddq xmm4, xmm4, xmm2
LONG $0xef79c1c4; BYTE $0xeb // vpxor xmm5, xmm0, xmm11
LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2
LONG $0xe5ebd9c5 // vpor xmm4, xmm4, xmm5
LONG $0xef79c1c4; BYTE $0xec // vpxor xmm5, xmm0, xmm12
LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2
LONG $0xef79c1c4; BYTE $0xf5 // vpxor xmm6, xmm0, xmm13
LONG $0xf2d4c9c5 // vpaddq xmm6, xmm6, xmm2
LONG $0xeeebd1c5 // vpor xmm5, xmm5, xmm6
LONG $0xc0ebf1c5 // vpor xmm0, xmm1, xmm0
LONG $0xc3ebf9c5 // vpor xmm0, xmm0, xmm3
LONG $0xc4ebf9c5 // vpor xmm0, xmm0, xmm4
LONG $0xc5ebf9c5 // vpor xmm0, xmm0, xmm5
LONG $0xc7dbf9c5 // vpand xmm0, xmm0, xmm7
LONG $0xd0d7f9c5 // vpmovmskb edx, xmm0
WORD $0xd285 // test edx, edx
JNE LBB0_3
LONG $0x10c78348 // add rdi, 16
LONG $0x01c18348 // add rcx, 1
WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
JB LBB0_2
LBB0_5:
LONG $0x04e0c148 // shl rax, 4
JMP LBB0_6
LBB0_3:
WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx
LBB0_6:
MOVQ AX, ret+16(FP)
RET
DATA LCDATA2<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0
DATA LCDATA2<>+0x008(SB)/8, $0x2222222222222222
DATA LCDATA2<>+0x010(SB)/8, $0xfefefefefefefeff
DATA LCDATA2<>+0x018(SB)/8, $0x5c5c5c5c5c5c5c5c
DATA LCDATA2<>+0x020(SB)/8, $0x3c3c3c3c3c3c3c3c
DATA LCDATA2<>+0x028(SB)/8, $0x3e3e3e3e3e3e3e3e
DATA LCDATA2<>+0x030(SB)/8, $0x2626262626262626
DATA LCDATA2<>+0x038(SB)/8, $0x0000000000000000
DATA LCDATA2<>+0x040(SB)/8, $0x8080808080808080
DATA LCDATA2<>+0x048(SB)/8, $0x8080808080808080
DATA LCDATA2<>+0x050(SB)/8, $0x8080808080808080
DATA LCDATA2<>+0x058(SB)/8, $0x8080808080808080
GLOBL LCDATA2<>(SB), 8, $96
TEXT ·_findEscapeIndex256(SB), $0-24
MOVQ buf+0(FP), DI
MOVQ len+8(FP), SI
LEAQ LCDATA2<>(SB), BP
WORD $0xf089 // mov eax, esi
WORD $0xf8c1; BYTE $0x1f // sar eax, 31
WORD $0xe8c1; BYTE $0x1b // shr eax, 27
WORD $0xf001 // add eax, esi
WORD $0xf8c1; BYTE $0x05 // sar eax, 5
WORD $0x9848 // cdqe
WORD $0xc683; BYTE $0x1f // add esi, 31
WORD $0xfe83; BYTE $0x3f // cmp esi, 63
JB LBB1_5
LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI1_0] */
LONG $0x597de2c4; WORD $0x084d // vpbroadcastq ymm1, qword 8[rbp] /* [rip + .LCPI1_1] */
LONG $0x597de2c4; WORD $0x1055 // vpbroadcastq ymm2, qword 16[rbp] /* [rip + .LCPI1_2] */
LONG $0x597de2c4; WORD $0x185d // vpbroadcastq ymm3, qword 24[rbp] /* [rip + .LCPI1_3] */
LONG $0x597de2c4; WORD $0x2065 // vpbroadcastq ymm4, qword 32[rbp] /* [rip + .LCPI1_4] */
LONG $0x597de2c4; WORD $0x286d // vpbroadcastq ymm5, qword 40[rbp] /* [rip + .LCPI1_5] */
WORD $0xc931 // xor ecx, ecx
LONG $0x597de2c4; WORD $0x3075 // vpbroadcastq ymm6, qword 48[rbp] /* [rip + .LCPI1_6] */
LONG $0x7d6ffdc5; BYTE $0x40 // vmovdqa ymm7, yword 64[rbp] /* [rip + .LCPI1_7] */
LBB1_2:
LONG $0x076f7ec5 // vmovdqu ymm8, yword [rdi]
LONG $0xc8d43dc5 // vpaddq ymm9, ymm8, ymm0
LONG $0xd1ef3dc5 // vpxor ymm10, ymm8, ymm1
LONG $0xd2d42dc5 // vpaddq ymm10, ymm10, ymm2
LONG $0xdbef3dc5 // vpxor ymm11, ymm8, ymm3
LONG $0xdad425c5 // vpaddq ymm11, ymm11, ymm2
LONG $0xe4ef3dc5 // vpxor ymm12, ymm8, ymm4
LONG $0xe2d41dc5 // vpaddq ymm12, ymm12, ymm2
LONG $0xeb2541c4; BYTE $0xdc // vpor ymm11, ymm11, ymm12
LONG $0xe5ef3dc5 // vpxor ymm12, ymm8, ymm5
LONG $0xe2d41dc5 // vpaddq ymm12, ymm12, ymm2
LONG $0xeeef3dc5 // vpxor ymm13, ymm8, ymm6
LONG $0xead415c5 // vpaddq ymm13, ymm13, ymm2
LONG $0xeb1d41c4; BYTE $0xe5 // vpor ymm12, ymm12, ymm13
LONG $0xeb3541c4; BYTE $0xc0 // vpor ymm8, ymm9, ymm8
LONG $0xeb3d41c4; BYTE $0xc2 // vpor ymm8, ymm8, ymm10
LONG $0xeb3d41c4; BYTE $0xc3 // vpor ymm8, ymm8, ymm11
LONG $0xeb3d41c4; BYTE $0xc4 // vpor ymm8, ymm8, ymm12
LONG $0xc7db3dc5 // vpand ymm8, ymm8, ymm7
LONG $0xd77dc1c4; BYTE $0xd0 // vpmovmskb edx, ymm8
WORD $0xd285 // test edx, edx
JNE LBB1_3
LONG $0x20c78348 // add rdi, 32
LONG $0x01c18348 // add rcx, 1
WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
JB LBB1_2
LBB1_5:
LONG $0x05e0c148 // shl rax, 5
JMP LBB1_6
LBB1_3:
WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx
LBB1_6:
VZEROUPPER
MOVQ AX, ret+16(FP)
RET