mirror of https://github.com/goccy/go-json.git
Generate SIMD files for encoding
This commit is contained in:
parent
2d98d47d0f
commit
38b316a540
2
Makefile
2
Makefile
|
@ -36,4 +36,4 @@ golangci-lint: | $(BIN_DIR)
|
|||
|
||||
.PHONY: generate
|
||||
generate:
|
||||
go generate ./internal/...
|
||||
cd internal/cmd/generator && go generate .
|
||||
|
|
|
@ -0,0 +1 @@
|
|||
*.s
|
|
@ -0,0 +1,31 @@
|
|||
FROM golang:1.16
|
||||
|
||||
RUN apt-get update; \
|
||||
apt-get install -y build-essential clang yasm
|
||||
|
||||
WORKDIR /work
|
||||
|
||||
COPY ./go.* ./
|
||||
RUN go install \
|
||||
github.com/minio/asm2plan9s \
|
||||
github.com/minio/c2goasm \
|
||||
github.com/klauspost/asmfmt/cmd/asmfmt
|
||||
|
||||
COPY ./simd/string.h ./string.h
|
||||
COPY ./simd/string.c ./string.c
|
||||
|
||||
RUN clang -S \
|
||||
-O2 \
|
||||
-mavx2 \
|
||||
-masm=intel \
|
||||
-mno-red-zone \
|
||||
-mstackrealign \
|
||||
-mllvm \
|
||||
-inline-threshold=1000 \
|
||||
-fno-asynchronous-unwind-tables \
|
||||
-fno-exceptions \
|
||||
-fno-rtti \
|
||||
-c string.c
|
||||
|
||||
COPY ./simd/string_avx.go ./string_avx.go
|
||||
RUN c2goasm -a -f ./string.s ./string_avx.s
|
|
@ -0,0 +1,8 @@
|
|||
.PHONY: asm
|
||||
asm:
|
||||
clang -S -O2 -mavx2 -masm=intel -mno-red-zone -mstackrealign -mllvm -inline-threshold=1000 -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -c ./simd/string.c
|
||||
|
||||
.PHONY: generate
|
||||
generate:
|
||||
docker build -f Dockerfile.simd -t go-json-simd .
|
||||
docker run --rm -v "$(CURDIR):/tmp" go-json-simd bash -c "cp /work/string_avx.s /tmp/string_avx.s"
|
|
@ -0,0 +1,9 @@
|
|||
module github.com/goccy/go-json/internal/cmd/generator
|
||||
|
||||
go 1.17
|
||||
|
||||
require (
|
||||
github.com/klauspost/asmfmt v1.3.1
|
||||
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8
|
||||
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3
|
||||
)
|
|
@ -0,0 +1,6 @@
|
|||
github.com/klauspost/asmfmt v1.3.1 h1:7xZi1N7s9gTLbqiM8KUv8TLyysavbTRGBT5/ly0bRtw=
|
||||
github.com/klauspost/asmfmt v1.3.1/go.mod h1:AG8TuvYojzulgDAMCnYn50l/5QV3Bs/tp6j0HLHbNSE=
|
||||
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8 h1:AMFGa4R4MiIpspGNG7Z948v4n35fFGB3RR3G/ry4FWs=
|
||||
github.com/minio/asm2plan9s v0.0.0-20200509001527-cdd76441f9d8/go.mod h1:mC1jAcsrzbxHt8iiaC+zU4b1ylILSosueou12R++wfY=
|
||||
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3 h1:+n/aFZefKZp7spd8DFdX7uMikMLXX4oubIzJF4kv/wI=
|
||||
github.com/minio/c2goasm v0.0.0-20190812172519-36a3d3bbc4f3/go.mod h1:RagcQ7I8IeTMnF8JTXieKnO4Z6JCsikNEzj0DwauVzE=
|
|
@ -8,6 +8,9 @@ import (
|
|||
"go/printer"
|
||||
"go/token"
|
||||
"io/ioutil"
|
||||
"log"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"strings"
|
||||
|
@ -302,6 +305,32 @@ func generateVM() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func generateSIMDSources() error {
|
||||
root := repoRoot()
|
||||
genCmd := exec.Command("make", "generate")
|
||||
genCmd.Stdout = os.Stdout
|
||||
genCmd.Stderr = os.Stderr
|
||||
if err := genCmd.Run(); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, srcName := range []string{
|
||||
"string_avx.s",
|
||||
"string_avx.go",
|
||||
} {
|
||||
srcFile := filepath.Join(root, "internal", "cmd", "generator", "simd", srcName)
|
||||
dstFile := filepath.Join(root, "internal", "encoder", srcName)
|
||||
log.Printf("copy %s to %s", srcFile, dstFile)
|
||||
src, err := os.ReadFile(srcFile)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.WriteFile(dstFile, src, 0o600); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func repoRoot() string {
|
||||
_, file, _, _ := runtime.Caller(0)
|
||||
relativePathFromRepoRoot := filepath.Join("internal", "cmd", "generator")
|
||||
|
@ -310,6 +339,9 @@ func repoRoot() string {
|
|||
|
||||
//go:generate go run main.go
|
||||
func main() {
|
||||
if err := generateSIMDSources(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
if err := generateVM(); err != nil {
|
||||
panic(err)
|
||||
}
|
||||
|
|
|
@ -0,0 +1,88 @@
|
|||
#include "string.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
uint64_t findEscapeIndex128(char *buf, int len) {
|
||||
static const uint64_t lsb = 0x0101010101010101;
|
||||
static const uint64_t msb = 0x8080808080808080;
|
||||
|
||||
static const __m64 space = (__m64)(lsb * 0x20);
|
||||
static const __m64 quote = (__m64)(lsb * '"');
|
||||
static const __m64 escape = (__m64)(lsb * '\\');
|
||||
static const __m64 lt = (__m64)(lsb * '<');
|
||||
static const __m64 gt = (__m64)(lsb * '>');
|
||||
static const __m64 amp = (__m64)(lsb * '&');
|
||||
|
||||
__m128i zeroV = _mm_setzero_si128();
|
||||
__m128i msbV = _mm_set_epi64((__m64)(msb), (__m64)(msb));
|
||||
__m128i lsbV = _mm_set_epi64((__m64)(lsb), (__m64)(lsb));
|
||||
__m128i spaceV = _mm_set_epi64(space, space);
|
||||
__m128i quoteV = _mm_set_epi64(quote, quote);
|
||||
__m128i escapeV = _mm_set_epi64(escape, escape);
|
||||
__m128i ltV = _mm_set_epi64(lt, lt);
|
||||
__m128i gtV = _mm_set_epi64(gt, gt);
|
||||
__m128i ampV = _mm_set_epi64(amp, amp);
|
||||
|
||||
char *sp = buf;
|
||||
size_t chunkLen = len / 16;
|
||||
int chunkIdx = 0;
|
||||
for (; chunkIdx < chunkLen; chunkIdx++) {
|
||||
__m128i n = _mm_loadu_si128((const void *)sp);
|
||||
__m128i spaceN = _mm_sub_epi64(n, spaceV);
|
||||
__m128i quoteN = _mm_sub_epi64(_mm_xor_si128(n, quoteV), lsbV);
|
||||
__m128i escapeN = _mm_sub_epi64(_mm_xor_si128(n, escapeV), lsbV);
|
||||
__m128i ltN = _mm_sub_epi64(_mm_xor_si128(n, ltV), lsbV);
|
||||
__m128i gtN = _mm_sub_epi64(_mm_xor_si128(n, gtV), lsbV);
|
||||
__m128i ampN = _mm_sub_epi64(_mm_xor_si128(n, ampV), lsbV);
|
||||
|
||||
__m128i mask = _mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(_mm_or_si128(n, spaceN), quoteN), escapeN), ltN), gtN), ampN);
|
||||
int movemask = _mm_movemask_epi8(_mm_and_si128(mask, msbV));
|
||||
if (movemask != 0) {
|
||||
return __builtin_ctz(movemask);
|
||||
}
|
||||
sp += 16;
|
||||
}
|
||||
return 16 * chunkLen;
|
||||
}
|
||||
|
||||
uint64_t findEscapeIndex256(char *buf, int len) {
|
||||
static const uint64_t lsb = 0x0101010101010101;
|
||||
static const uint64_t msb = 0x8080808080808080;
|
||||
|
||||
static const __m64 space = (__m64)(lsb * 0x20);
|
||||
static const __m64 quote = (__m64)(lsb * '"');
|
||||
static const __m64 escape = (__m64)(lsb * '\\');
|
||||
static const __m64 lt = (__m64)(lsb * '<');
|
||||
static const __m64 gt = (__m64)(lsb * '>');
|
||||
static const __m64 amp = (__m64)(lsb * '&');
|
||||
|
||||
__m256i zeroV = _mm256_setzero_si256();
|
||||
__m256i msbV = _mm256_set1_epi64x(msb);
|
||||
__m256i lsbV = _mm256_set1_epi64x(lsb);
|
||||
__m256i spaceV = _mm256_set1_epi64x(space);
|
||||
__m256i quoteV = _mm256_set1_epi64x(quote);
|
||||
__m256i escapeV = _mm256_set1_epi64x(escape);
|
||||
__m256i ltV = _mm256_set1_epi64x(lt);
|
||||
__m256i gtV = _mm256_set1_epi64x(gt);
|
||||
__m256i ampV = _mm256_set1_epi64x(amp);
|
||||
|
||||
char *sp = buf;
|
||||
size_t chunkLen = len / 32;
|
||||
int chunkIdx = 0;
|
||||
for (; chunkIdx < chunkLen; chunkIdx++) {
|
||||
__m256i n = _mm256_loadu_si256((const void *)sp);
|
||||
__m256i spaceN = _mm256_sub_epi64(n, spaceV);
|
||||
__m256i quoteN = _mm256_sub_epi64(_mm256_xor_si256(n, quoteV), lsbV);
|
||||
__m256i escapeN = _mm256_sub_epi64(_mm256_xor_si256(n, escapeV), lsbV);
|
||||
__m256i ltN = _mm256_sub_epi64(_mm256_xor_si256(n, ltV), lsbV);
|
||||
__m256i gtN = _mm256_sub_epi64(_mm256_xor_si256(n, gtV), lsbV);
|
||||
__m256i ampN = _mm256_sub_epi64(_mm256_xor_si256(n, ampV), lsbV);
|
||||
|
||||
__m256i mask = _mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(_mm256_or_si256(n, spaceN), quoteN), escapeN), ltN), gtN), ampN);
|
||||
int movemask = _mm256_movemask_epi8(_mm256_and_si256(mask, msbV));
|
||||
if (movemask != 0) {
|
||||
return __builtin_ctz(movemask);
|
||||
}
|
||||
sp += 32;
|
||||
}
|
||||
return 32 * chunkLen;
|
||||
}
|
|
@ -0,0 +1,8 @@
|
|||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
#include <string.h>
|
||||
|
||||
typedef struct GoString {
|
||||
char *buf;
|
||||
size_t len;
|
||||
} GoString;
|
|
@ -0,0 +1,11 @@
|
|||
package encoder
|
||||
|
||||
import "unsafe"
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
func _findEscapeIndex256(buf unsafe.Pointer, len int) (ret int)
|
|
@ -0,0 +1,7 @@
|
|||
package main
|
||||
|
||||
import (
|
||||
_ "github.com/klauspost/asmfmt/cmd/asmfmt"
|
||||
_ "github.com/minio/asm2plan9s"
|
||||
_ "github.com/minio/c2goasm"
|
||||
)
|
|
@ -4,6 +4,8 @@ import (
|
|||
"math/bits"
|
||||
"reflect"
|
||||
"unsafe"
|
||||
|
||||
"github.com/goccy/go-json/internal/runtime"
|
||||
)
|
||||
|
||||
const (
|
||||
|
@ -369,7 +371,9 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
|
|||
var (
|
||||
i, j int
|
||||
)
|
||||
if valLen >= 8 {
|
||||
switch valLen {
|
||||
case 1, 2, 3, 4, 5, 6, 7:
|
||||
case 8, 9, 10, 11, 12, 13, 14, 15:
|
||||
chunks := stringToUint64Slice(s)
|
||||
for _, n := range chunks {
|
||||
// combine masks before checking for the MSB of each byte. We include
|
||||
|
@ -394,6 +398,10 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte {
|
|||
}
|
||||
// no found any escape characters.
|
||||
return append(append(buf, s...), '"')
|
||||
case 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31:
|
||||
j = _findEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s))
|
||||
default:
|
||||
j = _findEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s))
|
||||
}
|
||||
ESCAPE_END:
|
||||
for j < valLen {
|
||||
|
|
|
@ -0,0 +1,11 @@
|
|||
package encoder
|
||||
|
||||
import "unsafe"
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int)
|
||||
|
||||
//go:nosplit
|
||||
//go:noescape
|
||||
func _findEscapeIndex256(buf unsafe.Pointer, len int) (ret int)
|
|
@ -0,0 +1,163 @@
|
|||
//+build !noasm !appengine
|
||||
// AUTO-GENERATED BY C2GOASM -- DO NOT EDIT
|
||||
|
||||
DATA LCDATA1<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0
|
||||
DATA LCDATA1<>+0x008(SB)/8, $0xdfdfdfdfdfdfdfe0
|
||||
DATA LCDATA1<>+0x010(SB)/8, $0x2222222222222222
|
||||
DATA LCDATA1<>+0x018(SB)/8, $0x2222222222222222
|
||||
DATA LCDATA1<>+0x020(SB)/8, $0xfefefefefefefeff
|
||||
DATA LCDATA1<>+0x028(SB)/8, $0xfefefefefefefeff
|
||||
DATA LCDATA1<>+0x030(SB)/8, $0x5c5c5c5c5c5c5c5c
|
||||
DATA LCDATA1<>+0x038(SB)/8, $0x5c5c5c5c5c5c5c5c
|
||||
DATA LCDATA1<>+0x040(SB)/8, $0x3c3c3c3c3c3c3c3c
|
||||
DATA LCDATA1<>+0x048(SB)/8, $0x3c3c3c3c3c3c3c3c
|
||||
DATA LCDATA1<>+0x050(SB)/8, $0x3e3e3e3e3e3e3e3e
|
||||
DATA LCDATA1<>+0x058(SB)/8, $0x3e3e3e3e3e3e3e3e
|
||||
DATA LCDATA1<>+0x060(SB)/8, $0x2626262626262626
|
||||
DATA LCDATA1<>+0x068(SB)/8, $0x2626262626262626
|
||||
DATA LCDATA1<>+0x070(SB)/8, $0x8080808080808080
|
||||
DATA LCDATA1<>+0x078(SB)/8, $0x8080808080808080
|
||||
GLOBL LCDATA1<>(SB), 8, $128
|
||||
|
||||
TEXT ·_findEscapeIndex128(SB), $0-24
|
||||
|
||||
MOVQ buf+0(FP), DI
|
||||
MOVQ len+8(FP), SI
|
||||
LEAQ LCDATA1<>(SB), BP
|
||||
|
||||
WORD $0xf089 // mov eax, esi
|
||||
WORD $0xf8c1; BYTE $0x1f // sar eax, 31
|
||||
WORD $0xe8c1; BYTE $0x1c // shr eax, 28
|
||||
WORD $0xf001 // add eax, esi
|
||||
WORD $0xf8c1; BYTE $0x04 // sar eax, 4
|
||||
WORD $0x9848 // cdqe
|
||||
WORD $0xc683; BYTE $0x0f // add esi, 15
|
||||
WORD $0xfe83; BYTE $0x1f // cmp esi, 31
|
||||
JB LBB0_5
|
||||
WORD $0xc931 // xor ecx, ecx
|
||||
LONG $0x456f79c5; BYTE $0x00 // vmovdqa xmm8, oword 0[rbp] /* [rip + .LCPI0_0] */
|
||||
LONG $0x4d6f79c5; BYTE $0x10 // vmovdqa xmm9, oword 16[rbp] /* [rip + .LCPI0_1] */
|
||||
LONG $0x556ff9c5; BYTE $0x20 // vmovdqa xmm2, oword 32[rbp] /* [rip + .LCPI0_2] */
|
||||
LONG $0x556f79c5; BYTE $0x30 // vmovdqa xmm10, oword 48[rbp] /* [rip + .LCPI0_3] */
|
||||
LONG $0x5d6f79c5; BYTE $0x40 // vmovdqa xmm11, oword 64[rbp] /* [rip + .LCPI0_4] */
|
||||
LONG $0x656f79c5; BYTE $0x50 // vmovdqa xmm12, oword 80[rbp] /* [rip + .LCPI0_5] */
|
||||
LONG $0x6d6f79c5; BYTE $0x60 // vmovdqa xmm13, oword 96[rbp] /* [rip + .LCPI0_6] */
|
||||
LONG $0x7d6ff9c5; BYTE $0x70 // vmovdqa xmm7, oword 112[rbp] /* [rip + .LCPI0_7] */
|
||||
|
||||
LBB0_2:
|
||||
LONG $0x076ffac5 // vmovdqu xmm0, oword [rdi]
|
||||
LONG $0xd479c1c4; BYTE $0xc8 // vpaddq xmm1, xmm0, xmm8
|
||||
LONG $0xef79c1c4; BYTE $0xd9 // vpxor xmm3, xmm0, xmm9
|
||||
LONG $0xdad4e1c5 // vpaddq xmm3, xmm3, xmm2
|
||||
LONG $0xef79c1c4; BYTE $0xe2 // vpxor xmm4, xmm0, xmm10
|
||||
LONG $0xe2d4d9c5 // vpaddq xmm4, xmm4, xmm2
|
||||
LONG $0xef79c1c4; BYTE $0xeb // vpxor xmm5, xmm0, xmm11
|
||||
LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2
|
||||
LONG $0xe5ebd9c5 // vpor xmm4, xmm4, xmm5
|
||||
LONG $0xef79c1c4; BYTE $0xec // vpxor xmm5, xmm0, xmm12
|
||||
LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2
|
||||
LONG $0xef79c1c4; BYTE $0xf5 // vpxor xmm6, xmm0, xmm13
|
||||
LONG $0xf2d4c9c5 // vpaddq xmm6, xmm6, xmm2
|
||||
LONG $0xeeebd1c5 // vpor xmm5, xmm5, xmm6
|
||||
LONG $0xc0ebf1c5 // vpor xmm0, xmm1, xmm0
|
||||
LONG $0xc3ebf9c5 // vpor xmm0, xmm0, xmm3
|
||||
LONG $0xc4ebf9c5 // vpor xmm0, xmm0, xmm4
|
||||
LONG $0xc5ebf9c5 // vpor xmm0, xmm0, xmm5
|
||||
LONG $0xc7dbf9c5 // vpand xmm0, xmm0, xmm7
|
||||
LONG $0xd0d7f9c5 // vpmovmskb edx, xmm0
|
||||
WORD $0xd285 // test edx, edx
|
||||
JNE LBB0_3
|
||||
LONG $0x10c78348 // add rdi, 16
|
||||
LONG $0x01c18348 // add rcx, 1
|
||||
WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
|
||||
JB LBB0_2
|
||||
|
||||
LBB0_5:
|
||||
LONG $0x04e0c148 // shl rax, 4
|
||||
JMP LBB0_6
|
||||
|
||||
LBB0_3:
|
||||
WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx
|
||||
|
||||
LBB0_6:
|
||||
MOVQ AX, ret+16(FP)
|
||||
RET
|
||||
|
||||
DATA LCDATA2<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0
|
||||
DATA LCDATA2<>+0x008(SB)/8, $0x2222222222222222
|
||||
DATA LCDATA2<>+0x010(SB)/8, $0xfefefefefefefeff
|
||||
DATA LCDATA2<>+0x018(SB)/8, $0x5c5c5c5c5c5c5c5c
|
||||
DATA LCDATA2<>+0x020(SB)/8, $0x3c3c3c3c3c3c3c3c
|
||||
DATA LCDATA2<>+0x028(SB)/8, $0x3e3e3e3e3e3e3e3e
|
||||
DATA LCDATA2<>+0x030(SB)/8, $0x2626262626262626
|
||||
DATA LCDATA2<>+0x038(SB)/8, $0x0000000000000000
|
||||
DATA LCDATA2<>+0x040(SB)/8, $0x8080808080808080
|
||||
DATA LCDATA2<>+0x048(SB)/8, $0x8080808080808080
|
||||
DATA LCDATA2<>+0x050(SB)/8, $0x8080808080808080
|
||||
DATA LCDATA2<>+0x058(SB)/8, $0x8080808080808080
|
||||
GLOBL LCDATA2<>(SB), 8, $96
|
||||
|
||||
TEXT ·_findEscapeIndex256(SB), $0-24
|
||||
|
||||
MOVQ buf+0(FP), DI
|
||||
MOVQ len+8(FP), SI
|
||||
LEAQ LCDATA2<>(SB), BP
|
||||
|
||||
WORD $0xf089 // mov eax, esi
|
||||
WORD $0xf8c1; BYTE $0x1f // sar eax, 31
|
||||
WORD $0xe8c1; BYTE $0x1b // shr eax, 27
|
||||
WORD $0xf001 // add eax, esi
|
||||
WORD $0xf8c1; BYTE $0x05 // sar eax, 5
|
||||
WORD $0x9848 // cdqe
|
||||
WORD $0xc683; BYTE $0x1f // add esi, 31
|
||||
WORD $0xfe83; BYTE $0x3f // cmp esi, 63
|
||||
JB LBB1_5
|
||||
LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI1_0] */
|
||||
LONG $0x597de2c4; WORD $0x084d // vpbroadcastq ymm1, qword 8[rbp] /* [rip + .LCPI1_1] */
|
||||
LONG $0x597de2c4; WORD $0x1055 // vpbroadcastq ymm2, qword 16[rbp] /* [rip + .LCPI1_2] */
|
||||
LONG $0x597de2c4; WORD $0x185d // vpbroadcastq ymm3, qword 24[rbp] /* [rip + .LCPI1_3] */
|
||||
LONG $0x597de2c4; WORD $0x2065 // vpbroadcastq ymm4, qword 32[rbp] /* [rip + .LCPI1_4] */
|
||||
LONG $0x597de2c4; WORD $0x286d // vpbroadcastq ymm5, qword 40[rbp] /* [rip + .LCPI1_5] */
|
||||
WORD $0xc931 // xor ecx, ecx
|
||||
LONG $0x597de2c4; WORD $0x3075 // vpbroadcastq ymm6, qword 48[rbp] /* [rip + .LCPI1_6] */
|
||||
LONG $0x7d6ffdc5; BYTE $0x40 // vmovdqa ymm7, yword 64[rbp] /* [rip + .LCPI1_7] */
|
||||
|
||||
LBB1_2:
|
||||
LONG $0x076f7ec5 // vmovdqu ymm8, yword [rdi]
|
||||
LONG $0xc8d43dc5 // vpaddq ymm9, ymm8, ymm0
|
||||
LONG $0xd1ef3dc5 // vpxor ymm10, ymm8, ymm1
|
||||
LONG $0xd2d42dc5 // vpaddq ymm10, ymm10, ymm2
|
||||
LONG $0xdbef3dc5 // vpxor ymm11, ymm8, ymm3
|
||||
LONG $0xdad425c5 // vpaddq ymm11, ymm11, ymm2
|
||||
LONG $0xe4ef3dc5 // vpxor ymm12, ymm8, ymm4
|
||||
LONG $0xe2d41dc5 // vpaddq ymm12, ymm12, ymm2
|
||||
LONG $0xeb2541c4; BYTE $0xdc // vpor ymm11, ymm11, ymm12
|
||||
LONG $0xe5ef3dc5 // vpxor ymm12, ymm8, ymm5
|
||||
LONG $0xe2d41dc5 // vpaddq ymm12, ymm12, ymm2
|
||||
LONG $0xeeef3dc5 // vpxor ymm13, ymm8, ymm6
|
||||
LONG $0xead415c5 // vpaddq ymm13, ymm13, ymm2
|
||||
LONG $0xeb1d41c4; BYTE $0xe5 // vpor ymm12, ymm12, ymm13
|
||||
LONG $0xeb3541c4; BYTE $0xc0 // vpor ymm8, ymm9, ymm8
|
||||
LONG $0xeb3d41c4; BYTE $0xc2 // vpor ymm8, ymm8, ymm10
|
||||
LONG $0xeb3d41c4; BYTE $0xc3 // vpor ymm8, ymm8, ymm11
|
||||
LONG $0xeb3d41c4; BYTE $0xc4 // vpor ymm8, ymm8, ymm12
|
||||
LONG $0xc7db3dc5 // vpand ymm8, ymm8, ymm7
|
||||
LONG $0xd77dc1c4; BYTE $0xd0 // vpmovmskb edx, ymm8
|
||||
WORD $0xd285 // test edx, edx
|
||||
JNE LBB1_3
|
||||
LONG $0x20c78348 // add rdi, 32
|
||||
LONG $0x01c18348 // add rcx, 1
|
||||
WORD $0x3948; BYTE $0xc1 // cmp rcx, rax
|
||||
JB LBB1_2
|
||||
|
||||
LBB1_5:
|
||||
LONG $0x05e0c148 // shl rax, 5
|
||||
JMP LBB1_6
|
||||
|
||||
LBB1_3:
|
||||
WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx
|
||||
|
||||
LBB1_6:
|
||||
VZEROUPPER
|
||||
MOVQ AX, ret+16(FP)
|
||||
RET
|
Loading…
Reference in New Issue