From 390aa2d0ea348af496a861e9d5471badf3468b56 Mon Sep 17 00:00:00 2001 From: Masaaki Goshima Date: Tue, 28 Dec 2021 01:37:33 +0900 Subject: [PATCH] Fix SIMD API --- internal/cmd/generator/Dockerfile.simd | 1 - internal/cmd/generator/Makefile | 2 +- internal/cmd/generator/simd/string.c | 45 ++- internal/cmd/generator/simd/string.h | 8 - internal/cmd/generator/simd/string_avx.go | 4 + internal/encoder/string.go | 26 +- internal/encoder/string_avx.go | 4 + internal/encoder/string_avx.s | 386 +++++++++++++++++++--- 8 files changed, 385 insertions(+), 91 deletions(-) delete mode 100644 internal/cmd/generator/simd/string.h diff --git a/internal/cmd/generator/Dockerfile.simd b/internal/cmd/generator/Dockerfile.simd index d50be5a..6f9cc7f 100644 --- a/internal/cmd/generator/Dockerfile.simd +++ b/internal/cmd/generator/Dockerfile.simd @@ -11,7 +11,6 @@ RUN go install \ github.com/minio/c2goasm \ github.com/klauspost/asmfmt/cmd/asmfmt -COPY ./simd/string.h ./string.h COPY ./simd/string.c ./string.c RUN clang -S \ diff --git a/internal/cmd/generator/Makefile b/internal/cmd/generator/Makefile index fc9e967..f1e0c86 100644 --- a/internal/cmd/generator/Makefile +++ b/internal/cmd/generator/Makefile @@ -5,4 +5,4 @@ asm: .PHONY: generate generate: docker build -f Dockerfile.simd -t go-json-simd . - docker run --rm -v "$(CURDIR):/tmp" go-json-simd bash -c "cp /work/string_avx.s /tmp/string_avx.s" + docker run --rm -v "$(CURDIR)/simd:/tmp" go-json-simd bash -c "cp /work/string_avx.s /tmp/string_avx.s" diff --git a/internal/cmd/generator/simd/string.c b/internal/cmd/generator/simd/string.c index 0351095..5d59f92 100644 --- a/internal/cmd/generator/simd/string.c +++ b/internal/cmd/generator/simd/string.c @@ -1,6 +1,34 @@ -#include "string.h" +#include +#include +#include #include +uint64_t findEscapeIndex64(char *buf, int len) { + static const uint64_t lsb = 0x0101010101010101; + static const uint64_t msb = 0x8080808080808080; + + static const uint64_t space = lsb * 0x20; + static const uint64_t quote = lsb * '"'; + static const uint64_t escape = lsb * '\\'; + static const uint64_t lt = lsb * '<'; + static const uint64_t gt = lsb * '>'; + static const uint64_t amp = lsb * '&'; + + char *sp = buf; + size_t chunkLen = len / 8; + int chunkIdx = 0; + for (; chunkIdx < chunkLen; chunkIdx++) { + uint64_t n = *(uint64_t *)sp; + uint64_t mask = n | (n - space) | ((n ^ quote) - lsb) | ((n ^ escape) - lsb) | ((n ^ lt) - lsb) | ((n ^ gt) - lsb) | ((n ^ amp) - lsb); + uint64_t masked = mask & msb; + if (masked != 0) { + return __builtin_ctz(masked); + } + sp += 8; + } + return 8 * chunkLen; +} + uint64_t findEscapeIndex128(char *buf, int len) { static const uint64_t lsb = 0x0101010101010101; static const uint64_t msb = 0x8080808080808080; @@ -41,7 +69,11 @@ uint64_t findEscapeIndex128(char *buf, int len) { } sp += 16; } - return 16 * chunkLen; + int idx = 16 * chunkLen; + if (len - idx >= 8) { + return findEscapeIndex64(sp, len - idx); + } + return idx; } uint64_t findEscapeIndex256(char *buf, int len) { @@ -84,5 +116,12 @@ uint64_t findEscapeIndex256(char *buf, int len) { } sp += 32; } - return 32 * chunkLen; + int idx = 32 * chunkLen; + int remainLen = len - idx; + if (remainLen >= 16) { + return findEscapeIndex128(sp, remainLen); + } else if (remainLen >= 8) { + return findEscapeIndex64(sp, remainLen); + } + return idx; } diff --git a/internal/cmd/generator/simd/string.h b/internal/cmd/generator/simd/string.h deleted file mode 100644 index e7fda34..0000000 --- a/internal/cmd/generator/simd/string.h +++ /dev/null @@ -1,8 +0,0 @@ -#include -#include -#include - -typedef struct GoString { - char *buf; - size_t len; -} GoString; diff --git a/internal/cmd/generator/simd/string_avx.go b/internal/cmd/generator/simd/string_avx.go index ece014f..75ce996 100644 --- a/internal/cmd/generator/simd/string_avx.go +++ b/internal/cmd/generator/simd/string_avx.go @@ -2,6 +2,10 @@ package encoder import "unsafe" +//go:nosplit +//go:noescape +func _findEscapeIndex64(buf unsafe.Pointer, len int) (ret int) + //go:nosplit //go:noescape func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int) diff --git a/internal/encoder/string.go b/internal/encoder/string.go index e8e57d4..c146b06 100644 --- a/internal/encoder/string.go +++ b/internal/encoder/string.go @@ -374,36 +374,12 @@ func AppendString(ctx *RuntimeContext, buf []byte, s string) []byte { switch valLen { case 1, 2, 3, 4, 5, 6, 7: case 8, 9, 10, 11, 12, 13, 14, 15: - chunks := stringToUint64Slice(s) - for _, n := range chunks { - // combine masks before checking for the MSB of each byte. We include - // `n` in the mask to check whether any of the *input* byte MSBs were - // set (i.e. the byte was outside the ASCII range). - mask := n | (n - (lsb * 0x20)) | - ((n ^ (lsb * '"')) - lsb) | - ((n ^ (lsb * '\\')) - lsb) | - ((n ^ (lsb * '<')) - lsb) | - ((n ^ (lsb * '>')) - lsb) | - ((n ^ (lsb * '&')) - lsb) - if (mask & msb) != 0 { - j = bits.TrailingZeros64(mask&msb) / 8 - goto ESCAPE_END - } - } - for i := len(chunks) * 8; i < valLen; i++ { - if needEscapeWithHTML[s[i]] { - j = i - goto ESCAPE_END - } - } - // no found any escape characters. - return append(append(buf, s...), '"') + j = _findEscapeIndex64((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) case 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31: j = _findEscapeIndex128((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) default: j = _findEscapeIndex256((*runtime.SliceHeader)(unsafe.Pointer(&s)).Data, len(s)) } -ESCAPE_END: for j < valLen { c := s[j] diff --git a/internal/encoder/string_avx.go b/internal/encoder/string_avx.go index ece014f..75ce996 100644 --- a/internal/encoder/string_avx.go +++ b/internal/encoder/string_avx.go @@ -2,6 +2,10 @@ package encoder import "unsafe" +//go:nosplit +//go:noescape +func _findEscapeIndex64(buf unsafe.Pointer, len int) (ret int) + //go:nosplit //go:noescape func _findEscapeIndex128(buf unsafe.Pointer, len int) (ret int) diff --git a/internal/encoder/string_avx.s b/internal/encoder/string_avx.s index 99b4829..d365c3b 100644 --- a/internal/encoder/string_avx.s +++ b/internal/encoder/string_avx.s @@ -1,6 +1,70 @@ //+build !noasm !appengine // AUTO-GENERATED BY C2GOASM -- DO NOT EDIT +TEXT ·_findEscapeIndex64(SB), $0-24 + + MOVQ buf+0(FP), DI + MOVQ len+8(FP), SI + + WORD $0xf089 // mov eax, esi + WORD $0xf8c1; BYTE $0x1f // sar eax, 31 + WORD $0xe8c1; BYTE $0x1d // shr eax, 29 + WORD $0xf001 // add eax, esi + WORD $0xf8c1; BYTE $0x03 // sar eax, 3 + WORD $0x9848 // cdqe + WORD $0xc683; BYTE $0x07 // add esi, 7 + WORD $0xfe83; BYTE $0x0f // cmp esi, 15 + JB LBB0_5 + QUAD $0xfefefefefeffbd49; WORD $0xfefe // mov r13, -72340172838076673 + WORD $0xd231 // xor edx, edx + QUAD $0x222222222222b949; WORD $0x2222 // mov r9, 2459565876494606882 + QUAD $0x5c5c5c5c5c5cba49; WORD $0x5c5c // mov r10, 6655295901103053916 + QUAD $0x3c3c3c3c3c3cbb49; WORD $0x3c3c // mov r11, 4340410370284600380 + QUAD $0x3e3e3e3e3e3ebe49; WORD $0x3e3e // mov r14, 4485090715960753726 + QUAD $0x262626262626bf49; WORD $0x2626 // mov r15, 2748926567846913574 + QUAD $0x808080808080bc49; WORD $0x8080 // mov r12, -9187201950435737472 + +LBB0_2: + LONG $0xd71c8b48 // mov rbx, qword [rdi + 8*rdx] + QUAD $0xdfdfdfdfdfe0b948; WORD $0xdfdf // mov rcx, -2314885530818453536 + LONG $0x0b348d48 // lea rsi, [rbx + rcx] + WORD $0x0948; BYTE $0xde // or rsi, rbx + WORD $0x8948; BYTE $0xd9 // mov rcx, rbx + WORD $0x314c; BYTE $0xc9 // xor rcx, r9 + WORD $0x014c; BYTE $0xe9 // add rcx, r13 + WORD $0x0948; BYTE $0xf1 // or rcx, rsi + WORD $0x8948; BYTE $0xde // mov rsi, rbx + WORD $0x314c; BYTE $0xd6 // xor rsi, r10 + WORD $0x014c; BYTE $0xee // add rsi, r13 + WORD $0x8949; BYTE $0xd8 // mov r8, rbx + WORD $0x314d; BYTE $0xd8 // xor r8, r11 + WORD $0x014d; BYTE $0xe8 // add r8, r13 + WORD $0x0949; BYTE $0xf0 // or r8, rsi + WORD $0x0949; BYTE $0xc8 // or r8, rcx + WORD $0x8948; BYTE $0xd9 // mov rcx, rbx + WORD $0x314c; BYTE $0xf1 // xor rcx, r14 + WORD $0x014c; BYTE $0xe9 // add rcx, r13 + WORD $0x314c; BYTE $0xfb // xor rbx, r15 + WORD $0x014c; BYTE $0xeb // add rbx, r13 + WORD $0x0948; BYTE $0xcb // or rbx, rcx + WORD $0x094c; BYTE $0xc3 // or rbx, r8 + WORD $0x214c; BYTE $0xe3 // and rbx, r12 + JNE LBB0_3 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xc2 // cmp rdx, rax + JB LBB0_2 + +LBB0_5: + LONG $0x03e0c148 // shl rax, 3 + JMP LBB0_6 + +LBB0_3: + WORD $0xbc0f; BYTE $0xc3 // bsf eax, ebx + +LBB0_6: + MOVQ AX, ret+16(FP) + RET + DATA LCDATA1<>+0x000(SB)/8, $0xdfdfdfdfdfdfdfe0 DATA LCDATA1<>+0x008(SB)/8, $0xdfdfdfdfdfdfdfe0 DATA LCDATA1<>+0x010(SB)/8, $0x2222222222222222 @@ -30,21 +94,21 @@ TEXT ·_findEscapeIndex128(SB), $0-24 WORD $0xe8c1; BYTE $0x1c // shr eax, 28 WORD $0xf001 // add eax, esi WORD $0xf8c1; BYTE $0x04 // sar eax, 4 - WORD $0x9848 // cdqe - WORD $0xc683; BYTE $0x0f // add esi, 15 - WORD $0xfe83; BYTE $0x1f // cmp esi, 31 - JB LBB0_5 - WORD $0xc931 // xor ecx, ecx - LONG $0x456f79c5; BYTE $0x00 // vmovdqa xmm8, oword 0[rbp] /* [rip + .LCPI0_0] */ - LONG $0x4d6f79c5; BYTE $0x10 // vmovdqa xmm9, oword 16[rbp] /* [rip + .LCPI0_1] */ - LONG $0x556ff9c5; BYTE $0x20 // vmovdqa xmm2, oword 32[rbp] /* [rip + .LCPI0_2] */ - LONG $0x556f79c5; BYTE $0x30 // vmovdqa xmm10, oword 48[rbp] /* [rip + .LCPI0_3] */ - LONG $0x5d6f79c5; BYTE $0x40 // vmovdqa xmm11, oword 64[rbp] /* [rip + .LCPI0_4] */ - LONG $0x656f79c5; BYTE $0x50 // vmovdqa xmm12, oword 80[rbp] /* [rip + .LCPI0_5] */ - LONG $0x6d6f79c5; BYTE $0x60 // vmovdqa xmm13, oword 96[rbp] /* [rip + .LCPI0_6] */ - LONG $0x7d6ff9c5; BYTE $0x70 // vmovdqa xmm7, oword 112[rbp] /* [rip + .LCPI0_7] */ + WORD $0x4e8d; BYTE $0x0f // lea ecx, [rsi + 15] + WORD $0xf983; BYTE $0x1f // cmp ecx, 31 + JB LBB1_5 + WORD $0x6348; BYTE $0xc8 // movsxd rcx, eax + WORD $0xd231 // xor edx, edx + LONG $0x456f79c5; BYTE $0x00 // vmovdqa xmm8, oword 0[rbp] /* [rip + .LCPI1_0] */ + LONG $0x4d6f79c5; BYTE $0x10 // vmovdqa xmm9, oword 16[rbp] /* [rip + .LCPI1_1] */ + LONG $0x556ff9c5; BYTE $0x20 // vmovdqa xmm2, oword 32[rbp] /* [rip + .LCPI1_2] */ + LONG $0x556f79c5; BYTE $0x30 // vmovdqa xmm10, oword 48[rbp] /* [rip + .LCPI1_3] */ + LONG $0x5d6f79c5; BYTE $0x40 // vmovdqa xmm11, oword 64[rbp] /* [rip + .LCPI1_4] */ + LONG $0x656f79c5; BYTE $0x50 // vmovdqa xmm12, oword 80[rbp] /* [rip + .LCPI1_5] */ + LONG $0x6d6f79c5; BYTE $0x60 // vmovdqa xmm13, oword 96[rbp] /* [rip + .LCPI1_6] */ + LONG $0x7d6ff9c5; BYTE $0x70 // vmovdqa xmm7, oword 112[rbp] /* [rip + .LCPI1_7] */ -LBB0_2: +LBB1_2: LONG $0x076ffac5 // vmovdqu xmm0, oword [rdi] LONG $0xd479c1c4; BYTE $0xc8 // vpaddq xmm1, xmm0, xmm8 LONG $0xef79c1c4; BYTE $0xd9 // vpxor xmm3, xmm0, xmm9 @@ -64,22 +128,75 @@ LBB0_2: LONG $0xc4ebf9c5 // vpor xmm0, xmm0, xmm4 LONG $0xc5ebf9c5 // vpor xmm0, xmm0, xmm5 LONG $0xc7dbf9c5 // vpand xmm0, xmm0, xmm7 - LONG $0xd0d7f9c5 // vpmovmskb edx, xmm0 - WORD $0xd285 // test edx, edx - JNE LBB0_3 + LONG $0xd8d7f9c5 // vpmovmskb ebx, xmm0 + WORD $0xdb85 // test ebx, ebx + JNE LBB1_3 LONG $0x10c78348 // add rdi, 16 - LONG $0x01c18348 // add rcx, 1 - WORD $0x3948; BYTE $0xc1 // cmp rcx, rax - JB LBB0_2 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xca // cmp rdx, rcx + JB LBB1_2 -LBB0_5: - LONG $0x04e0c148 // shl rax, 4 - JMP LBB0_6 +LBB1_5: + WORD $0xe0c1; BYTE $0x04 // shl eax, 4 + WORD $0xc629 // sub esi, eax + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB1_11 + QUAD $0xfefefefefeffba49; WORD $0xfefe // mov r10, -72340172838076673 + WORD $0x8941; BYTE $0xf5 // mov r13d, esi + LONG $0x03edc141 // shr r13d, 3 + WORD $0xd231 // xor edx, edx + QUAD $0x3c3c3c3c3c3cbb49; WORD $0x3c3c // mov r11, 4340410370284600380 + QUAD $0x3e3e3e3e3e3ebe49; WORD $0x3e3e // mov r14, 4485090715960753726 + QUAD $0x262626262626bf49; WORD $0x2626 // mov r15, 2748926567846913574 + QUAD $0x808080808080bc49; WORD $0x8080 // mov r12, -9187201950435737472 -LBB0_3: - WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx +LBB1_7: + LONG $0xd71c8b48 // mov rbx, qword [rdi + 8*rdx] + QUAD $0xdfdfdfdfdfe0b948; WORD $0xdfdf // mov rcx, -2314885530818453536 + LONG $0x0b0c8d48 // lea rcx, [rbx + rcx] + WORD $0x0948; BYTE $0xd9 // or rcx, rbx + WORD $0x8949; BYTE $0xd8 // mov r8, rbx + QUAD $0x222222222222b848; WORD $0x2222 // mov rax, 2459565876494606882 + WORD $0x3149; BYTE $0xc0 // xor r8, rax + WORD $0x014d; BYTE $0xd0 // add r8, r10 + WORD $0x0949; BYTE $0xc8 // or r8, rcx + WORD $0x8948; BYTE $0xd9 // mov rcx, rbx + QUAD $0x5c5c5c5c5c5cb848; WORD $0x5c5c // mov rax, 6655295901103053916 + WORD $0x3148; BYTE $0xc1 // xor rcx, rax + WORD $0x014c; BYTE $0xd1 // add rcx, r10 + WORD $0x8949; BYTE $0xd9 // mov r9, rbx + WORD $0x314d; BYTE $0xd9 // xor r9, r11 + WORD $0x014d; BYTE $0xd1 // add r9, r10 + WORD $0x0949; BYTE $0xc9 // or r9, rcx + WORD $0x094d; BYTE $0xc1 // or r9, r8 + WORD $0x8948; BYTE $0xd9 // mov rcx, rbx + WORD $0x314c; BYTE $0xf1 // xor rcx, r14 + WORD $0x014c; BYTE $0xd1 // add rcx, r10 + WORD $0x314c; BYTE $0xfb // xor rbx, r15 + WORD $0x014c; BYTE $0xd3 // add rbx, r10 + WORD $0x0948; BYTE $0xcb // or rbx, rcx + WORD $0x094c; BYTE $0xcb // or rbx, r9 + WORD $0x214c; BYTE $0xe3 // and rbx, r12 + JNE LBB1_8 + LONG $0x01c28348 // add rdx, 1 + WORD $0x394c; BYTE $0xea // cmp rdx, r13 + JB LBB1_7 + WORD $0xe683; BYTE $0xf8 // and esi, -8 + JMP LBB1_12 -LBB0_6: +LBB1_11: + WORD $0x6348; BYTE $0xf0 // movsxd rsi, eax + JMP LBB1_12 + +LBB1_3: + WORD $0xbc0f; BYTE $0xf3 // bsf esi, ebx + JMP LBB1_12 + +LBB1_8: + WORD $0xbc0f; BYTE $0xf3 // bsf esi, ebx + +LBB1_12: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi MOVQ AX, ret+16(FP) RET @@ -95,7 +212,23 @@ DATA LCDATA2<>+0x040(SB)/8, $0x8080808080808080 DATA LCDATA2<>+0x048(SB)/8, $0x8080808080808080 DATA LCDATA2<>+0x050(SB)/8, $0x8080808080808080 DATA LCDATA2<>+0x058(SB)/8, $0x8080808080808080 -GLOBL LCDATA2<>(SB), 8, $96 +DATA LCDATA2<>+0x060(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA2<>+0x068(SB)/8, $0xdfdfdfdfdfdfdfe0 +DATA LCDATA2<>+0x070(SB)/8, $0x2222222222222222 +DATA LCDATA2<>+0x078(SB)/8, $0x2222222222222222 +DATA LCDATA2<>+0x080(SB)/8, $0xfefefefefefefeff +DATA LCDATA2<>+0x088(SB)/8, $0xfefefefefefefeff +DATA LCDATA2<>+0x090(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA2<>+0x098(SB)/8, $0x5c5c5c5c5c5c5c5c +DATA LCDATA2<>+0x0a0(SB)/8, $0x3c3c3c3c3c3c3c3c +DATA LCDATA2<>+0x0a8(SB)/8, $0x3c3c3c3c3c3c3c3c +DATA LCDATA2<>+0x0b0(SB)/8, $0x3e3e3e3e3e3e3e3e +DATA LCDATA2<>+0x0b8(SB)/8, $0x3e3e3e3e3e3e3e3e +DATA LCDATA2<>+0x0c0(SB)/8, $0x2626262626262626 +DATA LCDATA2<>+0x0c8(SB)/8, $0x2626262626262626 +DATA LCDATA2<>+0x0d0(SB)/8, $0x8080808080808080 +DATA LCDATA2<>+0x0d8(SB)/8, $0x8080808080808080 +GLOBL LCDATA2<>(SB), 8, $224 TEXT ·_findEscapeIndex256(SB), $0-24 @@ -108,21 +241,21 @@ TEXT ·_findEscapeIndex256(SB), $0-24 WORD $0xe8c1; BYTE $0x1b // shr eax, 27 WORD $0xf001 // add eax, esi WORD $0xf8c1; BYTE $0x05 // sar eax, 5 - WORD $0x9848 // cdqe - WORD $0xc683; BYTE $0x1f // add esi, 31 - WORD $0xfe83; BYTE $0x3f // cmp esi, 63 - JB LBB1_5 - LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI1_0] */ - LONG $0x597de2c4; WORD $0x084d // vpbroadcastq ymm1, qword 8[rbp] /* [rip + .LCPI1_1] */ - LONG $0x597de2c4; WORD $0x1055 // vpbroadcastq ymm2, qword 16[rbp] /* [rip + .LCPI1_2] */ - LONG $0x597de2c4; WORD $0x185d // vpbroadcastq ymm3, qword 24[rbp] /* [rip + .LCPI1_3] */ - LONG $0x597de2c4; WORD $0x2065 // vpbroadcastq ymm4, qword 32[rbp] /* [rip + .LCPI1_4] */ - LONG $0x597de2c4; WORD $0x286d // vpbroadcastq ymm5, qword 40[rbp] /* [rip + .LCPI1_5] */ - WORD $0xc931 // xor ecx, ecx - LONG $0x597de2c4; WORD $0x3075 // vpbroadcastq ymm6, qword 48[rbp] /* [rip + .LCPI1_6] */ - LONG $0x7d6ffdc5; BYTE $0x40 // vmovdqa ymm7, yword 64[rbp] /* [rip + .LCPI1_7] */ + WORD $0x4e8d; BYTE $0x1f // lea ecx, [rsi + 31] + WORD $0xf983; BYTE $0x3f // cmp ecx, 63 + JB LBB2_4 + WORD $0x6348; BYTE $0xc8 // movsxd rcx, eax + LONG $0x597de2c4; WORD $0x0045 // vpbroadcastq ymm0, qword 0[rbp] /* [rip + .LCPI2_0] */ + LONG $0x597de2c4; WORD $0x084d // vpbroadcastq ymm1, qword 8[rbp] /* [rip + .LCPI2_1] */ + LONG $0x597de2c4; WORD $0x1055 // vpbroadcastq ymm2, qword 16[rbp] /* [rip + .LCPI2_2] */ + LONG $0x597de2c4; WORD $0x185d // vpbroadcastq ymm3, qword 24[rbp] /* [rip + .LCPI2_3] */ + LONG $0x597de2c4; WORD $0x2065 // vpbroadcastq ymm4, qword 32[rbp] /* [rip + .LCPI2_4] */ + LONG $0x597de2c4; WORD $0x286d // vpbroadcastq ymm5, qword 40[rbp] /* [rip + .LCPI2_5] */ + WORD $0xd231 // xor edx, edx + LONG $0x597de2c4; WORD $0x3075 // vpbroadcastq ymm6, qword 48[rbp] /* [rip + .LCPI2_6] */ + LONG $0x7d6ffdc5; BYTE $0x40 // vmovdqa ymm7, yword 64[rbp] /* [rip + .LCPI2_7] */ -LBB1_2: +LBB2_2: LONG $0x076f7ec5 // vmovdqu ymm8, yword [rdi] LONG $0xc8d43dc5 // vpaddq ymm9, ymm8, ymm0 LONG $0xd1ef3dc5 // vpxor ymm10, ymm8, ymm1 @@ -142,22 +275,169 @@ LBB1_2: LONG $0xeb3d41c4; BYTE $0xc3 // vpor ymm8, ymm8, ymm11 LONG $0xeb3d41c4; BYTE $0xc4 // vpor ymm8, ymm8, ymm12 LONG $0xc7db3dc5 // vpand ymm8, ymm8, ymm7 - LONG $0xd77dc1c4; BYTE $0xd0 // vpmovmskb edx, ymm8 - WORD $0xd285 // test edx, edx - JNE LBB1_3 + LONG $0xd77dc1c4; BYTE $0xd8 // vpmovmskb ebx, ymm8 + WORD $0xdb85 // test ebx, ebx + JNE LBB2_19 LONG $0x20c78348 // add rdi, 32 - LONG $0x01c18348 // add rcx, 1 - WORD $0x3948; BYTE $0xc1 // cmp rcx, rax - JB LBB1_2 + LONG $0x01c28348 // add rdx, 1 + WORD $0x3948; BYTE $0xca // cmp rdx, rcx + JB LBB2_2 -LBB1_5: - LONG $0x05e0c148 // shl rax, 5 - JMP LBB1_6 +LBB2_4: + WORD $0xe0c1; BYTE $0x05 // shl eax, 5 + WORD $0xc629 // sub esi, eax + WORD $0xfe83; BYTE $0x10 // cmp esi, 16 + JL LBB2_13 + WORD $0xf089 // mov eax, esi + WORD $0xe8c1; BYTE $0x04 // shr eax, 4 + WORD $0xc931 // xor ecx, ecx + LONG $0x456f79c5; BYTE $0x60 // vmovdqa xmm8, oword 96[rbp] /* [rip + .LCPI2_8] */ + LONG $0x4d6f79c5; BYTE $0x70 // vmovdqa xmm9, oword 112[rbp] /* [rip + .LCPI2_9] */ + QUAD $0x00000080956ff9c5 // vmovdqa xmm2, oword 128[rbp] /* [rip + .LCPI2_10] */ + QUAD $0x00000090956f79c5 // vmovdqa xmm10, oword 144[rbp] /* [rip + .LCPI2_11] */ + QUAD $0x000000a09d6f79c5 // vmovdqa xmm11, oword 160[rbp] /* [rip + .LCPI2_12] */ + QUAD $0x000000b0a56f79c5 // vmovdqa xmm12, oword 176[rbp] /* [rip + .LCPI2_13] */ + QUAD $0x000000c0ad6f79c5 // vmovdqa xmm13, oword 192[rbp] /* [rip + .LCPI2_14] */ + QUAD $0x000000d0bd6ff9c5 // vmovdqa xmm7, oword 208[rbp] /* [rip + .LCPI2_15] */ -LBB1_3: - WORD $0xbc0f; BYTE $0xc2 // bsf eax, edx +LBB2_6: + LONG $0x076ffac5 // vmovdqu xmm0, oword [rdi] + LONG $0xd479c1c4; BYTE $0xc8 // vpaddq xmm1, xmm0, xmm8 + LONG $0xef79c1c4; BYTE $0xd9 // vpxor xmm3, xmm0, xmm9 + LONG $0xdad4e1c5 // vpaddq xmm3, xmm3, xmm2 + LONG $0xef79c1c4; BYTE $0xe2 // vpxor xmm4, xmm0, xmm10 + LONG $0xe2d4d9c5 // vpaddq xmm4, xmm4, xmm2 + LONG $0xef79c1c4; BYTE $0xeb // vpxor xmm5, xmm0, xmm11 + LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2 + LONG $0xe5ebd9c5 // vpor xmm4, xmm4, xmm5 + LONG $0xef79c1c4; BYTE $0xec // vpxor xmm5, xmm0, xmm12 + LONG $0xead4d1c5 // vpaddq xmm5, xmm5, xmm2 + LONG $0xef79c1c4; BYTE $0xf5 // vpxor xmm6, xmm0, xmm13 + LONG $0xf2d4c9c5 // vpaddq xmm6, xmm6, xmm2 + LONG $0xeeebd1c5 // vpor xmm5, xmm5, xmm6 + LONG $0xc0ebf1c5 // vpor xmm0, xmm1, xmm0 + LONG $0xc3ebf9c5 // vpor xmm0, xmm0, xmm3 + LONG $0xc4ebf9c5 // vpor xmm0, xmm0, xmm4 + LONG $0xc5ebf9c5 // vpor xmm0, xmm0, xmm5 + LONG $0xc7dbf9c5 // vpand xmm0, xmm0, xmm7 + LONG $0xd0d7f9c5 // vpmovmskb edx, xmm0 + WORD $0xd285 // test edx, edx + JNE LBB2_20 + LONG $0x10c78348 // add rdi, 16 + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xc1 // cmp rcx, rax + JB LBB2_6 + WORD $0xf089 // mov eax, esi + WORD $0xe083; BYTE $0xf0 // and eax, -16 + WORD $0xc629 // sub esi, eax + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB2_18 + QUAD $0xfefefefefeffb848; WORD $0xfefe // mov rax, -72340172838076673 + QUAD $0x5c5c5c5c5c5cba49; WORD $0x5c5c // mov r10, 6655295901103053916 + QUAD $0x3e3e3e3e3e3ebb49; WORD $0x3e3e // mov r11, 4485090715960753726 + QUAD $0x3c3c3c3c3c3cbe49; WORD $0x3c3c // mov r14, 4340410370284600380 + QUAD $0x262626262626bf49; WORD $0x2626 // mov r15, 2748926567846913574 + QUAD $0x222222222222bc49; WORD $0x2222 // mov r12, 2459565876494606882 + WORD $0x8941; BYTE $0xf5 // mov r13d, esi + LONG $0x03edc141 // shr r13d, 3 + WORD $0xdb31 // xor ebx, ebx -LBB1_6: +LBB2_10: + LONG $0xdf148b48 // mov rdx, qword [rdi + 8*rbx] + QUAD $0xdfdfdfdfdfe0b948; WORD $0xdfdf // mov rcx, -2314885530818453536 + LONG $0x0a0c8d48 // lea rcx, [rdx + rcx] + WORD $0x0948; BYTE $0xd1 // or rcx, rdx + WORD $0x8949; BYTE $0xd0 // mov r8, rdx + WORD $0x314d; BYTE $0xe0 // xor r8, r12 + WORD $0x0149; BYTE $0xc0 // add r8, rax + WORD $0x0949; BYTE $0xc8 // or r8, rcx + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + WORD $0x314c; BYTE $0xd1 // xor rcx, r10 + WORD $0x0148; BYTE $0xc1 // add rcx, rax + WORD $0x8949; BYTE $0xd1 // mov r9, rdx + WORD $0x314d; BYTE $0xf1 // xor r9, r14 + WORD $0x0149; BYTE $0xc1 // add r9, rax + WORD $0x0949; BYTE $0xc9 // or r9, rcx + WORD $0x094d; BYTE $0xc1 // or r9, r8 + WORD $0x8948; BYTE $0xd1 // mov rcx, rdx + WORD $0x314c; BYTE $0xd9 // xor rcx, r11 + WORD $0x0148; BYTE $0xc1 // add rcx, rax + WORD $0x314c; BYTE $0xfa // xor rdx, r15 + WORD $0x0148; BYTE $0xc2 // add rdx, rax + WORD $0x0948; BYTE $0xca // or rdx, rcx + WORD $0x094c; BYTE $0xca // or rdx, r9 + QUAD $0x808080808080b948; WORD $0x8080 // mov rcx, -9187201950435737472 + WORD $0x2148; BYTE $0xca // and rdx, rcx + JNE LBB2_22 + LONG $0x01c38348 // add rbx, 1 + WORD $0x394c; BYTE $0xeb // cmp rbx, r13 + JB LBB2_10 + WORD $0xe683; BYTE $0xf8 // and esi, -8 + JMP LBB2_23 + +LBB2_13: + WORD $0xfe83; BYTE $0x08 // cmp esi, 8 + JL LBB2_18 + QUAD $0xfefefefefeffbd49; WORD $0xfefe // mov r13, -72340172838076673 + QUAD $0x808080808080b949; WORD $0x8080 // mov r9, -9187201950435737472 + QUAD $0x5c5c5c5c5c5cba49; WORD $0x5c5c // mov r10, 6655295901103053916 + QUAD $0x3e3e3e3e3e3ebb49; WORD $0x3e3e // mov r11, 4485090715960753726 + QUAD $0x3c3c3c3c3c3cbe49; WORD $0x3c3c // mov r14, 4340410370284600380 + QUAD $0x262626262626bf49; WORD $0x2626 // mov r15, 2748926567846913574 + QUAD $0x222222222222bc49; WORD $0x2222 // mov r12, 2459565876494606882 + WORD $0xeec1; BYTE $0x03 // shr esi, 3 + WORD $0xe683; BYTE $0x1f // and esi, 31 + WORD $0xc931 // xor ecx, ecx + +LBB2_15: + LONG $0xcf148b48 // mov rdx, qword [rdi + 8*rcx] + QUAD $0xdfdfdfdfdfe0b848; WORD $0xdfdf // mov rax, -2314885530818453536 + LONG $0x021c8d48 // lea rbx, [rdx + rax] + WORD $0x0948; BYTE $0xd3 // or rbx, rdx + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + WORD $0x314c; BYTE $0xe0 // xor rax, r12 + WORD $0x014c; BYTE $0xe8 // add rax, r13 + WORD $0x0948; BYTE $0xd8 // or rax, rbx + WORD $0x8948; BYTE $0xd3 // mov rbx, rdx + WORD $0x314c; BYTE $0xd3 // xor rbx, r10 + WORD $0x014c; BYTE $0xeb // add rbx, r13 + WORD $0x8949; BYTE $0xd0 // mov r8, rdx + WORD $0x314d; BYTE $0xf0 // xor r8, r14 + WORD $0x014d; BYTE $0xe8 // add r8, r13 + WORD $0x0949; BYTE $0xd8 // or r8, rbx + WORD $0x0949; BYTE $0xc0 // or r8, rax + WORD $0x8948; BYTE $0xd0 // mov rax, rdx + WORD $0x314c; BYTE $0xd8 // xor rax, r11 + WORD $0x014c; BYTE $0xe8 // add rax, r13 + WORD $0x314c; BYTE $0xfa // xor rdx, r15 + WORD $0x014c; BYTE $0xea // add rdx, r13 + WORD $0x0948; BYTE $0xc2 // or rdx, rax + WORD $0x094c; BYTE $0xc2 // or rdx, r8 + WORD $0x214c; BYTE $0xca // and rdx, r9 + JNE LBB2_22 + LONG $0x01c18348 // add rcx, 1 + WORD $0x3948; BYTE $0xf1 // cmp rcx, rsi + JB LBB2_15 + WORD $0xe6c1; BYTE $0x03 // shl esi, 3 + JMP LBB2_23 + +LBB2_18: + WORD $0x6348; BYTE $0xf0 // movsxd rsi, eax + JMP LBB2_23 + +LBB2_19: + WORD $0xbc0f; BYTE $0xf3 // bsf esi, ebx + JMP LBB2_23 + +LBB2_20: + WORD $0xbc0f; BYTE $0xf2 // bsf esi, edx + JMP LBB2_23 + +LBB2_22: + WORD $0xbc0f; BYTE $0xf2 // bsf esi, edx + +LBB2_23: + WORD $0x8948; BYTE $0xf0 // mov rax, rsi VZEROUPPER MOVQ AX, ret+16(FP) RET