// tile38/vendor/github.com/pierrec/lz4/decode_amd64.s
// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"
// AX scratch
// BX scratch
// CX scratch
// DX token
//
// DI  &dst (current write position; advances)
// SI  &src (current read position; advances)
// R8  &dst + len(dst)
// R9  &src + len(src)
// R11 &dst (start of dst; stays fixed)
// R12 short output end
// R13 short input end

// func decodeBlock(dst, src []byte) int
// using ~50 bytes of stack currently
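//
// Go equivalent of the main decode loop, for reference only (a sketch;
// it is not part of the build and elides the copy shortcut and all
// bounds checks):
//
//	for si < len(src) {
//		token := src[si]
//		si++
//		litLen := int(token >> 4)
//		// extend litLen by reading bytes while src[si] == 0xFF
//		copy(dst[di:], src[si:si+litLen])
//		di, si = di+litLen, si+litLen
//		offset := int(src[si]) | int(src[si+1])<<8
//		si += 2
//		mLen := int(token & 0xF)
//		// extend mLen the same way, then add minMatch (4)
//		// copy mLen bytes from dst[di-offset:] to dst[di:] (may overlap)
//		di += mLen
//	}
//	return di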
TEXT ·decodeBlock(SB), NOSPLIT, $64-56
	MOVQ dst_base+0(FP), DI
	MOVQ DI, R11
	MOVQ dst_len+8(FP), R8
	ADDQ DI, R8

	MOVQ src_base+24(FP), SI
	MOVQ src_len+32(FP), R9
	ADDQ SI, R9

	// shortcut ends
	// short output end
	MOVQ R8, R12
	SUBQ $32, R12
	// short input end
	MOVQ R9, R13
	SUBQ $16, R13
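	// These margins make the copy shortcut below safe: with a literal
	// length of at most 14 it blindly writes 16 literal bytes plus up
	// to 18 match bytes (14+18 = 32 bytes past DI), and blindly reads
	// 16 bytes of literals plus, after advancing by up to 14, a 2-byte
	// offset (at most 16 bytes past SI).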
loop:
	// for si < len(src)
	CMPQ SI, R9
	JGE end

	// token := uint32(src[si])
	MOVBQZX (SI), DX
	INCQ SI

	// lit_len = token >> 4
	// if lit_len > 0
	// CX = lit_len
	MOVQ DX, CX
	SHRQ $4, CX

	// take the copy shortcut only if lit_len != 0xF and di/si are
	// clear of the short output/input ends
	CMPQ CX, $0xF
	JEQ lit_len_loop_pre
	CMPQ DI, R12
	JGE lit_len_loop_pre
	CMPQ SI, R13
	JGE lit_len_loop_pre
	// copy shortcut
	// A two-stage shortcut for the most common case:
	// 1) If the literal length is 0..14, and there is enough space,
	// enter the shortcut and copy 16 bytes on behalf of the literals
	// (in the fast mode, only 8 bytes can be safely copied this way).
	// 2) Further, if the match length is 4..18, copy 18 bytes in a
	// similar manner; but we ensure that there's enough space in the
	// output for those 18 bytes earlier, upon entering the shortcut
	// (in other words, there is a combined check for both stages).

	// copy literal
	MOVOU (SI), X0
	MOVOU X0, (DI)
	ADDQ CX, DI
	ADDQ CX, SI

	MOVQ DX, CX
	ANDQ $0xF, CX

	// The second stage: prepare for match copying, decode full info.
	// If it doesn't work out, the info won't be wasted.
	// offset := uint16(src[si]) | uint16(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	MOVQ DI, AX
	SUBQ DX, AX
	CMPQ AX, DI
	JGT err_short_buf

	// if we can't do the second stage then jump straight to read the
	// match length; we already have the offset.
	CMPQ CX, $0xF
	JEQ match_len_loop_pre
	CMPQ DX, $8
	JLT match_len_loop_pre
	CMPQ AX, R11
	JLT err_short_buf

	// memcpy(op + 0, match + 0, 8);
	MOVQ (AX), BX
	MOVQ BX, (DI)
	// memcpy(op + 8, match + 8, 8);
	MOVQ 8(AX), BX
	MOVQ BX, 8(DI)
	// memcpy(op +16, match +16, 2);
	MOVW 16(AX), BX
	MOVW BX, 16(DI)

	ADDQ $4, DI // minmatch
	ADDQ CX, DI

	// shortcut complete, load next token
	JMP loop
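	// Go equivalent of the shortcut above (a sketch, not part of the
	// build; litLen is token>>4 and mLen is token&0xF, both already
	// decoded):
	//
	//	copy(dst[di:di+16], src[si:si+16]) // blind 16-byte literal copy
	//	di, si = di+litLen, si+litLen
	//	offset := int(src[si]) | int(src[si+1])<<8
	//	si += 2
	//	if mLen != 0xF && offset >= 8 {
	//		// blind 18-byte match copy in 8+8+2 chunks; each chunk
	//		// is at most 8 <= offset bytes, so overlap stays safe
	//		copy(dst[di:di+8], dst[di-offset:])
	//		copy(dst[di+8:di+16], dst[di-offset+8:])
	//		copy(dst[di+16:di+18], dst[di-offset+16:])
	//		di += mLen + 4 // minMatch
	//		continue
	//	}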
lit_len_loop_pre:
	// if lit_len > 0
	CMPQ CX, $0
	JEQ offset
	CMPQ CX, $0xF
	JNE copy_literal

lit_len_loop:
	// for src[si] == 0xFF
	CMPB (SI), $0xFF
	JNE lit_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	// lit_len += 0xFF
	ADDQ $0xFF, CX
	INCQ SI
	JMP lit_len_loop

lit_len_finalise:
	// lit_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI
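	// Go equivalent of the variable-length encoding above (a sketch):
	// a nibble of 0xF means the length continues in following bytes,
	// each 0xFF byte adding 255, terminated by the first non-0xFF byte.
	//
	//	for src[si] == 0xFF {
	//		litLen += 0xFF
	//		si++
	//	}
	//	litLen += int(src[si])
	//	si++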
copy_literal:
	// bounds check src and dst
	MOVQ SI, AX
	ADDQ CX, AX
	CMPQ AX, R9
	JGT err_short_buf

	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// what's a good cutoff to call memmove?
	CMPQ CX, $16
	JGT memmove_lit

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_lit

	// if len(src[si:]) < 16
	MOVQ R9, AX
	SUBQ SI, AX
	CMPQ AX, $16
	JLT memmove_lit

	MOVOU (SI), X0
	MOVOU X0, (DI)
	JMP finish_lit_copy
memmove_lit:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ SI, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc SI, DI after
	MOVB DX, 48(SP)
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX
	MOVB 48(SP), DX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13
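	// CALL may clobber any general-purpose register, so the live
	// values (DI, SI, CX, DX) were spilled to the local frame before
	// the call and reloaded above, and the derived pointers (R8, R9,
	// R11, R12, R13) are recomputed from the arguments.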
finish_lit_copy:
	ADDQ CX, SI
	ADDQ CX, DI

	CMPQ SI, R9
	JGE end

offset:
	// CX := mLen
	// free up DX to use for offset
	MOVQ DX, CX

	// bounds check src[si:si+2]
	MOVQ SI, AX
	ADDQ $2, AX
	CMPQ AX, R9
	JGT err_short_buf

	// offset
	// DX := int(src[si]) | int(src[si+1])<<8
	MOVWQZX (SI), DX
	ADDQ $2, SI

	// a zero offset is invalid
	CMPQ DX, $0
	JEQ err_corrupt

	ANDB $0xF, CX
match_len_loop_pre:
	// if match_len != 0xF
	CMPB CX, $0xF
	JNE copy_match

match_len_loop:
	// for src[si] == 0xFF
	// match_len += 0xFF
	CMPB (SI), $0xFF
	JNE match_len_finalise

	// bounds check src[si+1]
	MOVQ SI, AX
	ADDQ $1, AX
	CMPQ AX, R9
	JGT err_short_buf

	ADDQ $0xFF, CX
	INCQ SI
	JMP match_len_loop

match_len_finalise:
	// match_len += int(src[si])
	// si++
	MOVBQZX (SI), AX
	ADDQ AX, CX
	INCQ SI
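	// This mirrors the literal-length extension above: CX now holds
	// the raw match length, and minMatch (4) is added next.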
copy_match:
	// match_len += minMatch
	ADDQ $4, CX

	// check we have match_len bytes left in dst
	// di+match_len < len(dst)
	MOVQ DI, AX
	ADDQ CX, AX
	CMPQ AX, R8
	JGT err_short_buf

	// DX = offset
	// CX = match_len
	// BX = &dst + (di - offset)
	MOVQ DI, BX
	SUBQ DX, BX

	// check BX is within dst
	// if BX < &dst
	CMPQ BX, R11
	JLT err_short_buf

	// if the match ends before di it cannot overlap the bytes being
	// written, so it can be copied in bulk
	// if di-offset+match_len < di (i.e. match_len < offset)
	MOVQ BX, AX
	ADDQ CX, AX
	CMPQ DI, AX
	JGT copy_interior_match

	// TODO(optimisation): if di-offset < 16, copy 16-(di-offset)
	// bytes to di first, then copy the remainder 16 bytes at a time.
copy_match_loop:
	// byte-by-byte copy; the match may overlap the output
	// for match_len > 0
	// dst[di] = dst[bx]
	// di++
	// bx++
	MOVB (BX), AX
	MOVB AX, (DI)
	INCQ DI
	INCQ BX
	DECQ CX
	CMPQ CX, $0
	JGT copy_match_loop

	JMP loop
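	// Go equivalent of the overlapping copy above (a sketch). When
	// offset < match_len the match overlaps the bytes currently being
	// written (this is how LZ4 encodes repeating runs), so the copy
	// must proceed one byte at a time:
	//
	//	for ; mLen > 0; mLen-- {
	//		dst[di] = dst[di-offset]
	//		di++
	//	}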
copy_interior_match:
	CMPQ CX, $16
	JGT memmove_match

	// if len(dst[di:]) < 16
	MOVQ R8, AX
	SUBQ DI, AX
	CMPQ AX, $16
	JLT memmove_match

	MOVOU (BX), X0
	MOVOU X0, (DI)

	ADDQ CX, DI
	JMP loop
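	// A single 16-byte MOVOU is safe here: match_len <= 16, BX <= DI,
	// and len(dst[di:]) >= 16 was just checked, so both the overread
	// from BX and the overwrite at DI stay inside dst.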
memmove_match:
	// memmove(to, from, len)
	MOVQ DI, 0(SP)
	MOVQ BX, 8(SP)
	MOVQ CX, 16(SP)
	// spill
	MOVQ DI, 24(SP)
	MOVQ SI, 32(SP)
	MOVQ CX, 40(SP) // need len to inc DI after
	CALL runtime·memmove(SB)

	// restore registers
	MOVQ 24(SP), DI
	MOVQ 32(SP), SI
	MOVQ 40(SP), CX

	// recalc initial values
	MOVQ dst_base+0(FP), R8
	MOVQ R8, R11 // TODO: make these sensible numbers
	ADDQ dst_len+8(FP), R8
	MOVQ src_base+24(FP), R9
	ADDQ src_len+32(FP), R9
	MOVQ R8, R12
	SUBQ $32, R12
	MOVQ R9, R13
	SUBQ $16, R13

	ADDQ CX, DI
	JMP loop
err_corrupt:
	// corrupt input (e.g. zero offset)
	MOVQ $-1, ret+48(FP)
	RET

err_short_buf:
	// src or dst is too short
	MOVQ $-2, ret+48(FP)
	RET

end:
	// return the number of bytes written to dst
	SUBQ R11, DI
	MOVQ DI, ret+48(FP)
	RET
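
// How a caller might interpret the result (a sketch; the wrapper and
// error shown are illustrative, not necessarily the package's exact
// code):
//
//	func uncompressBlock(src, dst []byte) (int, error) {
//		if di := decodeBlock(dst, src); di >= 0 {
//			return di, nil
//		}
//		return 0, errors.New("lz4: corrupt input or short buffer")
//	}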