Add AVX2 assembler functions.
Benchmarks on a VM (therefore a bit more noisy):

benchmark                         old ns/op     new ns/op     delta
BenchmarkEncode10x2x10000-8       58372         47421         -18.76%
BenchmarkEncode100x20x10000-8     2635444       1550511       -41.17%
BenchmarkEncode17x3x1M-8          3885495       2231034       -42.58%
BenchmarkEncode10x4x16M-8         24180221      21467661      -11.22%
BenchmarkEncode5x2x1M-8           2395287       2261452       -5.59%
BenchmarkEncode10x2x1M-8          2571278       2566560       -0.18%
BenchmarkEncode10x4x1M-8          3396774       3431916       +1.03%
BenchmarkEncode50x20x1M-8         27004601      20325731      -24.73%
BenchmarkEncode17x3x16M-8         29671393      23668596      -20.23%
BenchmarkVerify10x2x10000-8       109730        101519        -7.48%
BenchmarkVerify50x5x50000-8       3904166       3101568       -20.56%
BenchmarkVerify10x2x1M-8          4398490       4721719       +7.35%
BenchmarkVerify5x2x1M-8           3174574       3296440       +3.84%
BenchmarkVerify10x4x1M-8          5247394       5346667       +1.89%
BenchmarkVerify50x20x1M-8         35742777      26154681      -26.83%
BenchmarkVerify10x4x16M-8         52873512      54931253      +3.89%

benchmark                         old MB/s     new MB/s     speedup
BenchmarkEncode10x2x10000-8       1713.14      2108.73      1.23x
BenchmarkEncode100x20x10000-8     379.44       644.95       1.70x
BenchmarkEncode17x3x1M-8          4587.78      7989.92      1.74x
BenchmarkEncode10x4x16M-8         6938.40      7815.11      1.13x
BenchmarkEncode5x2x1M-8           2188.83      2318.37      1.06x
BenchmarkEncode10x2x1M-8          4078.03      4085.53      1.00x
BenchmarkEncode10x4x1M-8          3086.98      3055.37      0.99x
BenchmarkEncode50x20x1M-8         1941.48      2579.43      1.33x
BenchmarkEncode17x3x16M-8         9612.38      12050.26     1.25x
BenchmarkVerify10x2x10000-8       911.32       985.03       1.08x
BenchmarkVerify50x5x50000-8       1280.68      1612.09      1.26x
BenchmarkVerify10x2x1M-8          2383.94      2220.75      0.93x
BenchmarkVerify5x2x1M-8           1651.52      1590.47      0.96x
BenchmarkVerify10x4x1M-8          1998.28      1961.18      0.98x
BenchmarkVerify50x20x1M-8         1466.84      2004.57      1.37x
BenchmarkVerify10x4x16M-8         3173.09      3054.22      0.96x

master
parent
eef4ac6aa3
commit
627f48f59e
|
@ -9,9 +9,18 @@ import (
|
|||
"github.com/klauspost/cpuid"
|
||||
)
|
||||
|
||||
// galMulSSSE3 is declared here without a body; its implementation is the
// assembly routine ·galMulSSSE3 in galois_amd64.s. The assembly advances
// through in and out 16 bytes at a time and stores each result block to out.
// low and high are passed as mulTableLow[c][:] and mulTableHigh[c][:] by
// galMulSlice.
//go:noescape
|
||||
func galMulSSSE3(low, high, in, out []byte)
|
||||
|
||||
// galMulSSSE3Xor is declared here without a body; its implementation is the
// assembly routine ·galMulSSSE3Xor in galois_amd64.s. The assembly advances
// through in and out 16 bytes at a time and XORs each result block with the
// existing contents of out before storing it. low and high are passed as
// mulTableLow[c][:] and mulTableHigh[c][:] by galMulSliceXor.
//go:noescape
|
||||
func galMulSSSE3Xor(low, high, in, out []byte)
|
||||
|
||||
// galMulAVX2Xor is declared here without a body; its implementation is the
// assembly routine ·galMulAVX2Xor in galois_amd64.s. For each full 32-byte
// block of in it looks up the low nibble of every byte in low and the high
// nibble in high, XORs the two lookups together, XORs that with the existing
// 32 bytes of out and stores the result to out. Only len(in)/32 blocks are
// processed; trailing bytes are left untouched by the assembly.
//go:noescape
|
||||
func galMulAVX2Xor(low, high, in, out []byte)
|
||||
|
||||
// galMulAVX2 is declared here without a body; its implementation is the
// assembly routine ·galMulAVX2 in galois_amd64.s. For each full 32-byte
// block of in it looks up the low nibble of every byte in low and the high
// nibble in high, XORs the two lookups together and stores the result to
// out, overwriting it. Only len(in)/32 blocks are processed; trailing bytes
// are left untouched by the assembly.
//go:noescape
|
||||
func galMulAVX2(low, high, in, out []byte)
|
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/*
|
||||
func galMulSSSE3(low, high, in, out []byte) {
|
||||
|
@ -33,7 +42,10 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
|
|||
|
||||
func galMulSlice(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSSE3() {
|
||||
if cpuid.CPU.AVX2() {
|
||||
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
} else if cpuid.CPU.SSSE3() {
|
||||
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
|
@ -48,7 +60,10 @@ func galMulSlice(c byte, in, out []byte) {
|
|||
|
||||
func galMulSliceXor(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSSE3() {
|
||||
if cpuid.CPU.AVX2() {
|
||||
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
} else if cpuid.CPU.SSSE3() {
|
||||
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
|
|
106
galois_amd64.s
106
galois_amd64.s
|
@ -13,7 +13,7 @@ TEXT ·galMulSSSE3Xor(SB), 7, $0
|
|||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X8
|
||||
PXOR X5, X5
|
||||
PXOR X5, X5
|
||||
MOVQ in+48(FP),SI // R11: &in
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
|
@ -36,7 +36,7 @@ loopback_xor:
|
|||
PXOR X4, X3 // X3: Result xor existing out
|
||||
MOVOU X3, (DX) // Store
|
||||
ADDQ $16, SI // in+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
SUBQ $1, R9
|
||||
JNZ loopback_xor
|
||||
done_xor:
|
||||
|
@ -50,7 +50,7 @@ TEXT ·galMulSSSE3(SB), 7, $0
|
|||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X8
|
||||
PXOR X5, X5
|
||||
PXOR X5, X5
|
||||
MOVQ in+48(FP),SI // R11: &in
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
|
@ -71,10 +71,108 @@ loopback:
|
|||
PXOR X2, X3 // X3: Result
|
||||
MOVOU X3, (DX) // Store
|
||||
ADDQ $16, SI // in+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
SUBQ $1, R9
|
||||
JNZ loopback
|
||||
done:
|
||||
RET
|
||||
|
||||
// func galMulAVX2Xor(low, high, in, out []byte)
// Processes len(in)/32 blocks of 32 bytes. For every input byte the low
// nibble indexes the 16-byte low table and the high nibble indexes the
// 16-byte high table; the two lookups are XORed together, XORed with the
// existing bytes of out and written back to out. Bytes past the last full
// 32-byte block are not touched. The AVX2 instructions are emitted as raw
// BYTE sequences; the YASM source for each sequence is kept in the comment
// directly above it.
|
||||
TEXT ·galMulAVX2Xor(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X5 // X5: low mask (0x0f) in its lowest byte
|
||||
MOVOU (SI), X6 // X6: low
|
||||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)

||||
/*
|
||||
YASM:
|
||||

||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; Y6: low table copied into both 128-bit lanes
|
||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; Y7: high table copied into both 128-bit lanes
|
||||
VPBROADCASTB YMM8, XMM5 ; Y8: lomask (0x0f broadcast to every byte)
|
||||
*/
|
||||

||||
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5;
|
||||

||||
SHRQ $5, R9 // R9: len(in) / 32 (number of 32-byte blocks)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
MOVQ in+48(FP), SI // SI: &in
|
||||
TESTQ R9, R9
|
||||
JZ done_xor_avx2 // no full 32-byte block: nothing to do
|
||||
loopback_xor_avx2:
|
||||
/* Yasm:
|
||||

||||
VMOVDQU YMM0, [rsi] ; Y0: 32 bytes of in
|
||||
VMOVDQU YMM4, [rdx] ; Y4: 32 bytes of existing out
|
||||
VPSRLQ YMM1, YMM0, 4 ; Y1: input shifted right by 4 bits
|
||||
VPAND YMM0, YMM0, YMM8 ; Y0: low nibbles of input
|
||||
VPAND YMM1, YMM1, YMM8 ; Y1: high nibbles of input
|
||||
VPSHUFB YMM2, YMM6, YMM0 ; Y2: mul low part
|
||||
VPSHUFB YMM3, YMM7, YMM1 ; Y3: mul high part
|
||||
VPXOR YMM3, YMM2, YMM3 ; Y3: Result
|
||||
VPXOR YMM4, YMM3, YMM4 ; Y4: Result xor existing out
|
||||
VMOVDQU [rdx], YMM4 ; Store
|
||||
*/
|
||||
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x22;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xdb;BYTE $0xc5;BYTE $0xe5;BYTE $0xef;BYTE $0xe4;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22;
|
||||

||||
ADDQ $32, SI // in+=32
|
||||
ADDQ $32, DX // out+=32
|
||||
SUBQ $1, R9 // one block done
|
||||
JNZ loopback_xor_avx2 // loop while blocks remain
|
||||
done_xor_avx2:
|
||||
// VZEROUPPER
|
||||
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
|
||||
RET
|
||||
|
||||
// func galMulAVX2(low, high, in, out []byte)
// Processes len(in)/32 blocks of 32 bytes. For every input byte the low
// nibble indexes the 16-byte low table and the high nibble indexes the
// 16-byte high table; the two lookups are XORed together and written to
// out, overwriting its previous contents. Bytes past the last full 32-byte
// block are not touched. The AVX2 instructions are emitted as raw BYTE
// sequences; the YASM source for each sequence is kept in the comment
// directly above it.
|
||||
TEXT ·galMulAVX2(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X5 // X5: low mask (0x0f) in its lowest byte
|
||||
MOVOU (SI), X6 // X6: low
|
||||
MOVOU (DX), X7 // X7: high
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)

||||
/*
|
||||
YASM:
|
||||

||||
VINSERTI128 YMM6, YMM6, XMM6, 1 ; Y6: low table copied into both 128-bit lanes
|
||||
VINSERTI128 YMM7, YMM7, XMM7, 1 ; Y7: high table copied into both 128-bit lanes
|
||||
VPBROADCASTB YMM8, XMM5 ; Y8: lomask (0x0f broadcast to every byte)
|
||||
*/
|
||||

||||
BYTE $0xc4;BYTE $0xe3;BYTE $0x4d;BYTE $0x38;BYTE $0xf6;BYTE $0x01;BYTE $0xc4;BYTE $0xe3;BYTE $0x45;BYTE $0x38;BYTE $0xff;BYTE $0x01;BYTE $0xc4;BYTE $0x62;BYTE $0x7d;BYTE $0x78;BYTE $0xc5;
|
||||

||||
SHRQ $5, R9 // R9: len(in) / 32 (number of 32-byte blocks)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
MOVQ in+48(FP), SI // SI: &in
|
||||
TESTQ R9, R9
|
||||
JZ done_avx2 // no full 32-byte block: nothing to do
|
||||
loopback_avx2:
|
||||
/* Yasm:
|
||||

||||
VMOVDQU YMM0, [rsi] ; Y0: 32 bytes of in
|
||||
VPSRLQ YMM1, YMM0, 4 ; Y1: input shifted right by 4 bits
|
||||
VPAND YMM0, YMM0, YMM8 ; Y0: low nibbles of input
|
||||
VPAND YMM1, YMM1, YMM8 ; Y1: high nibbles of input
|
||||
VPSHUFB YMM2, YMM6, YMM0 ; Y2: mul low part
|
||||
VPSHUFB YMM3, YMM7, YMM1 ; Y3: mul high part
|
||||
VPXOR YMM4, YMM2, YMM3 ; Y4: Result
|
||||
VMOVDQU [rdx], YMM4 ; Store
|
||||
*/
|
||||
BYTE $0xc5;BYTE $0xfe;BYTE $0x6f;BYTE $0x06;BYTE $0xc5;BYTE $0xf5;BYTE $0x73;BYTE $0xd0;BYTE $0x04;BYTE $0xc4;BYTE $0xc1;BYTE $0x7d;BYTE $0xdb;BYTE $0xc0;BYTE $0xc4;BYTE $0xc1;BYTE $0x75;BYTE $0xdb;BYTE $0xc8;BYTE $0xc4;BYTE $0xe2;BYTE $0x4d;BYTE $0x00;BYTE $0xd0;BYTE $0xc4;BYTE $0xe2;BYTE $0x45;BYTE $0x00;BYTE $0xd9;BYTE $0xc5;BYTE $0xed;BYTE $0xef;BYTE $0xe3;BYTE $0xc5;BYTE $0xfe;BYTE $0x7f;BYTE $0x22;
|
||||

||||
ADDQ $32, SI // in+=32
|
||||
ADDQ $32, DX // out+=32
|
||||
SUBQ $1, R9 // one block done
|
||||
JNZ loopback_avx2 // loop while blocks remain
|
||||
JMP done_avx2 // NOTE(review): redundant, done_avx2 is the next instruction anyway

||||
done_avx2:
|
||||
// VZEROUPPER
|
||||
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
|
||||
RET
|
||||
|
|
Loading…
Reference in New Issue