Add AVX2 assembler functions.

Benchmarks were run on a VM and are therefore a bit noisy.
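
Benchmark names encode dataShards x parityShards x shardSize, so BenchmarkEncode10x2x10000 encodes 10 data shards into 2 parity shards of 10,000 bytes each. A minimal sketch of how such a benchmark is shaped (the benchmarkEncode helper is hypothetical, written against the package's New/Encode API; SetBytes counts data bytes only, matching the MB/s table):

package reedsolomon

import "testing"

func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) {
	enc, err := New(dataShards, parityShards)
	if err != nil {
		b.Fatal(err)
	}
	// One buffer per shard; Encode fills the trailing parity shards.
	shards := make([][]byte, dataShards+parityShards)
	for i := range shards {
		shards[i] = make([]byte, shardSize)
		for j := range shards[i] {
			shards[i][j] = byte(i + j) // deterministic test data
		}
	}
	b.SetBytes(int64(dataShards * shardSize))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		if err := enc.Encode(shards); err != nil {
			b.Fatal(err)
		}
	}
}

func BenchmarkEncode10x2x10000(b *testing.B) { benchmarkEncode(b, 10, 2, 10000) }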

benchmark                             old ns/op     new ns/op     delta
BenchmarkEncode10x2x10000-8           58372         47421         -18.76%
BenchmarkEncode100x20x10000-8         2635444       1550511       -41.17%
BenchmarkEncode17x3x1M-8              3885495       2231034       -42.58%
BenchmarkEncode10x4x16M-8             24180221      21467661      -11.22%
BenchmarkEncode5x2x1M-8               2395287       2261452       -5.59%
BenchmarkEncode10x2x1M-8              2571278       2566560       -0.18%
BenchmarkEncode10x4x1M-8              3396774       3431916       +1.03%
BenchmarkEncode50x20x1M-8             27004601      20325731      -24.73%
BenchmarkEncode17x3x16M-8             29671393      23668596      -20.23%
BenchmarkVerify10x2x10000-8           109730        101519        -7.48%
BenchmarkVerify50x5x50000-8           3904166       3101568       -20.56%
BenchmarkVerify10x2x1M-8              4398490       4721719       +7.35%
BenchmarkVerify5x2x1M-8               3174574       3296440       +3.84%
BenchmarkVerify10x4x1M-8              5247394       5346667       +1.89%
BenchmarkVerify50x20x1M-8             35742777      26154681      -26.83%
BenchmarkVerify10x4x16M-8             52873512      54931253      +3.89%

benchmark                             old MB/s     new MB/s     speedup
BenchmarkEncode10x2x10000-8           1713.14      2108.73      1.23x
BenchmarkEncode100x20x10000-8         379.44       644.95       1.70x
BenchmarkEncode17x3x1M-8              4587.78      7989.92      1.74x
BenchmarkEncode10x4x16M-8             6938.40      7815.11      1.13x
BenchmarkEncode5x2x1M-8               2188.83      2318.37      1.06x
BenchmarkEncode10x2x1M-8              4078.03      4085.53      1.00x
BenchmarkEncode10x4x1M-8              3086.98      3055.37      0.99x
BenchmarkEncode50x20x1M-8             1941.48      2579.43      1.33x
BenchmarkEncode17x3x16M-8             9612.38      12050.26     1.25x
BenchmarkVerify10x2x10000-8           911.32       985.03       1.08x
BenchmarkVerify50x5x50000-8           1280.68      1612.09      1.26x
BenchmarkVerify10x2x1M-8              2383.94      2220.75      0.93x
BenchmarkVerify5x2x1M-8               1651.52      1590.47      0.96x
BenchmarkVerify10x4x1M-8              1998.28      1961.18      0.98x
BenchmarkVerify50x20x1M-8             1466.84      2004.57      1.37x
BenchmarkVerify10x4x16M-8             3173.09      3054.22      0.96x
Branch: master
klauspost committed 2015-12-07 13:40:57 +01:00
Parent: eef4ac6aa3
Commit: 627f48f59e
2 changed files with 119 additions and 6 deletions
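
Both the SSSE3 and the new AVX2 routines implement the same shuffle trick: the product c*b in GF(2^8) is precomputed into two 16-entry nibble tables, and mulTableLow[c][b&0xf] ^ mulTableHigh[c][b>>4] reconstructs it, so one (V)PSHUFB per table multiplies 16 (SSSE3) or 32 (AVX2) input bytes at once. Per byte, the routines compute the equivalent of this sketch (the name galMulGeneric is illustrative, modeled on the commented-out Go in the diff below):

// low and high are mulTableLow[c][:] and mulTableHigh[c][:].
// The SIMD routines apply the same two lookups to a whole
// register of bytes with (V)PSHUFB instead of one at a time.
func galMulGeneric(low, high, in, out []byte) {
	for n, input := range in {
		l := input & 0xf // low nibble indexes the low table
		h := input >> 4  // high nibble indexes the high table
		out[n] = low[l] ^ high[h]
	}
}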


@@ -9,9 +9,18 @@ import (
"github.com/klauspost/cpuid"
)
//go:noescape
func galMulSSSE3(low, high, in, out []byte)
//go:noescape
func galMulSSSE3Xor(low, high, in, out []byte)
//go:noescape
func galMulAVX2Xor(low, high, in, out []byte)
//go:noescape
func galMulAVX2(low, high, in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSSE3(low, high, in, out []byte) {
@@ -33,7 +42,10 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
func galMulSlice(c byte, in, out []byte) {
var done int
if cpuid.CPU.AVX2() {
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 5) << 5
} else if cpuid.CPU.SSSE3() {
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}
@@ -48,7 +60,10 @@ func galMulSlice(c byte, in, out []byte) {
func galMulSliceXor(c byte, in, out []byte) {
var done int
if cpuid.CPU.AVX2() {
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 5) << 5
} else if cpuid.CPU.SSSE3() {
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}
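
In both dispatchers, done is rounded down to the SIMD block size (a multiple of 32 bytes for AVX2, 16 for SSSE3), and both hunks are cut off before the scalar loop that finishes the remainder. A sketch of that tail as it would continue inside galMulSlice (assuming the package's full 256x256 mulTable; the Xor variant would use out[i] ^= mt[in[i]]):

	// Finish the bytes the SIMD loop did not cover.
	remain := len(in) - done
	if remain > 0 {
		mt := mulTable[c]
		for i := done; i < len(in); i++ {
			out[i] = mt[in[i]]
		}
	}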


@@ -13,7 +13,7 @@ TEXT ·galMulSSSE3Xor(SB), 7, $0
MOVOU (DX), X7 // X7: high
MOVQ $15, BX // BX: low mask
MOVQ BX, X8
PXOR X5, X5
MOVQ in+48(FP), SI // SI: &in
MOVQ in_len+56(FP), R9 // R9: len(in)
MOVQ out+72(FP), DX // DX: &out
@@ -36,7 +36,7 @@ loopback_xor:
PXOR X4, X3 // X3: Result xor existing out
MOVOU X3, (DX) // Store
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9
JNZ loopback_xor
done_xor:
@@ -50,7 +50,7 @@ TEXT ·galMulSSSE3(SB), 7, $0
MOVOU (DX), X7 // X7: high
MOVQ $15, BX // BX: low mask
MOVQ BX, X8
PXOR X5, X5
MOVQ in+48(FP), SI // SI: &in
MOVQ in_len+56(FP), R9 // R9: len(in)
MOVQ out+72(FP), DX // DX: &out
@@ -71,10 +71,108 @@ loopback:
PXOR X2, X3 // X3: Result
MOVOU X3, (DX) // Store
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9
JNZ loopback
done:
RET
// func galMulAVX2Xor(low, high, in, out []byte)
TEXT ·galMulAVX2Xor(SB), 7, $0
MOVQ low+0(FP), SI // SI: &low
MOVQ high+24(FP), DX // DX: &high
MOVQ $15, BX // BX: low mask
MOVQ BX, X5
MOVOU (SI), X6 // X6: low
MOVOU (DX), X7 // X7: high
MOVQ in_len+56(FP), R9 // R9: len(in)
/*
YASM:
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
*/
BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01 // VINSERTI128 YMM6, YMM6, XMM6, 1
BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01 // VINSERTI128 YMM7, YMM7, XMM7, 1
BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5
SHRQ $5, R9 // len(in) /32
MOVQ out+72(FP), DX // DX: &out
MOVQ in+48(FP), SI // SI: &in
TESTQ R9, R9
JZ done_xor_avx2
loopback_xor_avx2:
/* Yasm:
VMOVDQU YMM0, [rsi]
VMOVDQU YMM4, [rdx]
VPSRLQ YMM1, YMM0, 4 ; X1: high input
VPAND YMM0, YMM0, YMM8 ; X0: low input
VPAND YMM1, YMM1, YMM8 ; X1: high input
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part
VPXOR YMM3, YMM2, YMM3 ; X3: Result
VPXOR YMM4, YMM3, YMM4 ; X4: Result
VMOVDQU [rdx], YMM4
*/
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06 // VMOVDQU YMM0, [rsi]
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x22 // VMOVDQU YMM4, [rdx]
BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4
BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8
BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8
BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0
BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1
BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xdb // VPXOR YMM3, YMM2, YMM3
BYTE $0xc5; BYTE $0xe5; BYTE $0xef; BYTE $0xe4 // VPXOR YMM4, YMM3, YMM4
BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22 // VMOVDQU [rdx], YMM4
ADDQ $32, SI // in+=32
ADDQ $32, DX // out+=32
SUBQ $1, R9
JNZ loopback_xor_avx2
done_xor_avx2:
// VZEROUPPER
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
RET
// func galMulAVX2(low, high, in, out []byte)
TEXT ·galMulAVX2(SB), 7, $0
MOVQ low+0(FP), SI // SI: &low
MOVQ high+24(FP), DX // DX: &high
MOVQ $15, BX // BX: low mask
MOVQ BX, X5
MOVOU (SI), X6 // X6: low
MOVOU (DX), X7 // X7: high
MOVQ in_len+56(FP), R9 // R9: len(in)
/*
YASM:
VINSERTI128 YMM6, YMM6, XMM6, 1 ; low
VINSERTI128 YMM7, YMM7, XMM7, 1 ; high
VPBROADCASTB YMM8, XMM5 ; X8: lomask (unpacked)
*/
BYTE $0xc4; BYTE $0xe3; BYTE $0x4d; BYTE $0x38; BYTE $0xf6; BYTE $0x01 // VINSERTI128 YMM6, YMM6, XMM6, 1
BYTE $0xc4; BYTE $0xe3; BYTE $0x45; BYTE $0x38; BYTE $0xff; BYTE $0x01 // VINSERTI128 YMM7, YMM7, XMM7, 1
BYTE $0xc4; BYTE $0x62; BYTE $0x7d; BYTE $0x78; BYTE $0xc5 // VPBROADCASTB YMM8, XMM5
SHRQ $5, R9 // len(in) /32
MOVQ out+72(FP), DX // DX: &out
MOVQ in+48(FP), SI // SI: &in
TESTQ R9, R9
JZ done_avx2
loopback_avx2:
/* Yasm:
VMOVDQU YMM0, [rsi]
VPSRLQ YMM1, YMM0, 4 ; X1: high input
VPAND YMM0, YMM0, YMM8 ; X0: low input
VPAND YMM1, YMM1, YMM8 ; X1: high input
VPSHUFB YMM2, YMM6, YMM0 ; X2: mul low part
VPSHUFB YMM3, YMM7, YMM1 ; X2: mul high part
VPXOR YMM4, YMM2, YMM3 ; X4: Result
VMOVDQU [rdx], YMM4
*/
BYTE $0xc5; BYTE $0xfe; BYTE $0x6f; BYTE $0x06 // VMOVDQU YMM0, [rsi]
BYTE $0xc5; BYTE $0xf5; BYTE $0x73; BYTE $0xd0; BYTE $0x04 // VPSRLQ YMM1, YMM0, 4
BYTE $0xc4; BYTE $0xc1; BYTE $0x7d; BYTE $0xdb; BYTE $0xc0 // VPAND YMM0, YMM0, YMM8
BYTE $0xc4; BYTE $0xc1; BYTE $0x75; BYTE $0xdb; BYTE $0xc8 // VPAND YMM1, YMM1, YMM8
BYTE $0xc4; BYTE $0xe2; BYTE $0x4d; BYTE $0x00; BYTE $0xd0 // VPSHUFB YMM2, YMM6, YMM0
BYTE $0xc4; BYTE $0xe2; BYTE $0x45; BYTE $0x00; BYTE $0xd9 // VPSHUFB YMM3, YMM7, YMM1
BYTE $0xc5; BYTE $0xed; BYTE $0xef; BYTE $0xe3 // VPXOR YMM4, YMM2, YMM3
BYTE $0xc5; BYTE $0xfe; BYTE $0x7f; BYTE $0x22 // VMOVDQU [rdx], YMM4
ADDQ $32, SI // in+=32
ADDQ $32, DX // out+=32
SUBQ $1, R9
JNZ loopback_avx2
done_avx2:
// VZEROUPPER
BYTE $0xc5;BYTE $0xf8;BYTE $0x77;
RET
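
Hand-encoded BYTE sequences are easy to get subtly wrong, so it is worth cross-checking the SIMD paths against the plain table lookups. A hypothetical in-package test (using this commit's galMulSlice signature and the assumed 256x256 mulTable), with sizes chosen to exercise both the 32-byte AVX2 blocks and the scalar tail:

package reedsolomon

import "testing"

func TestGalMulSliceMatchesTable(t *testing.T) {
	for _, size := range []int{15, 16, 31, 32, 33, 1000, 1024} {
		in := make([]byte, size)
		for i := range in {
			in[i] = byte(i * 7)
		}
		for c := 0; c < 256; c++ {
			out := make([]byte, size)
			galMulSlice(byte(c), in, out) // dispatches to AVX2/SSSE3 when available
			mt := mulTable[c]
			for i := range in {
				if out[i] != mt[in[i]] {
					t.Fatalf("c=%d size=%d i=%d: got %#x, want %#x",
						c, size, i, out[i], mt[in[i]])
				}
			}
		}
	}
}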