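Add AMD64 SSE3 Galois multiplication (the assembler routines rely on PSHUFB, which is an SSSE3 instruction). Typically 5-10x faster; the benchmarks below range from 3.41x to 17.90x.

Columns: benchmark name, throughput before, throughput after (higher is better), speedup.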
BenchmarkEncode10x2x10000         333.31       5827.17      17.48x
BenchmarkEncode10x2x10000-2       431.20       2802.53      6.50x
BenchmarkEncode10x2x10000-4       553.98       2432.95      4.39x
BenchmarkEncode10x2x10000-8       585.79       3469.61      5.92x
BenchmarkEncode100x20x10000       32.59        583.40       17.90x
BenchmarkEncode100x20x10000-2     59.52        726.70       12.21x
BenchmarkEncode100x20x10000-4     108.04       1363.25      12.62x
BenchmarkEncode100x20x10000-8     113.76       1274.62      11.20x
BenchmarkEncode17x3x1M            215.28       3141.85      14.59x
BenchmarkEncode17x3x1M-2          398.76       3650.12      9.15x
BenchmarkEncode17x3x1M-4          655.32       6071.11      9.26x
BenchmarkEncode17x3x1M-8          832.16       6616.47      7.95x
BenchmarkEncode10x4x16M           154.48       1357.30      8.79x
BenchmarkEncode10x4x16M-2         295.62       2377.92      8.04x
BenchmarkEncode10x4x16M-4         529.89       3519.49      6.64x
BenchmarkEncode10x4x16M-8         632.11       4521.90      7.15x
BenchmarkEncode5x2x1M             327.87       4879.09      14.88x
BenchmarkEncode5x2x1M-2           576.11       2599.20      4.51x
BenchmarkEncode5x2x1M-4           1043.65      3559.12      3.41x
BenchmarkEncode5x2x1M-8           1227.77      4255.34      3.47x
BenchmarkEncode10x2x1M            321.24       4574.68      14.24x
BenchmarkEncode10x2x1M-2          587.73       3100.28      5.28x
BenchmarkEncode10x2x1M-4          1101.96      4770.32      4.33x
BenchmarkEncode10x2x1M-8          1217.08      5812.17      4.78x
BenchmarkEncode10x4x1M            155.34       2037.27      13.11x
BenchmarkEncode10x4x1M-2          298.38       2470.97      8.28x
BenchmarkEncode10x4x1M-4          548.67       3603.15      6.57x
BenchmarkEncode10x4x1M-8          625.23       4827.42      7.72x
BenchmarkEncode50x20x1M           31.37        347.65       11.08x
BenchmarkEncode50x20x1M-2         59.81        713.28       11.93x
BenchmarkEncode50x20x1M-4         105.34       1175.47      11.16x
BenchmarkEncode50x20x1M-8         123.84       1491.91      12.05x
BenchmarkEncode17x3x16M           209.55       1861.59      8.88x
BenchmarkEncode17x3x16M-2         394.19       3331.73      8.45x
BenchmarkEncode17x3x16M-4         643.30       4942.74      7.68x
BenchmarkEncode17x3x16M-8         839.64       6213.43      7.40x
master
Klaus Post 2015-06-21 21:23:22 +02:00
parent 619e2b7d65
commit 5aa37c3492
4 changed files with 165 additions and 53 deletions

61
galois_amd64.go Normal file
View File

@ -0,0 +1,61 @@
//+build !noasm
// Copyright 2015, Klaus Post, see LICENSE for details.
package reedsolomon
import (
"github.com/klauspost/cpuid"
)
// galMulSSE3 is implemented in assembler (galois_amd64.s).
// It reads the first 16 bytes of low and of high as lookup tables, indexed by
// the low and the high nibble of each input byte respectively, and writes
// low[in[i]&0xf] ^ high[in[i]>>4] to out[i].
// Only the whole 16-byte blocks of in are processed (len(in)/16 of them);
// the trailing len(in)%16 bytes are left untouched and must be handled by the
// caller. The assembler does not check len(out).
func galMulSSE3(low, high, in, out []byte)
// galMulSSE3Xor is implemented in assembler (galois_amd64.s).
// It works like galMulSSE3, but XORs the result into the existing content of
// out instead of overwriting it.
func galMulSSE3Xor(low, high, in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSE3(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] = low[l] ^ high[h]
}
}
func galMulSSE3Xor(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] ^= low[l] ^ high[h]
}
}
*/
// galMulSlice multiplies every byte of in by the Galois field constant c and
// stores the products in out, i.e. out[i] = mulTable[c][in[i]].
//
// When the CPU supports SSSE3, the whole 16-byte blocks of in are handled by
// the assembler routine galMulSSE3; the trailing len(in)%16 bytes (or all of
// them when SSSE3 is unavailable) are processed with the mulTable lookup.
//
// out must be at least as long as in; the function panics otherwise.
func galMulSlice(c byte, in, out []byte) {
	if len(in) == 0 {
		return
	}
	// The assembler performs no bounds checks, so make sure out can hold
	// every result instead of silently writing past its end.
	_ = out[len(in)-1]

	var done int
	// The assembler relies on PSHUFB, which is an SSSE3 instruction. Testing
	// for plain SSE3 would fault with an illegal instruction on CPUs that
	// have SSE3 but lack SSSE3.
	if cpuid.CPU.SSSE3() {
		galMulSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
		// The assembler only processes whole 16-byte blocks.
		done = (len(in) >> 4) << 4
	}
	if done < len(in) {
		mt := mulTable[c]
		for i := done; i < len(in); i++ {
			out[i] = mt[in[i]]
		}
	}
}
// galMulSliceXor multiplies every byte of in by the Galois field constant c
// and XORs the products into out, i.e. out[i] ^= mulTable[c][in[i]].
//
// When the CPU supports SSSE3, the whole 16-byte blocks of in are handled by
// the assembler routine galMulSSE3Xor; the trailing len(in)%16 bytes (or all
// of them when SSSE3 is unavailable) are processed with the mulTable lookup.
//
// out must be at least as long as in; the function panics otherwise.
func galMulSliceXor(c byte, in, out []byte) {
	if len(in) == 0 {
		return
	}
	// The assembler performs no bounds checks, so make sure out covers all of
	// in instead of silently reading and writing past its end.
	_ = out[len(in)-1]

	var done int
	// The assembler relies on PSHUFB, which is an SSSE3 instruction. Testing
	// for plain SSE3 would fault with an illegal instruction on CPUs that
	// have SSE3 but lack SSSE3.
	if cpuid.CPU.SSSE3() {
		galMulSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
		// The assembler only processes whole 16-byte blocks.
		done = (len(in) >> 4) << 4
	}
	if done < len(in) {
		mt := mulTable[c]
		for i := done; i < len(in); i++ {
			out[i] ^= mt[in[i]]
		}
	}
}

80
galois_amd64.s Normal file
View File

@ -0,0 +1,80 @@
//+build !noasm
// Copyright 2015, Klaus Post, see LICENSE for details.
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master
// func galMulSSE3Xor(low, high, in, out []byte)
//
// For every whole 16-byte block of in, computes
//   out[i] ^= low[in[i] & 0xf] ^ high[in[i] >> 4]
// using PSHUFB as a 16-entry parallel table lookup. The trailing
// len(in) % 16 bytes are not processed, and len(out) is never checked.
// NOTE: PSHUFB is an SSSE3 instruction, so despite the name the caller must
// verify SSSE3 support, not just SSE3.
// Each slice argument occupies 24 bytes on the frame (pointer, len, cap):
// low at 0, high at 24, in at 48, out at 72.
TEXT ·galMulSSE3Xor(SB), 7, $0
MOVQ low+0(FP),SI // SI: &low
MOVQ high+24(FP),DX // DX: &high
MOVOU (SI), X6 // X6: low table (16 bytes)
MOVOU (DX), X7 // X7: high table (16 bytes)
MOVQ $15, BX // BX: low nibble mask (0x0f)
MOVQ BX, X8 // X8: 0x0f in the lowest byte
PXOR X5, X5 // X5: all zero, used as shuffle index below
MOVQ in+48(FP),SI // SI: &in (SI is reused, low is already loaded)
MOVQ in_len+56(FP),R9 // R9: len(in)
MOVQ out+72(FP), DX // DX: &out (DX is reused, high is already loaded)
PSHUFB X5, X8 // X8: lomask, byte 0 (0x0f) broadcast to all 16 bytes
SHRQ $4, R9 // R9: len(in) / 16, number of whole blocks
CMPQ R9 ,$0
JEQ done_xor // nothing to do for fewer than 16 bytes
loopback_xor:
MOVOU (SI),X0 // X0: in[x], 16 input bytes
MOVOU (DX),X4 // X4: out[x], existing output
MOVOU X0, X1 // X1: copy of in[x]
MOVOU X6, X2 // X2: low table copy (PSHUFB overwrites its destination)
MOVOU X7, X3 // X3: high table copy
PSRLQ $4, X1 // X1: input shifted right 4 bits per 64-bit lane; bits leak across bytes
PAND X8, X0 // X0: low nibble of each input byte
PAND X8, X1 // X1: high nibble of each input byte (leaked bits masked off)
PSHUFB X0, X2 // X2: low[low nibble] for each byte
PSHUFB X1, X3 // X3: high[high nibble] for each byte
PXOR X2, X3 // X3: Result
PXOR X4, X3 // X3: Result xor existing out
MOVOU X3, (DX) // Store
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9 // one block fewer remaining
JNZ loopback_xor
done_xor:
RET ,
// func galMulSSE3(low, high, in, out []byte)
//
// For every whole 16-byte block of in, computes
//   out[i] = low[in[i] & 0xf] ^ high[in[i] >> 4]
// using PSHUFB as a 16-entry parallel table lookup. The trailing
// len(in) % 16 bytes are not processed, and len(out) is never checked.
// NOTE: PSHUFB is an SSSE3 instruction, so despite the name the caller must
// verify SSSE3 support, not just SSE3.
// Each slice argument occupies 24 bytes on the frame (pointer, len, cap):
// low at 0, high at 24, in at 48, out at 72.
TEXT ·galMulSSE3(SB), 7, $0
MOVQ low+0(FP),SI // SI: &low
MOVQ high+24(FP),DX // DX: &high
MOVOU (SI), X6 // X6: low table (16 bytes)
MOVOU (DX), X7 // X7: high table (16 bytes)
MOVQ $15, BX // BX: low nibble mask (0x0f)
MOVQ BX, X8 // X8: 0x0f in the lowest byte
PXOR X5, X5 // X5: all zero, used as shuffle index below
MOVQ in+48(FP),SI // SI: &in (SI is reused, low is already loaded)
MOVQ in_len+56(FP),R9 // R9: len(in)
MOVQ out+72(FP), DX // DX: &out (DX is reused, high is already loaded)
PSHUFB X5, X8 // X8: lomask, byte 0 (0x0f) broadcast to all 16 bytes
SHRQ $4, R9 // R9: len(in) / 16, number of whole blocks
CMPQ R9 ,$0
JEQ done // nothing to do for fewer than 16 bytes
loopback:
MOVOU (SI),X0 // X0: in[x], 16 input bytes
MOVOU X0, X1 // X1: copy of in[x]
MOVOU X6, X2 // X2: low table copy (PSHUFB overwrites its destination)
MOVOU X7, X3 // X3: high table copy
PSRLQ $4, X1 // X1: input shifted right 4 bits per 64-bit lane; bits leak across bytes
PAND X8, X0 // X0: low nibble of each input byte
PAND X8, X1 // X1: high nibble of each input byte (leaked bits masked off)
PSHUFB X0, X2 // X2: low[low nibble] for each byte
PSHUFB X1, X3 // X3: high[high nibble] for each byte
PXOR X2, X3 // X3: Result
MOVOU X3, (DX) // Store, overwriting the existing out
ADDQ $16, SI // in+=16
ADDQ $16, DX // out+=16
SUBQ $1, R9 // one block fewer remaining
JNZ loopback
done:
RET ,

19
galois_noasm.go Normal file
View File

@ -0,0 +1,19 @@
//+build !amd64 noasm
// Copyright 2015, Klaus Post, see LICENSE for details.
package reedsolomon
// galMulSlice is the pure Go fallback: it looks up the product of c and each
// byte of in in mulTable and stores it at the same index of out.
func galMulSlice(c byte, in, out []byte) {
	table := mulTable[c]
	for i := 0; i < len(in); i++ {
		out[i] = table[in[i]]
	}
}
// galMulSliceXor is the pure Go fallback: it looks up the product of c and
// each byte of in in mulTable and XORs it into the same index of out.
func galMulSliceXor(c byte, in, out []byte) {
	table := mulTable[c]
	for i := 0; i < len(in); i++ {
		out[i] ^= table[in[i]]
	}
}

View File

@ -184,16 +184,10 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
for c := 0; c < r.DataShards; c++ {
in := inputs[c]
for iRow := 0; iRow < outputCount; iRow++ {
o := outputs[iRow]
mt := mulTable[matrixRows[iRow][c]]
if c == 0 {
for iByte, input := range in {
o[iByte] = mt[input]
}
galMulSlice(matrixRows[iRow][c], in, outputs[iRow])
} else {
for iByte, input := range in {
o[iByte] ^= mt[input]
}
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow])
}
}
}
@ -222,16 +216,10 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
for c := 0; c < r.DataShards; c++ {
in := inputs[c]
for iRow := 0; iRow < outputCount; iRow++ {
o := outputs[iRow]
mt := mulTable[matrixRows[iRow][c]]
if c == 0 {
for iByte := start; iByte < stop; iByte++ {
o[iByte] = mt[in[iByte]]
}
galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop])
} else {
for iByte := start; iByte < stop; iByte++ {
o[iByte] ^= mt[in[iByte]]
}
galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop])
}
}
}
@ -246,38 +234,6 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
// except this will check values and return
// as soon as a difference is found.
func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
// Always use multiple goroutines, since it returns faster.
return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
// The serial implementation below is disabled (unreachable after the
// unconditional return above) and only kept for reference.
/* if runtime.GOMAXPROCS(0) > 1 {
return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
}
outputs := make([][]byte, len(toCheck))
for i := range outputs {
outputs[i] = make([]byte, byteCount)
}
for c := 0; c < r.DataShards; c++ {
in := inputs[c]
for iRow := 0; iRow < outputCount; iRow++ {
o := outputs[iRow]
mt := mulTable[matrixRows[iRow][c]]
for iByte, input := range in {
o[iByte] ^= mt[input]
}
}
}
for i, calc := range outputs {
if bytes.Compare(calc, toCheck[i]) != 0 {
return false
}
}
return true
*/
}
// Parallel version of checkSomeShards
func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
var wg sync.WaitGroup
left := byteCount
start := 0
@ -310,11 +266,7 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
mu.RUnlock()
in := inputs[c][start : start+do]
for iRow := 0; iRow < outputCount; iRow++ {
o := outputs[iRow]
mt := mulTable[matrixRows[iRow][c]]
for iByte := 0; iByte < do; iByte++ {
o[iByte] ^= mt[in[iByte]]
}
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow])
}
}