Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster.
BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40xmaster
parent
619e2b7d65
commit
5aa37c3492
|
@ -0,0 +1,61 @@
|
|||
//+build !noasm
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
|
||||
package reedsolomon
|
||||
|
||||
import (
|
||||
"github.com/klauspost/cpuid"
|
||||
)
|
||||
|
||||
// galMulSSE3 is implemented in assembler. For every full 16-byte block of in
// it stores low[b&0xf] ^ high[b>>4] in the matching byte of out, where b is
// the input byte. Only the first 16 bytes of low and high are read, and any
// trailing len(in)%16 bytes are left untouched. len(out) is not checked, so
// out must be at least as long as in.
// NOTE: the routine is built on PSHUFB, which is an SSSE3 instruction despite
// the "SSE3" in the name.
func galMulSSE3(low, high, in, out []byte)
|
||||
// galMulSSE3Xor is implemented in assembler. For every full 16-byte block of
// in it XORs low[b&0xf] ^ high[b>>4] into the matching byte of out, where b is
// the input byte. Only the first 16 bytes of low and high are read, and any
// trailing len(in)%16 bytes are left untouched. len(out) is not checked, so
// out must be at least as long as in.
// NOTE: the routine is built on PSHUFB, which is an SSSE3 instruction despite
// the "SSE3" in the name.
func galMulSSE3Xor(low, high, in, out []byte)
|
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/*
|
||||
func galMulSSE3(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
out[n] = low[l] ^ high[h]
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSSE3Xor(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
out[n] ^= low[l] ^ high[h]
|
||||
}
|
||||
}
|
||||
*/
|
||||
|
||||
func galMulSlice(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSE3() {
|
||||
galMulSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
remain := len(in) - done
|
||||
if remain > 0 {
|
||||
mt := mulTable[c]
|
||||
for i := done; i < len(in); i++ {
|
||||
out[i] = mt[in[i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSE3() {
|
||||
galMulSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
remain := len(in) - done
|
||||
if remain > 0 {
|
||||
mt := mulTable[c]
|
||||
for i := done; i < len(in); i++ {
|
||||
out[i] ^= mt[in[i]]
|
||||
}
|
||||
}
|
||||
}
|
|
@ -0,0 +1,80 @@
|
|||
//+build !noasm
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
|
||||
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
|
||||
// and http://jerasure.org/jerasure/gf-complete/tree/master
|
||||
|
||||
// func galMulSSE3Xor(low, high, in, out []byte)
//
// For every full 16-byte block of in, computes low[b&0xf] ^ high[b>>4] for
// each input byte b and XORs the result into the matching bytes of out.
// Only len(in) is read; len(out) and the lengths of low/high are not checked.
// The trailing len(in)%16 bytes are not processed.
// Uses PSHUFB, which requires SSSE3.
// TEXT flag 7 is NOPROF|DUPOK|NOSPLIT (1|2|4 in textflag.h).
|
||||
TEXT ·galMulSSE3Xor(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVOU (SI), X6 // X6: low table (16 bytes)
|
||||
MOVOU (DX), X7 // X7: high table (16 bytes)
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X8 // X8: 0x0f in the lowest byte only
|
||||
PXOR X5, X5 // X5: all zero, used as shuffle control below
|
||||
MOVQ in+48(FP),SI // SI: &in (SI is reused, low is already loaded)
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
||||
MOVQ out+72(FP), DX // DX: &out (DX is reused, high is already loaded)
|
||||
PSHUFB X5, X8 // X8: lomask (unpacked), byte 0 broadcast to all 16 bytes
|
||||
SHRQ $4, R9 // len(in) / 16
|
||||
CMPQ R9 ,$0
|
||||
JEQ done_xor // no full 16-byte block: nothing to do
|
||||
loopback_xor:
|
||||
MOVOU (SI),X0 // in[x]
|
||||
MOVOU (DX),X4 // out[x]
|
||||
MOVOU X0, X1 // in[x]
|
||||
MOVOU X6, X2 // low copy
|
||||
MOVOU X7, X3 // high copy
|
||||
PSRLQ $4, X1 // X1: high input (64-bit shift; stray bits are masked off below)
|
||||
PAND X8, X0 // X0: low input
|
||||
PAND X8, X1 // X1: high input
|
||||
PSHUFB X0, X2 // X2: mul low part
|
||||
PSHUFB X1, X3 // X3: mul high part
|
||||
PXOR X2, X3 // X3: Result
|
||||
PXOR X4, X3 // X3: Result xor existing out
|
||||
MOVOU X3, (DX) // Store
|
||||
ADDQ $16, SI // in+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
SUBQ $1, R9 // one block less to go
|
||||
JNZ loopback_xor
|
||||
done_xor:
|
||||
RET ,
|
||||
|
||||
// func galMulSSE3(low, high, in, out []byte)
//
// For every full 16-byte block of in, computes low[b&0xf] ^ high[b>>4] for
// each input byte b and stores the result in the matching bytes of out,
// overwriting what was there.
// Only len(in) is read; len(out) and the lengths of low/high are not checked.
// The trailing len(in)%16 bytes are not processed.
// Uses PSHUFB, which requires SSSE3.
// TEXT flag 7 is NOPROF|DUPOK|NOSPLIT (1|2|4 in textflag.h).
|
||||
TEXT ·galMulSSE3(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVOU (SI), X6 // X6: low table (16 bytes)
|
||||
MOVOU (DX), X7 // X7: high table (16 bytes)
|
||||
MOVQ $15, BX // BX: low mask
|
||||
MOVQ BX, X8 // X8: 0x0f in the lowest byte only
|
||||
PXOR X5, X5 // X5: all zero, used as shuffle control below
|
||||
MOVQ in+48(FP),SI // SI: &in (SI is reused, low is already loaded)
|
||||
MOVQ in_len+56(FP),R9 // R9: len(in)
|
||||
MOVQ out+72(FP), DX // DX: &out (DX is reused, high is already loaded)
|
||||
PSHUFB X5, X8 // X8: lomask (unpacked), byte 0 broadcast to all 16 bytes
|
||||
SHRQ $4, R9 // len(in) / 16
|
||||
CMPQ R9 ,$0
|
||||
JEQ done // no full 16-byte block: nothing to do
|
||||
loopback:
|
||||
MOVOU (SI),X0 // in[x]
|
||||
MOVOU X0, X1 // in[x]
|
||||
MOVOU X6, X2 // low copy
|
||||
MOVOU X7, X3 // high copy
|
||||
PSRLQ $4, X1 // X1: high input (64-bit shift; stray bits are masked off below)
|
||||
PAND X8, X0 // X0: low input
|
||||
PAND X8, X1 // X1: high input
|
||||
PSHUFB X0, X2 // X2: mul low part
|
||||
PSHUFB X1, X3 // X3: mul high part
|
||||
PXOR X2, X3 // X3: Result
|
||||
MOVOU X3, (DX) // Store
|
||||
ADDQ $16, SI // in+=16
|
||||
ADDQ $16, DX // out+=16
|
||||
SUBQ $1, R9 // one block less to go
|
||||
JNZ loopback
|
||||
done:
|
||||
RET ,
|
||||
|
||||
|
|
@ -0,0 +1,19 @@
|
|||
//+build !amd64 noasm
|
||||
|
||||
// Copyright 2015, Klaus Post, see LICENSE for details.
|
||||
|
||||
package reedsolomon
|
||||
|
||||
func galMulSlice(c byte, in, out []byte) {
|
||||
mt := mulTable[c]
|
||||
for n, input := range in {
|
||||
out[n] = mt[input]
|
||||
}
|
||||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte) {
|
||||
mt := mulTable[c]
|
||||
for n, input := range in {
|
||||
out[n] ^= mt[input]
|
||||
}
|
||||
}
|
|
@ -184,16 +184,10 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
|
|||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
o := outputs[iRow]
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
if c == 0 {
|
||||
for iByte, input := range in {
|
||||
o[iByte] = mt[input]
|
||||
}
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow])
|
||||
} else {
|
||||
for iByte, input := range in {
|
||||
o[iByte] ^= mt[input]
|
||||
}
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -222,16 +216,10 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
o := outputs[iRow]
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
if c == 0 {
|
||||
for iByte := start; iByte < stop; iByte++ {
|
||||
o[iByte] = mt[in[iByte]]
|
||||
}
|
||||
galMulSlice(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop])
|
||||
} else {
|
||||
for iByte := start; iByte < stop; iByte++ {
|
||||
o[iByte] ^= mt[in[iByte]]
|
||||
}
|
||||
galMulSliceXor(matrixRows[iRow][c], in[start:stop], outputs[iRow][start:stop])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -246,38 +234,6 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
// except this will check values and return
|
||||
// as soon as a difference is found.
|
||||
func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
// Always use multiple goroutines, since it returns faster.
|
||||
return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
|
||||
/* if runtime.GOMAXPROCS(0) > 1 {
|
||||
return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
|
||||
}
|
||||
outputs := make([][]byte, len(toCheck))
|
||||
for i := range outputs {
|
||||
outputs[i] = make([]byte, byteCount)
|
||||
}
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
o := outputs[iRow]
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
for iByte, input := range in {
|
||||
o[iByte] ^= mt[input]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
for i, calc := range outputs {
|
||||
if bytes.Compare(calc, toCheck[i]) != 0 {
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
return true
|
||||
*/
|
||||
}
|
||||
|
||||
// Parallel version of checkSomeShards
|
||||
func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
var wg sync.WaitGroup
|
||||
left := byteCount
|
||||
start := 0
|
||||
|
@ -310,11 +266,7 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
|
|||
mu.RUnlock()
|
||||
in := inputs[c][start : start+do]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
o := outputs[iRow]
|
||||
mt := mulTable[matrixRows[iRow][c]]
|
||||
for iByte := 0; iByte < do; iByte++ {
|
||||
o[iByte] ^= mt[in[iByte]]
|
||||
}
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow])
|
||||
}
|
||||
}
|
||||
|
||||
|
|
Loading…
Reference in New Issue