Faster AVX2 encoding (#153)

* Remove 50% of bounds checks when copying.
* Use RIP-only addressing, freeing one register.

```
benchmark                                 old MB/s      new MB/s      speedup
BenchmarkGalois128K-32                    57663.49      58005.87      1.01x
BenchmarkGalois1M-32                      49479.31      49848.29      1.01x
BenchmarkGaloisXor128K-32                 46310.69      46501.88      1.00x
BenchmarkGaloisXor1M-32                   43804.86      43984.39      1.00x
BenchmarkEncode10x2x10000-32              25926.93      27457.75      1.06x
BenchmarkEncode100x20x10000-32            2635.82       2818.95       1.07x
BenchmarkEncode17x3x1M-32                 63215.11      61576.76      0.97x
BenchmarkEncode10x4x16M-32                19551.54      19505.07      1.00x
BenchmarkEncode5x2x1M-32                  79612.06      81985.14      1.03x
BenchmarkEncode10x2x1M-32                 121478.29     127739.41     1.05x
BenchmarkEncode10x4x1M-32                 70757.61      74423.67      1.05x
BenchmarkEncode50x20x1M-32                19811.96      20103.32      1.01x
BenchmarkEncode17x3x16M-32                27202.10      27825.34      1.02x
BenchmarkEncode_8x4x8M-32                 19029.04      19701.31      1.04x
BenchmarkEncode_12x4x12M-32               22449.87      22480.51      1.00x
BenchmarkEncode_16x4x16M-32               24536.74      24672.24      1.01x
BenchmarkEncode_16x4x32M-32               24381.34      24981.99      1.02x
BenchmarkEncode_16x4x64M-32               24717.69      25086.94      1.01x
BenchmarkEncode_8x5x8M-32                 16763.51      17154.04      1.02x
BenchmarkEncode_8x6x8M-32                 15067.22      15205.87      1.01x
BenchmarkEncode_8x7x8M-32                 13156.38      13589.40      1.03x
BenchmarkEncode_8x9x8M-32                 11363.74      11523.70      1.01x
BenchmarkEncode_8x10x8M-32                10359.37      10474.91      1.01x
BenchmarkEncode_8x11x8M-32                9627.07       9463.24       0.98x
BenchmarkEncode_8x8x05M-32                30104.80      32634.89      1.08x
BenchmarkEncode_8x8x1M-32                 36497.28      36425.88      1.00x
BenchmarkEncode_8x8x8M-32                 12186.19      11602.41      0.95x
BenchmarkEncode_8x8x32M-32                11670.72      11413.71      0.98x
BenchmarkEncode_24x8x24M-32               21709.83      21652.50      1.00x
BenchmarkEncode_24x8x48M-32               22494.40      22280.59      0.99x
BenchmarkVerify10x2x10000-32              10567.56      10483.91      0.99x
BenchmarkVerify50x5x50000-32              28102.84      27923.63      0.99x
BenchmarkVerify10x2x1M-32                 30298.33      30106.18      0.99x
BenchmarkVerify5x2x1M-32                  16115.91      15847.03      0.98x
BenchmarkVerify10x4x1M-32                 15382.13      14852.68      0.97x
BenchmarkVerify50x20x1M-32                8476.02       8466.24       1.00x
BenchmarkVerify10x4x16M-32                15101.03      15434.71      1.02x
BenchmarkReconstruct10x2x10000-32         26228.18      26960.19      1.03x
BenchmarkReconstruct50x5x50000-32         31091.42      30975.82      1.00x
BenchmarkReconstruct10x2x1M-32            58548.87      60281.92      1.03x
BenchmarkReconstruct5x2x1M-32             39499.23      41791.80      1.06x
BenchmarkReconstruct10x4x1M-32            41448.60      43053.15      1.04x
BenchmarkReconstruct50x20x1M-32           17185.99      17354.67      1.01x
BenchmarkReconstruct10x4x16M-32           18798.60      18847.43      1.00x
BenchmarkReconstructData10x2x10000-32     27208.48      27538.38      1.01x
BenchmarkReconstructData50x5x50000-32     32135.65      32078.91      1.00x
BenchmarkReconstructData10x2x1M-32        63180.19      67332.17      1.07x
BenchmarkReconstructData5x2x1M-32         47532.85      49932.17      1.05x
BenchmarkReconstructData10x4x1M-32        50059.14      52323.15      1.05x
BenchmarkReconstructData50x20x1M-32       26679.75      26714.11      1.00x
BenchmarkReconstructData10x4x16M-32       24854.99      24527.23      0.99x
BenchmarkReconstructP10x2x10000-32        115089.87     113229.75     0.98x
BenchmarkReconstructP10x5x20000-32        129838.75     132871.10     1.02x
BenchmarkParallel_8x8x64K-32              69951.43      69980.44      1.00x
BenchmarkParallel_8x8x05M-32              11752.94      11724.35      1.00x
BenchmarkParallel_20x10x05M-32            18553.93      18613.33      1.00x
BenchmarkParallel_8x8x1M-32               11639.19      11746.86      1.01x
BenchmarkParallel_8x8x8M-32               11799.36      11685.63      0.99x
BenchmarkParallel_8x8x32M-32              11510.94      11791.72      1.02x
BenchmarkParallel_8x3x1M-32               20268.92      20678.21      1.02x
BenchmarkParallel_8x4x1M-32               17616.05      17856.17      1.01x
BenchmarkParallel_8x5x1M-32               15590.87      15872.42      1.02x
BenchmarkStreamEncode10x2x10000-32        14917.08      15408.39      1.03x
BenchmarkStreamEncode100x20x10000-32      2014.81       2077.31       1.03x
BenchmarkStreamEncode17x3x1M-32           11839.37      12434.80      1.05x
BenchmarkStreamEncode10x4x16M-32          9151.14       9206.98       1.01x
BenchmarkStreamEncode5x2x1M-32            13598.55      13663.56      1.00x
BenchmarkStreamEncode10x2x1M-32           13192.91      13453.41      1.02x
BenchmarkStreamEncode10x4x1M-32           12109.90      12050.68      1.00x
BenchmarkStreamEncode50x20x1M-32          8640.73       8370.10       0.97x
BenchmarkStreamEncode17x3x16M-32          10473.17      10527.04      1.01x
BenchmarkStreamVerify10x2x10000-32        7032.23       7128.82       1.01x
BenchmarkStreamVerify50x5x50000-32        13023.46      13109.31      1.01x
BenchmarkStreamVerify10x2x1M-32           11941.63      11949.91      1.00x
BenchmarkStreamVerify5x2x1M-32            8029.93       8263.39       1.03x
BenchmarkStreamVerify10x4x1M-32           8137.82       8271.11       1.02x
BenchmarkStreamVerify50x20x1M-32          7378.87       7708.81       1.04x
BenchmarkStreamVerify10x4x16M-32          8973.18       8955.29       1.00x
```
master
Klaus Post 2020-11-10 05:39:23 -08:00 committed by GitHub
parent 04d4482b55
commit 653e76aa26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 4226 additions and 2450 deletions

View File

@ -917,12 +917,14 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte
for i, row := range matrixRows[:outputs] {
for j, idx := range row[:inputs] {
dstIdx := (j*outputs + i) * 64
dstPart := dst[dstIdx:]
dstPart = dstPart[:64]
lo := mulTableLow[idx][:]
hi := mulTableHigh[idx][:]
copy(dst[dstIdx:], lo)
copy(dst[dstIdx+16:], lo)
copy(dst[dstIdx+32:], hi)
copy(dst[dstIdx+48:], hi)
copy(dstPart[:16], lo)
copy(dstPart[16:32], lo)
copy(dstPart[32:48], hi)
copy(dstPart[48:64], hi)
}
}
return dst

File diff suppressed because it is too large Load Diff

95
gen.go
View File

@ -26,6 +26,11 @@ var switchDefsX [inputMax][outputMax]string
const perLoopBits = 5
const perLoop = 1 << perLoopBits
// Prefetch offsets, set to 0 to disable.
// Disabled since they appear to be consistently slower.
const prefetchSrc = 0
const prefetchDst = 0
func main() {
Constraint(buildtags.Not("appengine").ToConstraint())
Constraint(buildtags.Not("noasm").ToConstraint())
@ -98,6 +103,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
var loadNone bool
// Use registers for destination registers.
var regDst = true
var reloadLength = false
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
est := total*2 + outputs + 5
@ -109,9 +115,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
if est > 16 {
loadNone = true
// We run out of GP registers first, now.
if inputs+outputs > 12 {
if inputs+outputs > 13 {
regDst = false
}
// Save one register by reloading length.
if inputs+outputs > 12 && regDst {
reloadLength = true
}
}
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
@ -127,6 +137,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
// loadNone == false
Comment("Loading all tables to registers")
}
if regDst {
Comment("Destination kept in GP registers")
} else {
Comment("Destination kept on stack")
}
Doc(doc...)
Pragma("noescape")
@ -139,21 +154,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
TESTQ(length, length)
JZ(LabelRef(name + "_end"))
dst := make([]reg.VecVirtual, outputs)
dstPtr := make([]reg.GPVirtual, outputs)
outBase := Param("out").Base().MustAddr()
outSlicePtr := GP64()
MOVQ(outBase, outSlicePtr)
for i := range dst {
dst[i] = YMM()
if !regDst {
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
dstPtr[i] = ptr
}
inLo := make([]reg.VecVirtual, total)
inHi := make([]reg.VecVirtual, total)
@ -177,6 +177,36 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr)
inPtrs[i] = ptr
}
// Destination
dst := make([]reg.VecVirtual, outputs)
dstPtr := make([]reg.GPVirtual, outputs)
outBase := Param("out").Base().MustAddr()
outSlicePtr := GP64()
MOVQ(outBase, outSlicePtr)
for i := range dst {
dst[i] = YMM()
if !regDst {
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
dstPtr[i] = ptr
}
offset := GP64()
MOVQ(Param("start").MustAddr(), offset)
if regDst {
Comment("Add start offset to output")
for _, ptr := range dstPtr {
ADDQ(offset, ptr)
}
}
Comment("Add start offset to input")
for _, ptr := range inPtrs {
ADDQ(offset, ptr)
}
// Offset no longer needed unless not regdst
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
@ -184,8 +214,10 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
MOVQ(tmpMask, lowMask.AsX())
VPBROADCASTB(lowMask.AsX(), lowMask)
offset := GP64()
MOVQ(Param("start").MustAddr(), offset)
if reloadLength {
length = Load(Param("n"), GP64())
SHRQ(U8(perLoopBits), length)
}
Label(name + "_loop")
if xor {
Commentf("Load %d outputs", outputs)
@ -195,12 +227,18 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
for i := range dst {
if xor {
if regDst {
VMOVDQU(Mem{Base: dstPtr[i], Index: offset, Scale: 1}, dst[i])
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
}
continue
}
ptr := GP64()
MOVQ(outBase, ptr)
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
} else {
VPXOR(dst[i], dst[i], dst[i])
}
@ -210,7 +248,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
inLow, inHigh := YMM(), YMM()
for i := range inPtrs {
Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs)
VMOVDQU(Mem{Base: inPtrs[i], Index: offset, Scale: 1}, inLow)
VMOVDQU(Mem{Base: inPtrs[i]}, inLow)
if prefetchSrc > 0 {
PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc})
}
ADDQ(U8(perLoop), inPtrs[i])
VPSRLQ(U8(4), inLow, inHigh)
VPAND(lowMask, inLow, inLow)
VPAND(lowMask, inHigh, inHigh)
@ -231,15 +273,24 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
Commentf("Store %d outputs", outputs)
for i := range dst {
if regDst {
VMOVDQU(dst[i], Mem{Base: dstPtr[i], Index: offset, Scale: 1})
VMOVDQU(dst[i], Mem{Base: dstPtr[i]})
if prefetchDst > 0 && !xor {
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
}
ADDQ(U8(perLoop), dstPtr[i])
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1})
if prefetchDst > 0 && !xor {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
}
Comment("Prepare for next loop")
ADDQ(U8(perLoop), offset)
if !regDst {
ADDQ(U8(perLoop), offset)
}
DECQ(length)
JNZ(LabelRef(name + "_loop"))
VZEROUPPER()

View File

@ -520,7 +520,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
if end > len(inputs[0]) {
end = len(inputs[0])
}
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs)+len(outputs) >= 4 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
r.mPool.Put(m)
@ -550,18 +550,23 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
// several goroutines.
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
var wg sync.WaitGroup
do := byteCount / r.o.maxGoroutines
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
gor := r.o.maxGoroutines
var avx2Matrix []byte
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
useAvx2 := avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs)+len(outputs) >= 4 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs
if useAvx2 {
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
defer r.mPool.Put(avx2Matrix)
}
do := byteCount / gor
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
for start < byteCount {
if start+do > byteCount {
do = byteCount - start
@ -569,7 +574,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
wg.Add(1)
go func(start, stop int) {
if avx2CodeGen && r.o.useAVX2 && stop-start >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
if useAvx2 && stop-start >= 32 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
}

View File

@ -646,7 +646,8 @@ func testVerify(t *testing.T, o ...Option) {
t.Fatal(err)
}
if !ok {
t.Fatal("Verification failed")
t.Error("Verification failed")
return
}
// Put in random data. Verification should fail