avx2: Improve speed when > 10 input or output shards. (#174)
The speeds below include the effect of limiting the number of goroutines on all AVX2 paths. Before/after: ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2240 2240 +0.00% BenchmarkGalois1M-32 19578 18891 -3.51% BenchmarkGaloisXor128K-32 2798 2852 +1.93% BenchmarkGaloisXor1M-32 23334 23345 +0.05% BenchmarkEncode2x1x1M-32 34357 34370 +0.04% BenchmarkEncode10x2x10000-32 3210 3093 -3.64% BenchmarkEncode100x20x10000-32 362925 148214 -59.16% BenchmarkEncode17x3x1M-32 323767 224157 -30.77% BenchmarkEncode10x4x16M-32 8376895 8376737 -0.00% BenchmarkEncode5x2x1M-32 68365 66861 -2.20% BenchmarkEncode10x2x1M-32 101407 93023 -8.27% BenchmarkEncode10x4x1M-32 171880 155477 -9.54% BenchmarkEncode50x20x1M-32 3704691 3015047 -18.62% BenchmarkEncode17x3x16M-32 10279233 10106658 -1.68% BenchmarkEncode_8x4x8M-32 3438245 3326479 -3.25% BenchmarkEncode_12x4x12M-32 6632257 6581637 -0.76% BenchmarkEncode_16x4x16M-32 10815755 10788377 -0.25% BenchmarkEncode_16x4x32M-32 21029061 21507995 +2.28% BenchmarkEncode_16x4x64M-32 42145450 43876850 +4.11% BenchmarkEncode_8x5x8M-32 4543208 3846378 -15.34% BenchmarkEncode_8x6x8M-32 5065494 4397218 -13.19% BenchmarkEncode_8x7x8M-32 5818995 4962884 -14.71% BenchmarkEncode_8x9x8M-32 6215449 6114898 -1.62% BenchmarkEncode_8x10x8M-32 6923415 6610501 -4.52% BenchmarkEncode_8x11x8M-32 7365988 7010473 -4.83% BenchmarkEncode_8x8x05M-32 150857 136820 -9.30% BenchmarkEncode_8x8x1M-32 256722 254854 -0.73% BenchmarkEncode_8x8x8M-32 5547790 5422048 -2.27% BenchmarkEncode_8x8x32M-32 23038643 22705859 -1.44% BenchmarkEncode_24x8x24M-32 27729259 30332216 +9.39% BenchmarkEncode_24x8x48M-32 53865705 61187658 +13.59% BenchmarkVerify10x2x10000-32 8769 8154 -7.01% BenchmarkVerify10x2x1M-32 516149 476180 -7.74% BenchmarkVerify5x2x1M-32 443888 419541 -5.48% BenchmarkVerify10x4x1M-32 1030299 948021 -7.99% BenchmarkVerify50x20x1M-32 7209689 6186891 -14.19% BenchmarkVerify10x4x16M-32 17774456 17681879 -0.52% BenchmarkReconstruct10x2x10000-32 3352 3256 -2.86% 
BenchmarkReconstruct50x5x50000-32 166417 140900 -15.33% BenchmarkReconstruct10x2x1M-32 189711 174615 -7.96% BenchmarkReconstruct5x2x1M-32 128080 126520 -1.22% BenchmarkReconstruct10x4x1M-32 273312 254017 -7.06% BenchmarkReconstruct50x20x1M-32 3628812 3192474 -12.02% BenchmarkReconstruct10x4x16M-32 8562186 8781479 +2.56% BenchmarkReconstructData10x2x10000-32 3241 3116 -3.86% BenchmarkReconstructData50x5x50000-32 162520 134794 -17.06% BenchmarkReconstructData10x2x1M-32 171253 161955 -5.43% BenchmarkReconstructData5x2x1M-32 102215 106942 +4.62% BenchmarkReconstructData10x4x1M-32 225593 219969 -2.49% BenchmarkReconstructData50x20x1M-32 2515311 2129721 -15.33% BenchmarkReconstructData10x4x16M-32 6980308 6698111 -4.04% BenchmarkReconstructP10x2x10000-32 924 937 +1.35% BenchmarkReconstructP10x5x20000-32 1639 1703 +3.90% BenchmarkSplit10x4x160M-32 4984993 4898045 -1.74% BenchmarkSplit5x2x5M-32 380415 221446 -41.79% BenchmarkSplit10x2x1M-32 58761 53335 -9.23% BenchmarkSplit10x4x10M-32 643188 410959 -36.11% BenchmarkSplit50x20x50M-32 1843879 1647205 -10.67% BenchmarkSplit17x3x272M-32 3684920 3613951 -1.93% BenchmarkParallel_8x8x64K-32 7022 6630 -5.58% BenchmarkParallel_8x8x05M-32 348308 348369 +0.02% BenchmarkParallel_20x10x05M-32 575672 581028 +0.93% BenchmarkParallel_8x8x1M-32 716033 697167 -2.63% BenchmarkParallel_8x8x8M-32 5716048 5616437 -1.74% BenchmarkParallel_8x8x32M-32 22650878 22098667 -2.44% BenchmarkParallel_8x3x1M-32 406839 399125 -1.90% BenchmarkParallel_8x4x1M-32 459107 463890 +1.04% BenchmarkParallel_8x5x1M-32 527488 520334 -1.36% BenchmarkStreamEncode10x2x10000-32 6013 5878 -2.25% BenchmarkStreamEncode100x20x10000-32 503124 267894 -46.75% BenchmarkStreamEncode17x3x1M-32 1561838 1376618 -11.86% BenchmarkStreamEncode10x4x16M-32 19124427 17762582 -7.12% BenchmarkStreamEncode5x2x1M-32 429701 384666 -10.48% BenchmarkStreamEncode10x2x1M-32 801257 763637 -4.70% BenchmarkStreamEncode10x4x1M-32 876065 820744 -6.31% BenchmarkStreamEncode50x20x1M-32 7205112 6081398 
-15.60% BenchmarkStreamEncode17x3x16M-32 27182786 26117143 -3.92% BenchmarkStreamVerify10x2x10000-32 13767 14026 +1.88% BenchmarkStreamVerify50x5x50000-32 826983 690453 -16.51% BenchmarkStreamVerify10x2x1M-32 1238566 1182591 -4.52% BenchmarkStreamVerify5x2x1M-32 892661 806301 -9.67% BenchmarkStreamVerify10x4x1M-32 1676394 1631495 -2.68% BenchmarkStreamVerify50x20x1M-32 10877875 10037678 -7.72% BenchmarkStreamVerify10x4x16M-32 27599576 30435400 +10.27% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 58518.53 58510.17 1.00x BenchmarkGalois1M-32 53558.10 55507.44 1.04x BenchmarkGaloisXor128K-32 46839.74 45961.09 0.98x BenchmarkGaloisXor1M-32 44936.98 44917.46 1.00x BenchmarkEncode2x1x1M-32 91561.27 91524.11 1.00x BenchmarkEncode10x2x10000-32 37385.54 38792.54 1.04x BenchmarkEncode100x20x10000-32 3306.47 8096.40 2.45x BenchmarkEncode17x3x1M-32 64773.49 93557.14 1.44x BenchmarkEncode10x4x16M-32 28039.15 28039.68 1.00x BenchmarkEncode5x2x1M-32 107365.88 109781.16 1.02x BenchmarkEncode10x2x1M-32 124083.62 135266.27 1.09x BenchmarkEncode10x4x1M-32 85408.99 94419.71 1.11x BenchmarkEncode50x20x1M-32 19812.81 24344.67 1.23x BenchmarkEncode17x3x16M-32 32642.93 33200.32 1.02x BenchmarkEncode_8x4x8M-32 29277.52 30261.21 1.03x BenchmarkEncode_12x4x12M-32 30355.67 30589.14 1.01x BenchmarkEncode_16x4x16M-32 31023.66 31102.39 1.00x BenchmarkEncode_16x4x32M-32 31912.44 31201.82 0.98x BenchmarkEncode_16x4x64M-32 31846.32 30589.65 0.96x BenchmarkEncode_8x5x8M-32 24003.28 28351.84 1.18x BenchmarkEncode_8x6x8M-32 23184.41 26707.91 1.15x BenchmarkEncode_8x7x8M-32 21623.86 25354.03 1.17x BenchmarkEncode_8x9x8M-32 22943.85 23321.13 1.02x BenchmarkEncode_8x10x8M-32 21809.31 22841.68 1.05x BenchmarkEncode_8x11x8M-32 21637.77 22735.06 1.05x BenchmarkEncode_8x8x05M-32 55606.22 61311.47 1.10x BenchmarkEncode_8x8x1M-32 65351.80 65830.73 1.01x BenchmarkEncode_8x8x8M-32 24193.01 24754.07 1.02x BenchmarkEncode_8x8x32M-32 23303.06 23644.60 1.01x BenchmarkEncode_24x8x24M-32 29041.76 
26549.54 0.91x BenchmarkEncode_24x8x48M-32 29900.52 26322.51 0.88x BenchmarkVerify10x2x10000-32 13685.12 14717.10 1.08x BenchmarkVerify10x2x1M-32 24378.43 26424.72 1.08x BenchmarkVerify5x2x1M-32 16535.79 17495.41 1.06x BenchmarkVerify10x4x1M-32 14248.35 15484.96 1.09x BenchmarkVerify50x20x1M-32 10180.79 11863.85 1.17x BenchmarkVerify10x4x16M-32 13214.53 13283.71 1.01x BenchmarkReconstruct10x2x10000-32 35799.16 36854.89 1.03x BenchmarkReconstruct50x5x50000-32 33049.47 39034.89 1.18x BenchmarkReconstruct10x2x1M-32 66326.88 72061.06 1.09x BenchmarkReconstruct5x2x1M-32 57308.21 58014.92 1.01x BenchmarkReconstruct10x4x1M-32 53711.74 57791.66 1.08x BenchmarkReconstruct50x20x1M-32 20227.09 22991.67 1.14x BenchmarkReconstruct10x4x16M-32 27432.37 26747.32 0.98x BenchmarkReconstructData10x2x10000-32 37030.86 38511.87 1.04x BenchmarkReconstructData50x5x50000-32 33842.07 40802.85 1.21x BenchmarkReconstructData10x2x1M-32 73475.57 77693.87 1.06x BenchmarkReconstructData5x2x1M-32 71809.58 68635.57 0.96x BenchmarkReconstructData10x4x1M-32 65073.27 66736.88 1.03x BenchmarkReconstructData50x20x1M-32 29181.41 34464.76 1.18x BenchmarkReconstructData10x4x16M-32 33649.09 35066.75 1.04x BenchmarkReconstructP10x2x10000-32 129819.98 128086.76 0.99x BenchmarkReconstructP10x5x20000-32 183073.89 176202.21 0.96x BenchmarkParallel_8x8x64K-32 149327.33 158153.67 1.06x BenchmarkParallel_8x8x05M-32 24083.89 24079.69 1.00x BenchmarkParallel_20x10x05M-32 27322.20 27070.35 0.99x BenchmarkParallel_8x8x1M-32 23430.78 24064.83 1.03x BenchmarkParallel_8x8x8M-32 23480.86 23897.31 1.02x BenchmarkParallel_8x8x32M-32 23701.99 24294.27 1.02x BenchmarkParallel_8x3x1M-32 28351.11 28899.03 1.02x BenchmarkParallel_8x4x1M-32 27407.34 27124.76 0.99x BenchmarkParallel_8x5x1M-32 25842.27 26197.58 1.01x BenchmarkStreamEncode10x2x10000-32 16629.76 17012.26 1.02x BenchmarkStreamEncode100x20x10000-32 1987.58 3732.83 1.88x BenchmarkStreamEncode17x3x1M-32 11413.34 12948.97 1.13x BenchmarkStreamEncode10x4x16M-32 8772.66 
9445.26 1.08x BenchmarkStreamEncode5x2x1M-32 12201.21 13629.70 1.12x BenchmarkStreamEncode10x2x1M-32 13086.64 13731.34 1.05x BenchmarkStreamEncode10x4x1M-32 11969.16 12775.92 1.07x BenchmarkStreamEncode50x20x1M-32 7276.61 8621.18 1.18x BenchmarkStreamEncode17x3x16M-32 10492.40 10920.52 1.04x BenchmarkStreamVerify10x2x10000-32 7264.00 7129.49 0.98x BenchmarkStreamVerify50x5x50000-32 6046.07 7241.62 1.20x BenchmarkStreamVerify10x2x1M-32 8466.05 8866.77 1.05x BenchmarkStreamVerify5x2x1M-32 5873.31 6502.39 1.11x BenchmarkStreamVerify10x4x1M-32 6254.95 6427.09 1.03x BenchmarkStreamVerify50x20x1M-32 4819.76 5223.20 1.08x BenchmarkStreamVerify10x4x16M-32 6078.79 5512.40 0.91x ```master
parent
5593e2b2dd
commit
1bb4d699e1
152
_gen/gen.go
152
_gen/gen.go
|
@ -2,7 +2,8 @@
|
||||||
// +build generate
|
// +build generate
|
||||||
|
|
||||||
//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
|
//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon
|
||||||
//go:generate gofmt -w ../galois_gen_switch_amd64.go
|
//go:generate go fmt ../galois_gen_switch_amd64.go
|
||||||
|
//go:generate go fmt ../galois_gen_amd64.go
|
||||||
|
|
||||||
package main
|
package main
|
||||||
|
|
||||||
|
@ -36,14 +37,15 @@ func main() {
|
||||||
Constraint(buildtags.Not("nogen").ToConstraint())
|
Constraint(buildtags.Not("nogen").ToConstraint())
|
||||||
Constraint(buildtags.Term("gc").ToConstraint())
|
Constraint(buildtags.Term("gc").ToConstraint())
|
||||||
|
|
||||||
const perLoopBits = 5
|
const perLoopBits = 6
|
||||||
const perLoop = 1 << perLoopBits
|
const perLoop = 1 << perLoopBits
|
||||||
|
|
||||||
for i := 1; i <= inputMax; i++ {
|
for i := 1; i <= inputMax; i++ {
|
||||||
for j := 1; j <= outputMax; j++ {
|
for j := 1; j <= outputMax; j++ {
|
||||||
//genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true)
|
|
||||||
genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false)
|
genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false)
|
||||||
genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false)
|
genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false)
|
||||||
|
genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true)
|
||||||
|
genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
f, err := os.Create("../galois_gen_switch_amd64.go")
|
f, err := os.Create("../galois_gen_switch_amd64.go")
|
||||||
|
@ -62,19 +64,26 @@ func main() {
|
||||||
|
|
||||||
package reedsolomon
|
package reedsolomon
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
`)
|
`)
|
||||||
|
|
||||||
w.WriteString("const avx2CodeGen = true\n")
|
w.WriteString(fmt.Sprintf(`const (
|
||||||
w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax))
|
avx2CodeGen = true
|
||||||
|
maxAvx2Inputs = %d
|
||||||
|
maxAvx2Outputs = %d
|
||||||
|
minAvx2Size = %d
|
||||||
|
avxSizeMask = maxInt - (minAvx2Size-1)
|
||||||
|
)`, inputMax, outputMax, perLoop))
|
||||||
w.WriteString(`
|
w.WriteString(`
|
||||||
|
|
||||||
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
n := stop-start
|
n := (stop-start) & avxSizeMask
|
||||||
|
|
||||||
`)
|
`)
|
||||||
|
|
||||||
w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits))
|
|
||||||
w.WriteString(`switch len(in) {
|
w.WriteString(`switch len(in) {
|
||||||
`)
|
`)
|
||||||
for in, defs := range switchDefs[:] {
|
for in, defs := range switchDefs[:] {
|
||||||
|
@ -88,6 +97,25 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
w.WriteString(`}
|
w.WriteString(`}
|
||||||
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
|
n := (stop-start) & avxSizeMask
|
||||||
|
|
||||||
|
`)
|
||||||
|
|
||||||
|
w.WriteString(`switch len(in) {
|
||||||
|
`)
|
||||||
|
for in, defs := range switchDefsX[:] {
|
||||||
|
w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1))
|
||||||
|
for out, def := range defs[:] {
|
||||||
|
w.WriteString(fmt.Sprintf(" case %d:\n", out+1))
|
||||||
|
w.WriteString(def)
|
||||||
|
}
|
||||||
|
w.WriteString("}\n")
|
||||||
|
}
|
||||||
|
w.WriteString(`}
|
||||||
|
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
||||||
|
}
|
||||||
`)
|
`)
|
||||||
Generate()
|
Generate()
|
||||||
}
|
}
|
||||||
|
@ -129,12 +157,21 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
x := ""
|
||||||
|
if xor {
|
||||||
|
x = "Xor"
|
||||||
|
}
|
||||||
|
|
||||||
TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
|
TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
|
||||||
|
|
||||||
// SWITCH DEFINITION:
|
// SWITCH DEFINITION:
|
||||||
s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs)
|
s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x)
|
||||||
s += fmt.Sprintf("\t\t\t\treturn n\n")
|
s += fmt.Sprintf("\t\t\t\treturn n\n")
|
||||||
switchDefs[inputs-1][outputs-1] = s
|
if xor {
|
||||||
|
switchDefsX[inputs-1][outputs-1] = s
|
||||||
|
} else {
|
||||||
|
switchDefs[inputs-1][outputs-1] = s
|
||||||
|
}
|
||||||
|
|
||||||
if loadNone {
|
if loadNone {
|
||||||
Comment("Loading no tables to registers")
|
Comment("Loading no tables to registers")
|
||||||
|
@ -197,7 +234,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
if err != nil {
|
if err != nil {
|
||||||
panic(err)
|
panic(err)
|
||||||
}
|
}
|
||||||
outBase := addr.Addr
|
|
||||||
outSlicePtr := GP64()
|
outSlicePtr := GP64()
|
||||||
MOVQ(addr.Addr, outSlicePtr)
|
MOVQ(addr.Addr, outSlicePtr)
|
||||||
for i := range dst {
|
for i := range dst {
|
||||||
|
@ -241,13 +277,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
SHRQ(U8(perLoopBits), length)
|
SHRQ(U8(perLoopBits), length)
|
||||||
}
|
}
|
||||||
Label(name + "_loop")
|
Label(name + "_loop")
|
||||||
if xor {
|
|
||||||
|
// Load data before loop or during first iteration?
|
||||||
|
// No clear winner.
|
||||||
|
preloadInput := xor && false
|
||||||
|
if preloadInput {
|
||||||
Commentf("Load %d outputs", outputs)
|
Commentf("Load %d outputs", outputs)
|
||||||
} else {
|
for i := range dst {
|
||||||
Commentf("Clear %d outputs", outputs)
|
|
||||||
}
|
|
||||||
for i := range dst {
|
|
||||||
if xor {
|
|
||||||
if regDst {
|
if regDst {
|
||||||
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
|
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
|
||||||
if prefetchDst > 0 {
|
if prefetchDst > 0 {
|
||||||
|
@ -256,13 +292,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
ptr := GP64()
|
ptr := GP64()
|
||||||
MOVQ(outBase, ptr)
|
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
|
||||||
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
|
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
|
||||||
if prefetchDst > 0 {
|
if prefetchDst > 0 {
|
||||||
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
|
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
VPXOR(dst[i], dst[i], dst[i])
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -279,6 +313,22 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
VPAND(lowMask, inLow, inLow)
|
VPAND(lowMask, inLow, inLow)
|
||||||
VPAND(lowMask, inHigh, inHigh)
|
VPAND(lowMask, inHigh, inHigh)
|
||||||
for j := range dst {
|
for j := range dst {
|
||||||
|
//Commentf(" xor:%v i: %v", xor, i)
|
||||||
|
if !preloadInput && xor && i == 0 {
|
||||||
|
if regDst {
|
||||||
|
VMOVDQU(Mem{Base: dstPtr[j]}, dst[j])
|
||||||
|
if prefetchDst > 0 {
|
||||||
|
PREFETCHT0(Mem{Base: dstPtr[j], Disp: prefetchDst})
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
ptr := GP64()
|
||||||
|
MOVQ(Mem{Base: outSlicePtr, Disp: j * 24}, ptr)
|
||||||
|
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[j])
|
||||||
|
if prefetchDst > 0 {
|
||||||
|
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
if loadNone {
|
if loadNone {
|
||||||
VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow)
|
VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow)
|
||||||
VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh)
|
VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh)
|
||||||
|
@ -288,8 +338,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||||
VPSHUFB(inLow, inLo[i*outputs+j], lookLow)
|
VPSHUFB(inLow, inLo[i*outputs+j], lookLow)
|
||||||
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
|
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
|
||||||
}
|
}
|
||||||
VPXOR(lookLow, lookHigh, lookLow)
|
if i == 0 && !xor {
|
||||||
VPXOR(lookLow, dst[j], dst[j])
|
// We don't have any existing data, write directly.
|
||||||
|
VPXOR(lookLow, lookHigh, dst[j])
|
||||||
|
} else {
|
||||||
|
VPXOR(lookLow, lookHigh, lookLow)
|
||||||
|
VPXOR(lookLow, dst[j], dst[j])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Commentf("Store %d outputs", outputs)
|
Commentf("Store %d outputs", outputs)
|
||||||
|
@ -340,35 +395,42 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
|
||||||
// Load shuffle masks on every use.
|
// Load shuffle masks on every use.
|
||||||
var loadNone bool
|
var loadNone bool
|
||||||
// Use registers for destination registers.
|
// Use registers for destination registers.
|
||||||
var regDst = false
|
var regDst = true
|
||||||
var reloadLength = false
|
var reloadLength = false
|
||||||
|
|
||||||
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
|
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
|
||||||
est := total*2 + outputs + 5
|
est := total*4 + outputs + 7
|
||||||
if outputs == 1 {
|
if outputs == 1 {
|
||||||
// We don't need to keep a copy of the input if only 1 output.
|
// We don't need to keep a copy of the input if only 1 output.
|
||||||
est -= 2
|
est -= 2
|
||||||
}
|
}
|
||||||
|
|
||||||
if true || est > 16 {
|
if est > 16 {
|
||||||
loadNone = true
|
loadNone = true
|
||||||
// We run out of GP registers first, now.
|
// We run out of GP registers first, now.
|
||||||
if inputs+outputs > 13 {
|
if inputs+outputs > 13 {
|
||||||
regDst = false
|
regDst = false
|
||||||
}
|
}
|
||||||
// Save one register by reloading length.
|
// Save one register by reloading length.
|
||||||
if true || inputs+outputs > 12 && regDst {
|
if inputs+outputs > 12 && regDst {
|
||||||
reloadLength = true
|
reloadLength = true
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
|
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
|
||||||
|
x := ""
|
||||||
|
if xor {
|
||||||
|
x = "Xor"
|
||||||
|
}
|
||||||
// SWITCH DEFINITION:
|
// SWITCH DEFINITION:
|
||||||
s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits)
|
//s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits)
|
||||||
s += fmt.Sprintf(" mulAvxTwo_%dx%d_64(matrix, in, out, start, n)\n", inputs, outputs)
|
s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x)
|
||||||
s += fmt.Sprintf("\t\t\t\treturn n\n")
|
s += fmt.Sprintf("\t\t\t\treturn n\n")
|
||||||
switchDefs[inputs-1][outputs-1] = s
|
if xor {
|
||||||
|
switchDefsX[inputs-1][outputs-1] = s
|
||||||
|
} else {
|
||||||
|
switchDefs[inputs-1][outputs-1] = s
|
||||||
|
}
|
||||||
|
|
||||||
if loadNone {
|
if loadNone {
|
||||||
Comment("Loading no tables to registers")
|
Comment("Loading no tables to registers")
|
||||||
|
@ -474,33 +536,31 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
|
||||||
VPBROADCASTB(lowMask.AsX(), lowMask)
|
VPBROADCASTB(lowMask.AsX(), lowMask)
|
||||||
|
|
||||||
if reloadLength {
|
if reloadLength {
|
||||||
|
Commentf("Reload length to save a register")
|
||||||
length = Load(Param("n"), GP64())
|
length = Load(Param("n"), GP64())
|
||||||
SHRQ(U8(perLoopBits), length)
|
SHRQ(U8(perLoopBits), length)
|
||||||
}
|
}
|
||||||
Label(name + "_loop")
|
Label(name + "_loop")
|
||||||
|
|
||||||
if xor {
|
if xor {
|
||||||
Commentf("Load %d outputs", outputs)
|
Commentf("Load %d outputs", outputs)
|
||||||
} else {
|
for i := range dst {
|
||||||
Commentf("Clear %d outputs", outputs)
|
|
||||||
}
|
|
||||||
for i := range dst {
|
|
||||||
if xor {
|
|
||||||
if regDst {
|
if regDst {
|
||||||
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
|
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
|
||||||
|
VMOVDQU(Mem{Base: dstPtr[i], Disp: 32}, dst2[i])
|
||||||
if prefetchDst > 0 {
|
if prefetchDst > 0 {
|
||||||
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
|
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
|
||||||
}
|
}
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
ptr := GP64()
|
ptr := GP64()
|
||||||
MOVQ(outBase, ptr)
|
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
|
||||||
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
|
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
|
||||||
|
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1, Disp: 32}, dst2[i])
|
||||||
|
|
||||||
if prefetchDst > 0 {
|
if prefetchDst > 0 {
|
||||||
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
|
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
VPXOR(dst[i], dst[i], dst[i])
|
|
||||||
VPXOR(dst2[i], dst2[i], dst2[i])
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -536,10 +596,16 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) {
|
||||||
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
|
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
|
||||||
VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2)
|
VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2)
|
||||||
}
|
}
|
||||||
VPXOR(lookLow, lookHigh, lookLow)
|
if i == 0 && !xor {
|
||||||
VPXOR(lookLow2, lookHigh2, lookLow2)
|
// We don't have any existing data, write directly.
|
||||||
VPXOR(lookLow, dst[j], dst[j])
|
VPXOR(lookLow, lookHigh, dst[j])
|
||||||
VPXOR(lookLow2, dst2[j], dst2[j])
|
VPXOR(lookLow2, lookHigh2, dst2[j])
|
||||||
|
} else {
|
||||||
|
VPXOR(lookLow, lookHigh, lookLow)
|
||||||
|
VPXOR(lookLow2, lookHigh2, lookLow2)
|
||||||
|
VPXOR(lookLow, dst[j], dst[j])
|
||||||
|
VPXOR(lookLow2, dst2[j], dst2[j])
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Commentf("Store %d outputs", outputs)
|
Commentf("Store %d outputs", outputs)
|
||||||
|
|
|
@ -901,7 +901,7 @@ func galExp(a byte, n int) byte {
|
||||||
return expTable[logResult]
|
return expTable[logResult]
|
||||||
}
|
}
|
||||||
|
|
||||||
func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
|
func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte {
|
||||||
if !avx2CodeGen {
|
if !avx2CodeGen {
|
||||||
panic("codegen not enabled")
|
panic("codegen not enabled")
|
||||||
}
|
}
|
||||||
|
@ -915,7 +915,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte
|
||||||
dst = dst[:wantBytes]
|
dst = dst[:wantBytes]
|
||||||
}
|
}
|
||||||
for i, row := range matrixRows[:outputs] {
|
for i, row := range matrixRows[:outputs] {
|
||||||
for j, idx := range row[:inputs] {
|
for j, idx := range row[inIdx : inIdx+inputs] {
|
||||||
dstIdx := (j*outputs + i) * 64
|
dstIdx := (j*outputs + i) * 64
|
||||||
dstPart := dst[dstIdx:]
|
dstPart := dst[dstIdx:]
|
||||||
dstPart = dstPart[:64]
|
dstPart = dstPart[:64]
|
||||||
|
|
|
@ -225,8 +225,9 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp
|
||||||
|
|
||||||
// Perform the same as codeSomeShards, but taking advantage of
|
// Perform the same as codeSomeShards, but taking advantage of
|
||||||
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
||||||
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
// Process using no goroutines
|
// Process using no goroutines
|
||||||
|
outputCount := len(outputs)
|
||||||
start, end := 0, r.o.perRound
|
start, end := 0, r.o.perRound
|
||||||
if end > byteCount {
|
if end > byteCount {
|
||||||
end = byteCount
|
end = byteCount
|
||||||
|
@ -272,7 +273,8 @@ func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
||||||
|
|
||||||
// Perform the same as codeSomeShards, but taking advantage of
|
// Perform the same as codeSomeShards, but taking advantage of
|
||||||
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
||||||
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
|
outputCount := len(outputs)
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
do := byteCount / r.o.maxGoroutines
|
do := byteCount / r.o.maxGoroutines
|
||||||
if do < r.o.minSplitSize {
|
if do < r.o.minSplitSize {
|
||||||
|
|
|
@ -331,9 +331,9 @@ func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bo
|
||||||
}
|
}
|
||||||
|
|
||||||
if parallel {
|
if parallel {
|
||||||
r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
|
r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], len(shards[0]))
|
||||||
} else {
|
} else {
|
||||||
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
|
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
|
||||||
}
|
}
|
||||||
|
|
||||||
correct, _ := r.Verify(shards)
|
correct, _ := r.Verify(shards)
|
||||||
|
|
|
@ -107,6 +107,9 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||||
in = in[done:]
|
in = in[done:]
|
||||||
out = out[done:]
|
out = out[done:]
|
||||||
}
|
}
|
||||||
|
if len(in) == 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
out = out[:len(in)]
|
out = out[:len(in)]
|
||||||
mt := mulTable[c][:256]
|
mt := mulTable[c][:256]
|
||||||
for i := range in {
|
for i := range in {
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
41383
galois_gen_amd64.s
41383
galois_gen_amd64.s
File diff suppressed because it is too large
Load Diff
|
@ -3,10 +3,16 @@
|
||||||
|
|
||||||
package reedsolomon
|
package reedsolomon
|
||||||
|
|
||||||
const maxAvx2Inputs = 0
|
const maxAvx2Inputs = 1
|
||||||
const maxAvx2Outputs = 0
|
const maxAvx2Outputs = 1
|
||||||
|
const minAvx2Size = 1
|
||||||
|
const avxSizeMask = 0
|
||||||
const avx2CodeGen = false
|
const avx2CodeGen = false
|
||||||
|
|
||||||
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
panic("avx2 codegen not available")
|
panic("avx2 codegen not available")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
|
panic("avx2 codegen not available")
|
||||||
|
}
|
||||||
|
|
|
@ -5,29 +5,31 @@
|
||||||
|
|
||||||
package reedsolomon
|
package reedsolomon
|
||||||
|
|
||||||
import "fmt"
|
import (
|
||||||
|
"fmt"
|
||||||
|
)
|
||||||
|
|
||||||
const avx2CodeGen = true
|
const (
|
||||||
const maxAvx2Inputs = 10
|
avx2CodeGen = true
|
||||||
const maxAvx2Outputs = 10
|
maxAvx2Inputs = 10
|
||||||
|
maxAvx2Outputs = 10
|
||||||
|
minAvx2Size = 64
|
||||||
|
avxSizeMask = maxInt - (minAvx2Size - 1)
|
||||||
|
)
|
||||||
|
|
||||||
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
n := stop - start
|
n := (stop - start) & avxSizeMask
|
||||||
n = (n >> 5) << 5
|
|
||||||
|
|
||||||
switch len(in) {
|
switch len(in) {
|
||||||
case 1:
|
case 1:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_1x1_64(matrix, in, out, start, n)
|
mulAvxTwo_1x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_1x2_64(matrix, in, out, start, n)
|
mulAvxTwo_1x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_1x3_64(matrix, in, out, start, n)
|
mulAvxTwo_1x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -55,15 +57,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 2:
|
case 2:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_2x1_64(matrix, in, out, start, n)
|
mulAvxTwo_2x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_2x2_64(matrix, in, out, start, n)
|
mulAvxTwo_2x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_2x3_64(matrix, in, out, start, n)
|
mulAvxTwo_2x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -91,15 +90,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 3:
|
case 3:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_3x1_64(matrix, in, out, start, n)
|
mulAvxTwo_3x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_3x2_64(matrix, in, out, start, n)
|
mulAvxTwo_3x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_3x3_64(matrix, in, out, start, n)
|
mulAvxTwo_3x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -127,15 +123,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 4:
|
case 4:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_4x1_64(matrix, in, out, start, n)
|
mulAvxTwo_4x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_4x2_64(matrix, in, out, start, n)
|
mulAvxTwo_4x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_4x3_64(matrix, in, out, start, n)
|
mulAvxTwo_4x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -163,15 +156,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 5:
|
case 5:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_5x1_64(matrix, in, out, start, n)
|
mulAvxTwo_5x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_5x2_64(matrix, in, out, start, n)
|
mulAvxTwo_5x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_5x3_64(matrix, in, out, start, n)
|
mulAvxTwo_5x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -199,15 +189,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 6:
|
case 6:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_6x1_64(matrix, in, out, start, n)
|
mulAvxTwo_6x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_6x2_64(matrix, in, out, start, n)
|
mulAvxTwo_6x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_6x3_64(matrix, in, out, start, n)
|
mulAvxTwo_6x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -235,15 +222,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 7:
|
case 7:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_7x1_64(matrix, in, out, start, n)
|
mulAvxTwo_7x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_7x2_64(matrix, in, out, start, n)
|
mulAvxTwo_7x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_7x3_64(matrix, in, out, start, n)
|
mulAvxTwo_7x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -271,15 +255,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 8:
|
case 8:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_8x1_64(matrix, in, out, start, n)
|
mulAvxTwo_8x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_8x2_64(matrix, in, out, start, n)
|
mulAvxTwo_8x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_8x3_64(matrix, in, out, start, n)
|
mulAvxTwo_8x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -307,15 +288,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 9:
|
case 9:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_9x1_64(matrix, in, out, start, n)
|
mulAvxTwo_9x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_9x2_64(matrix, in, out, start, n)
|
mulAvxTwo_9x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_9x3_64(matrix, in, out, start, n)
|
mulAvxTwo_9x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -343,15 +321,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
case 10:
|
case 10:
|
||||||
switch len(out) {
|
switch len(out) {
|
||||||
case 1:
|
case 1:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_10x1_64(matrix, in, out, start, n)
|
mulAvxTwo_10x1_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 2:
|
case 2:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_10x2_64(matrix, in, out, start, n)
|
mulAvxTwo_10x2_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 3:
|
case 3:
|
||||||
n = (n >> 6) << 6
|
|
||||||
mulAvxTwo_10x3_64(matrix, in, out, start, n)
|
mulAvxTwo_10x3_64(matrix, in, out, start, n)
|
||||||
return n
|
return n
|
||||||
case 4:
|
case 4:
|
||||||
|
@ -379,3 +354,341 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
}
|
}
|
||||||
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int {
|
||||||
|
n := (stop - start) & avxSizeMask
|
||||||
|
|
||||||
|
switch len(in) {
|
||||||
|
case 1:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_1x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_1x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_1x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_1x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_1x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_1x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_1x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_1x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_1x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_1x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 2:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_2x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_2x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_2x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_2x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_2x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_2x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_2x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_2x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_2x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_2x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 3:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_3x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_3x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_3x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_3x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_3x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_3x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_3x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_3x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_3x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_3x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 4:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_4x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_4x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_4x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_4x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_4x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_4x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_4x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_4x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_4x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_4x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 5:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_5x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_5x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_5x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_5x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_5x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_5x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_5x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_5x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_5x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_5x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 6:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_6x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_6x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_6x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_6x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_6x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_6x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_6x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_6x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_6x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_6x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 7:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_7x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_7x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_7x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_7x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_7x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_7x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_7x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_7x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_7x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_7x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 8:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_8x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_8x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_8x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_8x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_8x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_8x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_8x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_8x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_8x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_8x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 9:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_9x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_9x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_9x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_9x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_9x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_9x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_9x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_9x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_9x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_9x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
case 10:
|
||||||
|
switch len(out) {
|
||||||
|
case 1:
|
||||||
|
mulAvxTwo_10x1_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 2:
|
||||||
|
mulAvxTwo_10x2_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 3:
|
||||||
|
mulAvxTwo_10x3_64Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 4:
|
||||||
|
mulAvxTwo_10x4Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 5:
|
||||||
|
mulAvxTwo_10x5Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 6:
|
||||||
|
mulAvxTwo_10x6Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 7:
|
||||||
|
mulAvxTwo_10x7Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 8:
|
||||||
|
mulAvxTwo_10x8Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 9:
|
||||||
|
mulAvxTwo_10x9Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
case 10:
|
||||||
|
mulAvxTwo_10x10Xor(matrix, in, out, start, n)
|
||||||
|
return n
|
||||||
|
}
|
||||||
|
}
|
||||||
|
panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
|
||||||
|
}
|
||||||
|
|
|
@ -5,10 +5,10 @@
|
||||||
|
|
||||||
package reedsolomon
|
package reedsolomon
|
||||||
|
|
||||||
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
panic("codeSomeShardsAvx512 should not be called if built without asm")
|
panic("codeSomeShardsAvx512 should not be called if built without asm")
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
panic("codeSomeShardsAvx512P should not be called if built without asm")
|
panic("codeSomeShardsAvx512P should not be called if built without asm")
|
||||||
}
|
}
|
||||||
|
|
314
reedsolomon.go
314
reedsolomon.go
|
@ -112,6 +112,9 @@ const (
|
||||||
avx2CodeGenMinSize = 64
|
avx2CodeGenMinSize = 64
|
||||||
avx2CodeGenMinShards = 3
|
avx2CodeGenMinShards = 3
|
||||||
avx2CodeGenMaxGoroutines = 8
|
avx2CodeGenMaxGoroutines = 8
|
||||||
|
|
||||||
|
intSize = 32 << (^uint(0) >> 63) // 32 or 64
|
||||||
|
maxInt = 1<<(intSize-1) - 1
|
||||||
)
|
)
|
||||||
|
|
||||||
// reedSolomon contains a matrix for a specific
|
// reedSolomon contains a matrix for a specific
|
||||||
|
@ -291,6 +294,24 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
||||||
|
|
||||||
// Calculate what we want per round
|
// Calculate what we want per round
|
||||||
r.o.perRound = cpuid.CPU.Cache.L2
|
r.o.perRound = cpuid.CPU.Cache.L2
|
||||||
|
|
||||||
|
divide := parityShards + 1
|
||||||
|
if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) {
|
||||||
|
// Base on L1 cache if we have many inputs.
|
||||||
|
r.o.perRound = cpuid.CPU.Cache.L1D
|
||||||
|
divide = 0
|
||||||
|
if dataShards > maxAvx2Inputs {
|
||||||
|
divide += maxAvx2Inputs
|
||||||
|
} else {
|
||||||
|
divide += dataShards
|
||||||
|
}
|
||||||
|
if parityShards > maxAvx2Inputs {
|
||||||
|
divide += maxAvx2Outputs
|
||||||
|
} else {
|
||||||
|
divide += parityShards
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if r.o.perRound <= 0 {
|
if r.o.perRound <= 0 {
|
||||||
// Set to 128K if undetectable.
|
// Set to 128K if undetectable.
|
||||||
r.o.perRound = 128 << 10
|
r.o.perRound = 128 << 10
|
||||||
|
@ -300,8 +321,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
||||||
// If multiple threads per core, make sure they don't contend for cache.
|
// If multiple threads per core, make sure they don't contend for cache.
|
||||||
r.o.perRound /= cpuid.CPU.ThreadsPerCore
|
r.o.perRound /= cpuid.CPU.ThreadsPerCore
|
||||||
}
|
}
|
||||||
|
|
||||||
// 1 input + parity must fit in cache, and we add one more to be safer.
|
// 1 input + parity must fit in cache, and we add one more to be safer.
|
||||||
r.o.perRound = r.o.perRound / (1 + parityShards)
|
r.o.perRound = r.o.perRound / divide
|
||||||
// Align to 64 bytes.
|
// Align to 64 bytes.
|
||||||
r.o.perRound = ((r.o.perRound + 63) / 64) * 64
|
r.o.perRound = ((r.o.perRound + 63) / 64) * 64
|
||||||
|
|
||||||
|
@ -319,10 +341,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if r.o.perRound < r.o.minSplitSize {
|
|
||||||
r.o.perRound = r.o.minSplitSize
|
|
||||||
}
|
|
||||||
|
|
||||||
if r.o.shardSize > 0 {
|
if r.o.shardSize > 0 {
|
||||||
p := runtime.GOMAXPROCS(0)
|
p := runtime.GOMAXPROCS(0)
|
||||||
if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 {
|
if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 {
|
||||||
|
@ -347,7 +365,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
||||||
|
|
||||||
// Generated AVX2 does not need data to stay in L1 cache between runs.
|
// Generated AVX2 does not need data to stay in L1 cache between runs.
|
||||||
// We will be purely limited by RAM speed.
|
// We will be purely limited by RAM speed.
|
||||||
if r.canAVX2C(avx2CodeGenMinSize, r.DataShards, r.ParityShards) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
|
if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines {
|
||||||
r.o.maxGoroutines = avx2CodeGenMaxGoroutines
|
r.o.maxGoroutines = avx2CodeGenMaxGoroutines
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -366,8 +384,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
||||||
}
|
}
|
||||||
|
|
||||||
if avx2CodeGen && r.o.useAVX2 {
|
if avx2CodeGen && r.o.useAVX2 {
|
||||||
|
sz := r.DataShards * r.ParityShards * 2 * 32
|
||||||
r.mPool.New = func() interface{} {
|
r.mPool.New = func() interface{} {
|
||||||
return make([]byte, r.Shards*2*32)
|
return make([]byte, sz)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return &r, err
|
return &r, err
|
||||||
|
@ -398,7 +417,7 @@ func (r *reedSolomon) Encode(shards [][]byte) error {
|
||||||
output := shards[r.DataShards:]
|
output := shards[r.DataShards:]
|
||||||
|
|
||||||
// Do the coding.
|
// Do the coding.
|
||||||
r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0]))
|
r.codeSomeShards(r.parity, shards[0:r.DataShards], output[:r.ParityShards], len(shards[0]))
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -558,7 +577,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
|
||||||
toCheck := shards[r.DataShards:]
|
toCheck := shards[r.DataShards:]
|
||||||
|
|
||||||
// Do the checking.
|
// Do the checking.
|
||||||
return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil
|
return r.checkSomeShards(r.parity, shards[:r.DataShards], toCheck[:r.ParityShards], len(shards[0])), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
|
func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
|
||||||
|
@ -576,19 +595,19 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool {
|
||||||
// The number of outputs computed, and the
|
// The number of outputs computed, and the
|
||||||
// number of matrix rows used, is determined by
|
// number of matrix rows used, is determined by
|
||||||
// outputCount, which is the number of outputs to compute.
|
// outputCount, which is the number of outputs to compute.
|
||||||
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
if len(outputs) == 0 {
|
if len(outputs) == 0 {
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
switch {
|
switch {
|
||||||
case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2:
|
case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2:
|
||||||
r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount)
|
r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, byteCount)
|
||||||
return
|
return
|
||||||
case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2:
|
case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2:
|
||||||
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount)
|
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, byteCount)
|
||||||
return
|
return
|
||||||
case r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize:
|
case byteCount > r.o.minSplitSize:
|
||||||
r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
|
r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount)
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -598,16 +617,49 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
|
||||||
end = len(inputs[0])
|
end = len(inputs[0])
|
||||||
}
|
}
|
||||||
if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
|
if r.canAVX2C(byteCount, len(inputs), len(outputs)) {
|
||||||
m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
|
m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
|
||||||
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
|
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
|
||||||
r.mPool.Put(m)
|
r.mPool.Put(m)
|
||||||
end = len(inputs[0])
|
end = len(inputs[0])
|
||||||
|
} else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) {
|
||||||
|
end = len(inputs[0])
|
||||||
|
inIdx := 0
|
||||||
|
m := r.mPool.Get().([]byte)
|
||||||
|
defer r.mPool.Put(m)
|
||||||
|
ins := inputs
|
||||||
|
for len(ins) > 0 {
|
||||||
|
inPer := ins
|
||||||
|
if len(inPer) > maxAvx2Inputs {
|
||||||
|
inPer = inPer[:maxAvx2Inputs]
|
||||||
|
}
|
||||||
|
outs := outputs
|
||||||
|
outIdx := 0
|
||||||
|
for len(outs) > 0 {
|
||||||
|
outPer := outs
|
||||||
|
if len(outPer) > maxAvx2Outputs {
|
||||||
|
outPer = outPer[:maxAvx2Outputs]
|
||||||
|
}
|
||||||
|
m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m)
|
||||||
|
if inIdx == 0 {
|
||||||
|
galMulSlicesAvx2(m, inPer, outPer, 0, byteCount)
|
||||||
|
} else {
|
||||||
|
galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount)
|
||||||
|
}
|
||||||
|
start = byteCount & avxSizeMask
|
||||||
|
outIdx += len(outPer)
|
||||||
|
outs = outs[len(outPer):]
|
||||||
|
}
|
||||||
|
inIdx += len(inPer)
|
||||||
|
ins = ins[len(inPer):]
|
||||||
|
}
|
||||||
|
if start >= end {
|
||||||
|
return
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
for start < len(inputs[0]) {
|
for start < len(inputs[0]) {
|
||||||
for c := 0; c < r.DataShards; c++ {
|
for c := 0; c < len(inputs); c++ {
|
||||||
in := inputs[c][start:end]
|
in := inputs[c][start:end]
|
||||||
for iRow := 0; iRow < outputCount; iRow++ {
|
for iRow := 0; iRow < len(outputs); iRow++ {
|
||||||
if c == 0 {
|
if c == 0 {
|
||||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o)
|
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o)
|
||||||
} else {
|
} else {
|
||||||
|
@ -625,15 +677,21 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
|
||||||
|
|
||||||
// Perform the same as codeSomeShards, but split the workload into
|
// Perform the same as codeSomeShards, but split the workload into
|
||||||
// several goroutines.
|
// several goroutines.
|
||||||
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
var wg sync.WaitGroup
|
var wg sync.WaitGroup
|
||||||
gor := r.o.maxGoroutines
|
gor := r.o.maxGoroutines
|
||||||
|
|
||||||
var avx2Matrix []byte
|
var avx2Matrix []byte
|
||||||
useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
|
useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs))
|
||||||
if useAvx2 {
|
if useAvx2 {
|
||||||
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
|
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte))
|
||||||
defer r.mPool.Put(avx2Matrix)
|
defer r.mPool.Put(avx2Matrix)
|
||||||
|
} else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards &&
|
||||||
|
r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) {
|
||||||
|
// It appears there is a switchover point at around 10MB where
|
||||||
|
// Regular processing is faster...
|
||||||
|
r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount)
|
||||||
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
do := byteCount / gor
|
do := byteCount / gor
|
||||||
|
@ -641,6 +699,40 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
|
||||||
do = r.o.minSplitSize
|
do = r.o.minSplitSize
|
||||||
}
|
}
|
||||||
|
|
||||||
|
exec := func(start, stop int) {
|
||||||
|
if useAvx2 && stop-start >= 64 {
|
||||||
|
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
|
||||||
|
}
|
||||||
|
|
||||||
|
lstart, lstop := start, start+r.o.perRound
|
||||||
|
if lstop > stop {
|
||||||
|
lstop = stop
|
||||||
|
}
|
||||||
|
for lstart < stop {
|
||||||
|
for c := 0; c < len(inputs); c++ {
|
||||||
|
in := inputs[c][lstart:lstop]
|
||||||
|
for iRow := 0; iRow < len(outputs); iRow++ {
|
||||||
|
if c == 0 {
|
||||||
|
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
||||||
|
} else {
|
||||||
|
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lstart = lstop
|
||||||
|
lstop += r.o.perRound
|
||||||
|
if lstop > stop {
|
||||||
|
lstop = stop
|
||||||
|
}
|
||||||
|
}
|
||||||
|
wg.Done()
|
||||||
|
}
|
||||||
|
if gor <= 1 {
|
||||||
|
wg.Add(1)
|
||||||
|
exec(0, byteCount)
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
// Make sizes divisible by 64
|
// Make sizes divisible by 64
|
||||||
do = (do + 63) & (^63)
|
do = (do + 63) & (^63)
|
||||||
start := 0
|
start := 0
|
||||||
|
@ -650,34 +742,162 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
|
||||||
}
|
}
|
||||||
|
|
||||||
wg.Add(1)
|
wg.Add(1)
|
||||||
go func(start, stop int) {
|
go exec(start, start+do)
|
||||||
if useAvx2 && stop-start >= 64 {
|
start += do
|
||||||
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
|
}
|
||||||
|
wg.Wait()
|
||||||
|
}
|
||||||
|
|
||||||
|
// Perform the same as codeSomeShards, but split the workload into
|
||||||
|
// several goroutines.
|
||||||
|
func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) {
|
||||||
|
var wg sync.WaitGroup
|
||||||
|
gor := r.o.maxGoroutines
|
||||||
|
|
||||||
|
type state struct {
|
||||||
|
input [][]byte
|
||||||
|
output [][]byte
|
||||||
|
m []byte
|
||||||
|
first bool
|
||||||
|
}
|
||||||
|
// Make a plan...
|
||||||
|
plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs))
|
||||||
|
|
||||||
|
tmp := r.mPool.Get().([]byte)
|
||||||
|
defer func(b []byte) {
|
||||||
|
r.mPool.Put(b)
|
||||||
|
}(tmp)
|
||||||
|
|
||||||
|
// Flips between input first to output first.
|
||||||
|
// We put the smallest data load in the inner loop.
|
||||||
|
if len(inputs) > len(outputs) {
|
||||||
|
inIdx := 0
|
||||||
|
ins := inputs
|
||||||
|
for len(ins) > 0 {
|
||||||
|
inPer := ins
|
||||||
|
if len(inPer) > maxAvx2Inputs {
|
||||||
|
inPer = inPer[:maxAvx2Inputs]
|
||||||
|
}
|
||||||
|
outs := outputs
|
||||||
|
outIdx := 0
|
||||||
|
for len(outs) > 0 {
|
||||||
|
outPer := outs
|
||||||
|
if len(outPer) > maxAvx2Outputs {
|
||||||
|
outPer = outPer[:maxAvx2Outputs]
|
||||||
|
}
|
||||||
|
// Generate local matrix
|
||||||
|
m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
|
||||||
|
tmp = tmp[len(m):]
|
||||||
|
plan = append(plan, state{
|
||||||
|
input: inPer,
|
||||||
|
output: outPer,
|
||||||
|
m: m,
|
||||||
|
first: inIdx == 0,
|
||||||
|
})
|
||||||
|
outIdx += len(outPer)
|
||||||
|
outs = outs[len(outPer):]
|
||||||
|
}
|
||||||
|
inIdx += len(inPer)
|
||||||
|
ins = ins[len(inPer):]
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
outs := outputs
|
||||||
|
outIdx := 0
|
||||||
|
for len(outs) > 0 {
|
||||||
|
outPer := outs
|
||||||
|
if len(outPer) > maxAvx2Outputs {
|
||||||
|
outPer = outPer[:maxAvx2Outputs]
|
||||||
}
|
}
|
||||||
|
|
||||||
lstart, lstop := start, start+r.o.perRound
|
inIdx := 0
|
||||||
|
ins := inputs
|
||||||
|
for len(ins) > 0 {
|
||||||
|
inPer := ins
|
||||||
|
if len(inPer) > maxAvx2Inputs {
|
||||||
|
inPer = inPer[:maxAvx2Inputs]
|
||||||
|
}
|
||||||
|
// Generate local matrix
|
||||||
|
m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp)
|
||||||
|
tmp = tmp[len(m):]
|
||||||
|
//fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound)
|
||||||
|
plan = append(plan, state{
|
||||||
|
input: inPer,
|
||||||
|
output: outPer,
|
||||||
|
m: m,
|
||||||
|
first: inIdx == 0,
|
||||||
|
})
|
||||||
|
inIdx += len(inPer)
|
||||||
|
ins = ins[len(inPer):]
|
||||||
|
}
|
||||||
|
outIdx += len(outPer)
|
||||||
|
outs = outs[len(outPer):]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
do := byteCount / gor
|
||||||
|
if do < r.o.minSplitSize {
|
||||||
|
do = r.o.minSplitSize
|
||||||
|
}
|
||||||
|
|
||||||
|
exec := func(start, stop int) {
|
||||||
|
lstart, lstop := start, start+r.o.perRound
|
||||||
|
if lstop > stop {
|
||||||
|
lstop = stop
|
||||||
|
}
|
||||||
|
for lstart < stop {
|
||||||
|
if lstop-lstart >= minAvx2Size {
|
||||||
|
// Execute plan...
|
||||||
|
for _, p := range plan {
|
||||||
|
if p.first {
|
||||||
|
galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop)
|
||||||
|
} else {
|
||||||
|
galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lstart += (lstop - lstart) & avxSizeMask
|
||||||
|
if lstart == lstop {
|
||||||
|
lstop += r.o.perRound
|
||||||
|
if lstop > stop {
|
||||||
|
lstop = stop
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for c := range inputs {
|
||||||
|
in := inputs[c][lstart:lstop]
|
||||||
|
for iRow := 0; iRow < len(outputs); iRow++ {
|
||||||
|
if c == 0 {
|
||||||
|
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
||||||
|
} else {
|
||||||
|
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
lstart = lstop
|
||||||
|
lstop += r.o.perRound
|
||||||
if lstop > stop {
|
if lstop > stop {
|
||||||
lstop = stop
|
lstop = stop
|
||||||
}
|
}
|
||||||
for lstart < stop {
|
}
|
||||||
for c := 0; c < r.DataShards; c++ {
|
wg.Done()
|
||||||
in := inputs[c][lstart:lstop]
|
}
|
||||||
for iRow := 0; iRow < outputCount; iRow++ {
|
if gor == 1 {
|
||||||
if c == 0 {
|
wg.Add(1)
|
||||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
exec(0, byteCount)
|
||||||
} else {
|
return
|
||||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o)
|
}
|
||||||
}
|
|
||||||
}
|
// Make sizes divisible by 64
|
||||||
}
|
do = (do + 63) & (^63)
|
||||||
lstart = lstop
|
start := 0
|
||||||
lstop += r.o.perRound
|
for start < byteCount {
|
||||||
if lstop > stop {
|
if start+do > byteCount {
|
||||||
lstop = stop
|
do = byteCount - start
|
||||||
}
|
}
|
||||||
}
|
|
||||||
wg.Done()
|
wg.Add(1)
|
||||||
}(start, start+do)
|
go exec(start, start+do)
|
||||||
start += do
|
start += do
|
||||||
}
|
}
|
||||||
wg.Wait()
|
wg.Wait()
|
||||||
|
@ -686,7 +906,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
|
||||||
// checkSomeShards is mostly the same as codeSomeShards,
|
// checkSomeShards is mostly the same as codeSomeShards,
|
||||||
// except this will check values and return
|
// except this will check values and return
|
||||||
// as soon as a difference is found.
|
// as soon as a difference is found.
|
||||||
func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool {
|
||||||
if len(toCheck) == 0 {
|
if len(toCheck) == 0 {
|
||||||
return true
|
return true
|
||||||
}
|
}
|
||||||
|
@ -695,7 +915,7 @@ func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outp
|
||||||
for i := range outputs {
|
for i := range outputs {
|
||||||
outputs[i] = make([]byte, byteCount)
|
outputs[i] = make([]byte, byteCount)
|
||||||
}
|
}
|
||||||
r.codeSomeShards(matrixRows, inputs, outputs, outputCount, byteCount)
|
r.codeSomeShards(matrixRows, inputs, outputs, byteCount)
|
||||||
|
|
||||||
for i, calc := range outputs {
|
for i, calc := range outputs {
|
||||||
if !bytes.Equal(calc, toCheck[i]) {
|
if !bytes.Equal(calc, toCheck[i]) {
|
||||||
|
@ -902,7 +1122,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
|
||||||
outputCount++
|
outputCount++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize)
|
r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize)
|
||||||
|
|
||||||
if dataOnly {
|
if dataOnly {
|
||||||
// Exit out early if we are only interested in the data shards
|
// Exit out early if we are only interested in the data shards
|
||||||
|
@ -928,7 +1148,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
|
||||||
outputCount++
|
outputCount++
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize)
|
r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], shardSize)
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -191,7 +191,7 @@ func TestEncoding(t *testing.T) {
|
||||||
// note that par1 matric will fail on some combinations.
|
// note that par1 matric will fail on some combinations.
|
||||||
var testSizes = [][2]int{
|
var testSizes = [][2]int{
|
||||||
{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
|
{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
|
||||||
{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}}
|
{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20}}
|
||||||
var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
|
var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
|
||||||
var testDataSizesShort = []int{10, 10001, 100003}
|
var testDataSizesShort = []int{10, 10001, 100003}
|
||||||
|
|
||||||
|
@ -893,6 +893,7 @@ func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) {
|
||||||
|
|
||||||
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
|
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
err = r.Encode(shards)
|
err = r.Encode(shards)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -937,7 +938,7 @@ func BenchmarkEncode10x4x1M(b *testing.B) {
|
||||||
benchmarkEncode(b, 10, 4, 1024*1024)
|
benchmarkEncode(b, 10, 4, 1024*1024)
|
||||||
}
|
}
|
||||||
|
|
||||||
// Benchmark 50 data shards and 20 parity shards with 1MB each.
|
// Benchmark 50 data shards and 20 parity shards with 1M each.
|
||||||
func BenchmarkEncode50x20x1M(b *testing.B) {
|
func BenchmarkEncode50x20x1M(b *testing.B) {
|
||||||
benchmarkEncode(b, 50, 20, 1024*1024)
|
benchmarkEncode(b, 50, 20, 1024*1024)
|
||||||
}
|
}
|
||||||
|
@ -989,6 +990,7 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) {
|
||||||
|
|
||||||
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
|
b.SetBytes(int64(shardSize * (dataShards + parityShards)))
|
||||||
b.ResetTimer()
|
b.ResetTimer()
|
||||||
|
b.ReportAllocs()
|
||||||
for i := 0; i < b.N; i++ {
|
for i := 0; i < b.N; i++ {
|
||||||
_, err = r.Verify(shards)
|
_, err = r.Verify(shards)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
@ -1003,7 +1005,7 @@ func BenchmarkVerify10x2x10000(b *testing.B) {
|
||||||
}
|
}
|
||||||
|
|
||||||
// Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
|
// Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
|
||||||
func BenchmarkVerify50x5x50000(b *testing.B) {
|
func BenchmarkVerify50x5x100000(b *testing.B) {
|
||||||
benchmarkVerify(b, 50, 5, 100000)
|
benchmarkVerify(b, 50, 5, 100000)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1359,11 +1361,11 @@ func TestCodeSomeShards(t *testing.T) {
|
||||||
shards, _ := enc.Split(data)
|
shards, _ := enc.Split(data)
|
||||||
|
|
||||||
old := runtime.GOMAXPROCS(1)
|
old := runtime.GOMAXPROCS(1)
|
||||||
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
|
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
|
||||||
|
|
||||||
// hopefully more than 1 CPU
|
// hopefully more than 1 CPU
|
||||||
runtime.GOMAXPROCS(runtime.NumCPU())
|
runtime.GOMAXPROCS(runtime.NumCPU())
|
||||||
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
|
r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
|
||||||
|
|
||||||
// reset MAXPROCS, otherwise testing complains
|
// reset MAXPROCS, otherwise testing complains
|
||||||
runtime.GOMAXPROCS(old)
|
runtime.GOMAXPROCS(old)
|
||||||
|
@ -1642,7 +1644,9 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) {
|
||||||
c := runtime.GOMAXPROCS(0)
|
c := runtime.GOMAXPROCS(0)
|
||||||
|
|
||||||
// Note that concurrency also affects total data size and will make caches less effective.
|
// Note that concurrency also affects total data size and will make caches less effective.
|
||||||
b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB")
|
if testing.Verbose() {
|
||||||
|
b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB")
|
||||||
|
}
|
||||||
// Create independent shards
|
// Create independent shards
|
||||||
shardsCh := make(chan [][]byte, c)
|
shardsCh := make(chan [][]byte, c)
|
||||||
for i := 0; i < c; i++ {
|
for i := 0; i < c; i++ {
|
||||||
|
|
Loading…
Reference in New Issue