reedsolomon-go/galoisAvx512_amd64_test.go

416 lines
12 KiB
Go
Raw Permalink Normal View History

//go:build !noasm && !appengine && !gccgo
// +build !noasm,!appengine,!gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
package reedsolomon
import (
"bytes"
"math/rand"
"testing"
"time"
)
func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
rand.Seed(time.Now().UnixNano())
var size = 1024 * 1024
if testing.Short() {
size = 4096
}
in, out := make([][]byte, inputSize), make([][]byte, dimOut81)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut81]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel81(in, out, &matrix, false)
expect := make([][]byte, len(out))
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
rand.Seed(time.Now().UnixNano())
var size = 1024 * 1024
if testing.Short() {
size = 4096
}
in, out := make([][]byte, inputSize), make([][]byte, dimOut82)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut82]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel82(in, out, &matrix, false)
expect := make([][]byte, len(out))
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel82(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel12(t *testing.T) { testGaloisAvx512Parallelx2(t, 1) }
func TestGaloisAvx512Parallel22(t *testing.T) { testGaloisAvx512Parallelx2(t, 2) }
func TestGaloisAvx512Parallel32(t *testing.T) { testGaloisAvx512Parallelx2(t, 3) }
func TestGaloisAvx512Parallel42(t *testing.T) { testGaloisAvx512Parallelx2(t, 4) }
func TestGaloisAvx512Parallel52(t *testing.T) { testGaloisAvx512Parallelx2(t, 5) }
func TestGaloisAvx512Parallel62(t *testing.T) { testGaloisAvx512Parallelx2(t, 6) }
func TestGaloisAvx512Parallel72(t *testing.T) { testGaloisAvx512Parallelx2(t, 7) }
func TestGaloisAvx512Parallel82(t *testing.T) { testGaloisAvx512Parallelx2(t, 8) }
func testGaloisAvx512Parallelx4(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
rand.Seed(time.Now().UnixNano())
var size = 1 << 20
if testing.Short() {
size = 4096
}
in, out := make([][]byte, inputSize), make([][]byte, dimOut84)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut84]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel84(in, out, &matrix, false)
expect := make([][]byte, 4)
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{})
galMulSlice(coeffs[dimIn*2+i], in[i], expect[2], &options{})
galMulSlice(coeffs[dimIn*3+i], in[i], expect[3], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{})
galMulSliceXor(coeffs[dimIn*2+i], in[i], expect[2], &options{})
galMulSliceXor(coeffs[dimIn*3+i], in[i], expect[3], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel84(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{})
galMulSliceXor(coeffs[dimIn*2+i], inToAdd[i], expect[2], &options{})
galMulSliceXor(coeffs[dimIn*3+i], inToAdd[i], expect[3], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel14(t *testing.T) { testGaloisAvx512Parallelx4(t, 1) }
func TestGaloisAvx512Parallel24(t *testing.T) { testGaloisAvx512Parallelx4(t, 2) }
func TestGaloisAvx512Parallel34(t *testing.T) { testGaloisAvx512Parallelx4(t, 3) }
func TestGaloisAvx512Parallel44(t *testing.T) { testGaloisAvx512Parallelx4(t, 4) }
func TestGaloisAvx512Parallel54(t *testing.T) { testGaloisAvx512Parallelx4(t, 5) }
func TestGaloisAvx512Parallel64(t *testing.T) { testGaloisAvx512Parallelx4(t, 6) }
func TestGaloisAvx512Parallel74(t *testing.T) { testGaloisAvx512Parallelx4(t, 7) }
func TestGaloisAvx512Parallel84(t *testing.T) { testGaloisAvx512Parallelx4(t, 8) }
func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bool) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
var data = make([]byte, l)
fillRandom(data)
enc, _ := New(ds, ps)
r := enc.(*reedSolomon) // need to access private methods
shards, _ := enc.Split(data)
// Fill shards to encode with garbage
for i := r.DataShards; i < r.DataShards+r.ParityShards; i++ {
rand.Read(shards[i])
}
if parallel {
avx2: Improve speed when > 10 input or output shards. (#174) Speeds are including a limiting the number of goroutines with all AVX2 paths, Before/after ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2240 2240 +0.00% BenchmarkGalois1M-32 19578 18891 -3.51% BenchmarkGaloisXor128K-32 2798 2852 +1.93% BenchmarkGaloisXor1M-32 23334 23345 +0.05% BenchmarkEncode2x1x1M-32 34357 34370 +0.04% BenchmarkEncode10x2x10000-32 3210 3093 -3.64% BenchmarkEncode100x20x10000-32 362925 148214 -59.16% BenchmarkEncode17x3x1M-32 323767 224157 -30.77% BenchmarkEncode10x4x16M-32 8376895 8376737 -0.00% BenchmarkEncode5x2x1M-32 68365 66861 -2.20% BenchmarkEncode10x2x1M-32 101407 93023 -8.27% BenchmarkEncode10x4x1M-32 171880 155477 -9.54% BenchmarkEncode50x20x1M-32 3704691 3015047 -18.62% BenchmarkEncode17x3x16M-32 10279233 10106658 -1.68% BenchmarkEncode_8x4x8M-32 3438245 3326479 -3.25% BenchmarkEncode_12x4x12M-32 6632257 6581637 -0.76% BenchmarkEncode_16x4x16M-32 10815755 10788377 -0.25% BenchmarkEncode_16x4x32M-32 21029061 21507995 +2.28% BenchmarkEncode_16x4x64M-32 42145450 43876850 +4.11% BenchmarkEncode_8x5x8M-32 4543208 3846378 -15.34% BenchmarkEncode_8x6x8M-32 5065494 4397218 -13.19% BenchmarkEncode_8x7x8M-32 5818995 4962884 -14.71% BenchmarkEncode_8x9x8M-32 6215449 6114898 -1.62% BenchmarkEncode_8x10x8M-32 6923415 6610501 -4.52% BenchmarkEncode_8x11x8M-32 7365988 7010473 -4.83% BenchmarkEncode_8x8x05M-32 150857 136820 -9.30% BenchmarkEncode_8x8x1M-32 256722 254854 -0.73% BenchmarkEncode_8x8x8M-32 5547790 5422048 -2.27% BenchmarkEncode_8x8x32M-32 23038643 22705859 -1.44% BenchmarkEncode_24x8x24M-32 27729259 30332216 +9.39% BenchmarkEncode_24x8x48M-32 53865705 61187658 +13.59% BenchmarkVerify10x2x10000-32 8769 8154 -7.01% BenchmarkVerify10x2x1M-32 516149 476180 -7.74% BenchmarkVerify5x2x1M-32 443888 419541 -5.48% BenchmarkVerify10x4x1M-32 1030299 948021 -7.99% BenchmarkVerify50x20x1M-32 7209689 6186891 -14.19% BenchmarkVerify10x4x16M-32 17774456 17681879 -0.52% BenchmarkReconstruct10x2x10000-32 3352 3256 -2.86% BenchmarkReconstruct50x5x50000-32 166417 140900 -15.33% BenchmarkReconstruct10x2x1M-32 189711 174615 -7.96% BenchmarkReconstruct5x2x1M-32 128080 126520 -1.22% BenchmarkReconstruct10x4x1M-32 273312 254017 -7.06% BenchmarkReconstruct50x20x1M-32 3628812 3192474 -12.02% BenchmarkReconstruct10x4x16M-32 8562186 8781479 +2.56% BenchmarkReconstructData10x2x10000-32 3241 3116 -3.86% BenchmarkReconstructData50x5x50000-32 162520 134794 -17.06% BenchmarkReconstructData10x2x1M-32 171253 161955 -5.43% BenchmarkReconstructData5x2x1M-32 102215 106942 +4.62% BenchmarkReconstructData10x4x1M-32 225593 219969 -2.49% BenchmarkReconstructData50x20x1M-32 2515311 2129721 -15.33% BenchmarkReconstructData10x4x16M-32 6980308 6698111 -4.04% BenchmarkReconstructP10x2x10000-32 924 937 +1.35% BenchmarkReconstructP10x5x20000-32 1639 1703 +3.90% BenchmarkSplit10x4x160M-32 4984993 4898045 -1.74% BenchmarkSplit5x2x5M-32 380415 221446 -41.79% BenchmarkSplit10x2x1M-32 58761 53335 -9.23% BenchmarkSplit10x4x10M-32 643188 410959 -36.11% BenchmarkSplit50x20x50M-32 1843879 1647205 -10.67% BenchmarkSplit17x3x272M-32 3684920 3613951 -1.93% BenchmarkParallel_8x8x64K-32 7022 6630 -5.58% BenchmarkParallel_8x8x05M-32 348308 348369 +0.02% BenchmarkParallel_20x10x05M-32 575672 581028 +0.93% BenchmarkParallel_8x8x1M-32 716033 697167 -2.63% BenchmarkParallel_8x8x8M-32 5716048 5616437 -1.74% BenchmarkParallel_8x8x32M-32 22650878 22098667 -2.44% BenchmarkParallel_8x3x1M-32 406839 399125 -1.90% BenchmarkParallel_8x4x1M-32 459107 463890 +1.04% BenchmarkParallel_8x5x1M-32 527488 520334 -1.36% BenchmarkStreamEncode10x2x10000-32 6013 5878 -2.25% BenchmarkStreamEncode100x20x10000-32 503124 267894 -46.75% BenchmarkStreamEncode17x3x1M-32 1561838 1376618 -11.86% BenchmarkStreamEncode10x4x16M-32 19124427 17762582 -7.12% BenchmarkStreamEncode5x2x1M-32 429701 384666 -10.48% BenchmarkStreamEncode10x2x1M-32 801257 763637 -4.70% BenchmarkStreamEncode10x4x1M-32 876065 820744 -6.31% BenchmarkStreamEncode50x20x1M-32 7205112 6081398 -15.60% BenchmarkStreamEncode17x3x16M-32 27182786 26117143 -3.92% BenchmarkStreamVerify10x2x10000-32 13767 14026 +1.88% BenchmarkStreamVerify50x5x50000-32 826983 690453 -16.51% BenchmarkStreamVerify10x2x1M-32 1238566 1182591 -4.52% BenchmarkStreamVerify5x2x1M-32 892661 806301 -9.67% BenchmarkStreamVerify10x4x1M-32 1676394 1631495 -2.68% BenchmarkStreamVerify50x20x1M-32 10877875 10037678 -7.72% BenchmarkStreamVerify10x4x16M-32 27599576 30435400 +10.27% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 58518.53 58510.17 1.00x BenchmarkGalois1M-32 53558.10 55507.44 1.04x BenchmarkGaloisXor128K-32 46839.74 45961.09 0.98x BenchmarkGaloisXor1M-32 44936.98 44917.46 1.00x BenchmarkEncode2x1x1M-32 91561.27 91524.11 1.00x BenchmarkEncode10x2x10000-32 37385.54 38792.54 1.04x BenchmarkEncode100x20x10000-32 3306.47 8096.40 2.45x BenchmarkEncode17x3x1M-32 64773.49 93557.14 1.44x BenchmarkEncode10x4x16M-32 28039.15 28039.68 1.00x BenchmarkEncode5x2x1M-32 107365.88 109781.16 1.02x BenchmarkEncode10x2x1M-32 124083.62 135266.27 1.09x BenchmarkEncode10x4x1M-32 85408.99 94419.71 1.11x BenchmarkEncode50x20x1M-32 19812.81 24344.67 1.23x BenchmarkEncode17x3x16M-32 32642.93 33200.32 1.02x BenchmarkEncode_8x4x8M-32 29277.52 30261.21 1.03x BenchmarkEncode_12x4x12M-32 30355.67 30589.14 1.01x BenchmarkEncode_16x4x16M-32 31023.66 31102.39 1.00x BenchmarkEncode_16x4x32M-32 31912.44 31201.82 0.98x BenchmarkEncode_16x4x64M-32 31846.32 30589.65 0.96x BenchmarkEncode_8x5x8M-32 24003.28 28351.84 1.18x BenchmarkEncode_8x6x8M-32 23184.41 26707.91 1.15x BenchmarkEncode_8x7x8M-32 21623.86 25354.03 1.17x BenchmarkEncode_8x9x8M-32 22943.85 23321.13 1.02x BenchmarkEncode_8x10x8M-32 21809.31 22841.68 1.05x BenchmarkEncode_8x11x8M-32 21637.77 22735.06 1.05x BenchmarkEncode_8x8x05M-32 55606.22 61311.47 1.10x BenchmarkEncode_8x8x1M-32 65351.80 65830.73 1.01x BenchmarkEncode_8x8x8M-32 24193.01 24754.07 1.02x BenchmarkEncode_8x8x32M-32 23303.06 23644.60 1.01x BenchmarkEncode_24x8x24M-32 29041.76 26549.54 0.91x BenchmarkEncode_24x8x48M-32 29900.52 26322.51 0.88x BenchmarkVerify10x2x10000-32 13685.12 14717.10 1.08x BenchmarkVerify10x2x1M-32 24378.43 26424.72 1.08x BenchmarkVerify5x2x1M-32 16535.79 17495.41 1.06x BenchmarkVerify10x4x1M-32 14248.35 15484.96 1.09x BenchmarkVerify50x20x1M-32 10180.79 11863.85 1.17x BenchmarkVerify10x4x16M-32 13214.53 13283.71 1.01x BenchmarkReconstruct10x2x10000-32 35799.16 36854.89 1.03x BenchmarkReconstruct50x5x50000-32 33049.47 39034.89 1.18x BenchmarkReconstruct10x2x1M-32 66326.88 72061.06 1.09x BenchmarkReconstruct5x2x1M-32 57308.21 58014.92 1.01x BenchmarkReconstruct10x4x1M-32 53711.74 57791.66 1.08x BenchmarkReconstruct50x20x1M-32 20227.09 22991.67 1.14x BenchmarkReconstruct10x4x16M-32 27432.37 26747.32 0.98x BenchmarkReconstructData10x2x10000-32 37030.86 38511.87 1.04x BenchmarkReconstructData50x5x50000-32 33842.07 40802.85 1.21x BenchmarkReconstructData10x2x1M-32 73475.57 77693.87 1.06x BenchmarkReconstructData5x2x1M-32 71809.58 68635.57 0.96x BenchmarkReconstructData10x4x1M-32 65073.27 66736.88 1.03x BenchmarkReconstructData50x20x1M-32 29181.41 34464.76 1.18x BenchmarkReconstructData10x4x16M-32 33649.09 35066.75 1.04x BenchmarkReconstructP10x2x10000-32 129819.98 128086.76 0.99x BenchmarkReconstructP10x5x20000-32 183073.89 176202.21 0.96x BenchmarkParallel_8x8x64K-32 149327.33 158153.67 1.06x BenchmarkParallel_8x8x05M-32 24083.89 24079.69 1.00x BenchmarkParallel_20x10x05M-32 27322.20 27070.35 0.99x BenchmarkParallel_8x8x1M-32 23430.78 24064.83 1.03x BenchmarkParallel_8x8x8M-32 23480.86 23897.31 1.02x BenchmarkParallel_8x8x32M-32 23701.99 24294.27 1.02x BenchmarkParallel_8x3x1M-32 28351.11 28899.03 1.02x BenchmarkParallel_8x4x1M-32 27407.34 27124.76 0.99x BenchmarkParallel_8x5x1M-32 25842.27 26197.58 1.01x BenchmarkStreamEncode10x2x10000-32 16629.76 17012.26 1.02x BenchmarkStreamEncode100x20x10000-32 1987.58 3732.83 1.88x BenchmarkStreamEncode17x3x1M-32 11413.34 12948.97 1.13x BenchmarkStreamEncode10x4x16M-32 8772.66 9445.26 1.08x BenchmarkStreamEncode5x2x1M-32 12201.21 13629.70 1.12x BenchmarkStreamEncode10x2x1M-32 13086.64 13731.34 1.05x BenchmarkStreamEncode10x4x1M-32 11969.16 12775.92 1.07x BenchmarkStreamEncode50x20x1M-32 7276.61 8621.18 1.18x BenchmarkStreamEncode17x3x16M-32 10492.40 10920.52 1.04x BenchmarkStreamVerify10x2x10000-32 7264.00 7129.49 0.98x BenchmarkStreamVerify50x5x50000-32 6046.07 7241.62 1.20x BenchmarkStreamVerify10x2x1M-32 8466.05 8866.77 1.05x BenchmarkStreamVerify5x2x1M-32 5873.31 6502.39 1.11x BenchmarkStreamVerify10x4x1M-32 6254.95 6427.09 1.03x BenchmarkStreamVerify50x20x1M-32 4819.76 5223.20 1.08x BenchmarkStreamVerify10x4x16M-32 6078.79 5512.40 0.91x ```
2021-12-09 14:28:44 +03:00
r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], len(shards[0]))
} else {
avx2: Improve speed when > 10 input or output shards. (#174) Speeds are including a limiting the number of goroutines with all AVX2 paths, Before/after ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2240 2240 +0.00% BenchmarkGalois1M-32 19578 18891 -3.51% BenchmarkGaloisXor128K-32 2798 2852 +1.93% BenchmarkGaloisXor1M-32 23334 23345 +0.05% BenchmarkEncode2x1x1M-32 34357 34370 +0.04% BenchmarkEncode10x2x10000-32 3210 3093 -3.64% BenchmarkEncode100x20x10000-32 362925 148214 -59.16% BenchmarkEncode17x3x1M-32 323767 224157 -30.77% BenchmarkEncode10x4x16M-32 8376895 8376737 -0.00% BenchmarkEncode5x2x1M-32 68365 66861 -2.20% BenchmarkEncode10x2x1M-32 101407 93023 -8.27% BenchmarkEncode10x4x1M-32 171880 155477 -9.54% BenchmarkEncode50x20x1M-32 3704691 3015047 -18.62% BenchmarkEncode17x3x16M-32 10279233 10106658 -1.68% BenchmarkEncode_8x4x8M-32 3438245 3326479 -3.25% BenchmarkEncode_12x4x12M-32 6632257 6581637 -0.76% BenchmarkEncode_16x4x16M-32 10815755 10788377 -0.25% BenchmarkEncode_16x4x32M-32 21029061 21507995 +2.28% BenchmarkEncode_16x4x64M-32 42145450 43876850 +4.11% BenchmarkEncode_8x5x8M-32 4543208 3846378 -15.34% BenchmarkEncode_8x6x8M-32 5065494 4397218 -13.19% BenchmarkEncode_8x7x8M-32 5818995 4962884 -14.71% BenchmarkEncode_8x9x8M-32 6215449 6114898 -1.62% BenchmarkEncode_8x10x8M-32 6923415 6610501 -4.52% BenchmarkEncode_8x11x8M-32 7365988 7010473 -4.83% BenchmarkEncode_8x8x05M-32 150857 136820 -9.30% BenchmarkEncode_8x8x1M-32 256722 254854 -0.73% BenchmarkEncode_8x8x8M-32 5547790 5422048 -2.27% BenchmarkEncode_8x8x32M-32 23038643 22705859 -1.44% BenchmarkEncode_24x8x24M-32 27729259 30332216 +9.39% BenchmarkEncode_24x8x48M-32 53865705 61187658 +13.59% BenchmarkVerify10x2x10000-32 8769 8154 -7.01% BenchmarkVerify10x2x1M-32 516149 476180 -7.74% BenchmarkVerify5x2x1M-32 443888 419541 -5.48% BenchmarkVerify10x4x1M-32 1030299 948021 -7.99% BenchmarkVerify50x20x1M-32 7209689 6186891 -14.19% BenchmarkVerify10x4x16M-32 17774456 17681879 -0.52% BenchmarkReconstruct10x2x10000-32 3352 3256 -2.86% BenchmarkReconstruct50x5x50000-32 166417 140900 -15.33% BenchmarkReconstruct10x2x1M-32 189711 174615 -7.96% BenchmarkReconstruct5x2x1M-32 128080 126520 -1.22% BenchmarkReconstruct10x4x1M-32 273312 254017 -7.06% BenchmarkReconstruct50x20x1M-32 3628812 3192474 -12.02% BenchmarkReconstruct10x4x16M-32 8562186 8781479 +2.56% BenchmarkReconstructData10x2x10000-32 3241 3116 -3.86% BenchmarkReconstructData50x5x50000-32 162520 134794 -17.06% BenchmarkReconstructData10x2x1M-32 171253 161955 -5.43% BenchmarkReconstructData5x2x1M-32 102215 106942 +4.62% BenchmarkReconstructData10x4x1M-32 225593 219969 -2.49% BenchmarkReconstructData50x20x1M-32 2515311 2129721 -15.33% BenchmarkReconstructData10x4x16M-32 6980308 6698111 -4.04% BenchmarkReconstructP10x2x10000-32 924 937 +1.35% BenchmarkReconstructP10x5x20000-32 1639 1703 +3.90% BenchmarkSplit10x4x160M-32 4984993 4898045 -1.74% BenchmarkSplit5x2x5M-32 380415 221446 -41.79% BenchmarkSplit10x2x1M-32 58761 53335 -9.23% BenchmarkSplit10x4x10M-32 643188 410959 -36.11% BenchmarkSplit50x20x50M-32 1843879 1647205 -10.67% BenchmarkSplit17x3x272M-32 3684920 3613951 -1.93% BenchmarkParallel_8x8x64K-32 7022 6630 -5.58% BenchmarkParallel_8x8x05M-32 348308 348369 +0.02% BenchmarkParallel_20x10x05M-32 575672 581028 +0.93% BenchmarkParallel_8x8x1M-32 716033 697167 -2.63% BenchmarkParallel_8x8x8M-32 5716048 5616437 -1.74% BenchmarkParallel_8x8x32M-32 22650878 22098667 -2.44% BenchmarkParallel_8x3x1M-32 406839 399125 -1.90% BenchmarkParallel_8x4x1M-32 459107 463890 +1.04% BenchmarkParallel_8x5x1M-32 527488 520334 -1.36% BenchmarkStreamEncode10x2x10000-32 6013 5878 -2.25% BenchmarkStreamEncode100x20x10000-32 503124 267894 -46.75% BenchmarkStreamEncode17x3x1M-32 1561838 1376618 -11.86% BenchmarkStreamEncode10x4x16M-32 19124427 17762582 -7.12% BenchmarkStreamEncode5x2x1M-32 429701 384666 -10.48% BenchmarkStreamEncode10x2x1M-32 801257 763637 -4.70% BenchmarkStreamEncode10x4x1M-32 876065 820744 -6.31% BenchmarkStreamEncode50x20x1M-32 7205112 6081398 -15.60% BenchmarkStreamEncode17x3x16M-32 27182786 26117143 -3.92% BenchmarkStreamVerify10x2x10000-32 13767 14026 +1.88% BenchmarkStreamVerify50x5x50000-32 826983 690453 -16.51% BenchmarkStreamVerify10x2x1M-32 1238566 1182591 -4.52% BenchmarkStreamVerify5x2x1M-32 892661 806301 -9.67% BenchmarkStreamVerify10x4x1M-32 1676394 1631495 -2.68% BenchmarkStreamVerify50x20x1M-32 10877875 10037678 -7.72% BenchmarkStreamVerify10x4x16M-32 27599576 30435400 +10.27% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 58518.53 58510.17 1.00x BenchmarkGalois1M-32 53558.10 55507.44 1.04x BenchmarkGaloisXor128K-32 46839.74 45961.09 0.98x BenchmarkGaloisXor1M-32 44936.98 44917.46 1.00x BenchmarkEncode2x1x1M-32 91561.27 91524.11 1.00x BenchmarkEncode10x2x10000-32 37385.54 38792.54 1.04x BenchmarkEncode100x20x10000-32 3306.47 8096.40 2.45x BenchmarkEncode17x3x1M-32 64773.49 93557.14 1.44x BenchmarkEncode10x4x16M-32 28039.15 28039.68 1.00x BenchmarkEncode5x2x1M-32 107365.88 109781.16 1.02x BenchmarkEncode10x2x1M-32 124083.62 135266.27 1.09x BenchmarkEncode10x4x1M-32 85408.99 94419.71 1.11x BenchmarkEncode50x20x1M-32 19812.81 24344.67 1.23x BenchmarkEncode17x3x16M-32 32642.93 33200.32 1.02x BenchmarkEncode_8x4x8M-32 29277.52 30261.21 1.03x BenchmarkEncode_12x4x12M-32 30355.67 30589.14 1.01x BenchmarkEncode_16x4x16M-32 31023.66 31102.39 1.00x BenchmarkEncode_16x4x32M-32 31912.44 31201.82 0.98x BenchmarkEncode_16x4x64M-32 31846.32 30589.65 0.96x BenchmarkEncode_8x5x8M-32 24003.28 28351.84 1.18x BenchmarkEncode_8x6x8M-32 23184.41 26707.91 1.15x BenchmarkEncode_8x7x8M-32 21623.86 25354.03 1.17x BenchmarkEncode_8x9x8M-32 22943.85 23321.13 1.02x BenchmarkEncode_8x10x8M-32 21809.31 22841.68 1.05x BenchmarkEncode_8x11x8M-32 21637.77 22735.06 1.05x BenchmarkEncode_8x8x05M-32 55606.22 61311.47 1.10x BenchmarkEncode_8x8x1M-32 65351.80 65830.73 1.01x BenchmarkEncode_8x8x8M-32 24193.01 24754.07 1.02x BenchmarkEncode_8x8x32M-32 23303.06 23644.60 1.01x BenchmarkEncode_24x8x24M-32 29041.76 26549.54 0.91x BenchmarkEncode_24x8x48M-32 29900.52 26322.51 0.88x BenchmarkVerify10x2x10000-32 13685.12 14717.10 1.08x BenchmarkVerify10x2x1M-32 24378.43 26424.72 1.08x BenchmarkVerify5x2x1M-32 16535.79 17495.41 1.06x BenchmarkVerify10x4x1M-32 14248.35 15484.96 1.09x BenchmarkVerify50x20x1M-32 10180.79 11863.85 1.17x BenchmarkVerify10x4x16M-32 13214.53 13283.71 1.01x BenchmarkReconstruct10x2x10000-32 35799.16 36854.89 1.03x BenchmarkReconstruct50x5x50000-32 33049.47 39034.89 1.18x BenchmarkReconstruct10x2x1M-32 66326.88 72061.06 1.09x BenchmarkReconstruct5x2x1M-32 57308.21 58014.92 1.01x BenchmarkReconstruct10x4x1M-32 53711.74 57791.66 1.08x BenchmarkReconstruct50x20x1M-32 20227.09 22991.67 1.14x BenchmarkReconstruct10x4x16M-32 27432.37 26747.32 0.98x BenchmarkReconstructData10x2x10000-32 37030.86 38511.87 1.04x BenchmarkReconstructData50x5x50000-32 33842.07 40802.85 1.21x BenchmarkReconstructData10x2x1M-32 73475.57 77693.87 1.06x BenchmarkReconstructData5x2x1M-32 71809.58 68635.57 0.96x BenchmarkReconstructData10x4x1M-32 65073.27 66736.88 1.03x BenchmarkReconstructData50x20x1M-32 29181.41 34464.76 1.18x BenchmarkReconstructData10x4x16M-32 33649.09 35066.75 1.04x BenchmarkReconstructP10x2x10000-32 129819.98 128086.76 0.99x BenchmarkReconstructP10x5x20000-32 183073.89 176202.21 0.96x BenchmarkParallel_8x8x64K-32 149327.33 158153.67 1.06x BenchmarkParallel_8x8x05M-32 24083.89 24079.69 1.00x BenchmarkParallel_20x10x05M-32 27322.20 27070.35 0.99x BenchmarkParallel_8x8x1M-32 23430.78 24064.83 1.03x BenchmarkParallel_8x8x8M-32 23480.86 23897.31 1.02x BenchmarkParallel_8x8x32M-32 23701.99 24294.27 1.02x BenchmarkParallel_8x3x1M-32 28351.11 28899.03 1.02x BenchmarkParallel_8x4x1M-32 27407.34 27124.76 0.99x BenchmarkParallel_8x5x1M-32 25842.27 26197.58 1.01x BenchmarkStreamEncode10x2x10000-32 16629.76 17012.26 1.02x BenchmarkStreamEncode100x20x10000-32 1987.58 3732.83 1.88x BenchmarkStreamEncode17x3x1M-32 11413.34 12948.97 1.13x BenchmarkStreamEncode10x4x16M-32 8772.66 9445.26 1.08x BenchmarkStreamEncode5x2x1M-32 12201.21 13629.70 1.12x BenchmarkStreamEncode10x2x1M-32 13086.64 13731.34 1.05x BenchmarkStreamEncode10x4x1M-32 11969.16 12775.92 1.07x BenchmarkStreamEncode50x20x1M-32 7276.61 8621.18 1.18x BenchmarkStreamEncode17x3x16M-32 10492.40 10920.52 1.04x BenchmarkStreamVerify10x2x10000-32 7264.00 7129.49 0.98x BenchmarkStreamVerify50x5x50000-32 6046.07 7241.62 1.20x BenchmarkStreamVerify10x2x1M-32 8466.05 8866.77 1.05x BenchmarkStreamVerify5x2x1M-32 5873.31 6502.39 1.11x BenchmarkStreamVerify10x4x1M-32 6254.95 6427.09 1.03x BenchmarkStreamVerify50x20x1M-32 4819.76 5223.20 1.08x BenchmarkStreamVerify10x4x16M-32 6078.79 5512.40 0.91x ```
2021-12-09 14:28:44 +03:00
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0]))
}
correct, _ := r.Verify(shards)
if !correct {
t.Errorf("Verification of encoded shards failed")
}
}
func testCodeSomeShardsAvx512(t *testing.T, ds, ps int) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
step := 1
if testing.Short() {
// A prime for variation
step += 29
}
for l := 1; l <= 8192; l += step {
testCodeSomeShardsAvx512WithLength(t, ds, ps, l, false)
testCodeSomeShardsAvx512WithLength(t, ds, ps, l, true)
}
}
func TestCodeSomeShardsAvx512_8x2(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 2) }
func TestCodeSomeShardsAvx512_1x4(t *testing.T) { testCodeSomeShardsAvx512(t, 1, 4) }
func TestCodeSomeShardsAvx512_2x4(t *testing.T) { testCodeSomeShardsAvx512(t, 2, 4) }
func TestCodeSomeShardsAvx512_3x4(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 4) }
func TestCodeSomeShardsAvx512_4x4(t *testing.T) { testCodeSomeShardsAvx512(t, 4, 4) }
func TestCodeSomeShardsAvx512_5x4(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 4) }
func TestCodeSomeShardsAvx512_6x4(t *testing.T) { testCodeSomeShardsAvx512(t, 6, 4) }
func TestCodeSomeShardsAvx512_7x4(t *testing.T) { testCodeSomeShardsAvx512(t, 7, 4) }
func TestCodeSomeShardsAvx512_8x4(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 4) }
func TestCodeSomeShardsAvx512_9x4(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 4) }
func TestCodeSomeShardsAvx512_10x4(t *testing.T) { testCodeSomeShardsAvx512(t, 10, 4) }
func TestCodeSomeShardsAvx512_12x4(t *testing.T) { testCodeSomeShardsAvx512(t, 12, 4) }
func TestCodeSomeShardsAvx512_16x4(t *testing.T) { testCodeSomeShardsAvx512(t, 16, 4) }
func TestCodeSomeShardsAvx512_3x6(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 6) }
func TestCodeSomeShardsAvx512_8x6(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 6) }
func TestCodeSomeShardsAvx512_8x7(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 7) }
func TestCodeSomeShardsAvx512_3x8(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 8) }
func TestCodeSomeShardsAvx512_8x8(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 8) }
func TestCodeSomeShardsAvx512_5x10(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 10) }
func TestCodeSomeShardsAvx512_8x10(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 10) }
func TestCodeSomeShardsAvx512_9x10(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 10) }
func TestCodeSomeShardsAvx512_Manyx4(t *testing.T) {
if !defaultOptions.useAVX512 {
return
}
step := 1
if testing.Short() {
step += 7
}
for inputs := 1; inputs <= 200; inputs += step {
testCodeSomeShardsAvx512WithLength(t, inputs, 4, 1024+33, false)
testCodeSomeShardsAvx512WithLength(t, inputs, 4, 1024+33, true)
}
}
func TestCodeSomeShardsAvx512_ManyxMany(t *testing.T) {
if !defaultOptions.useAVX512 {
return
}
step := 1
if testing.Short() {
step += 5
}
for outputs := 1; outputs <= 32; outputs += step {
for inputs := 1; inputs <= 32; inputs += step {
testCodeSomeShardsAvx512WithLength(t, inputs, outputs, 1024+33, false)
testCodeSomeShardsAvx512WithLength(t, inputs, outputs, 1024+33, true)
}
}
}