Generate AVX2 code (#141)
Replaces AVX2 up to 10x8 configurations with specific generated functions.

If code size is a concern, `-tags=nogen` can be used.

Biggest speedup when not memory constrained.

```
benchmark                        old MB/s     new MB/s     speedup
BenchmarkEncode_8x5x8M           5895.75      9648.18      1.64x
BenchmarkEncode_8x5x8M-4         16773.41     17220.67     1.03x
BenchmarkEncode_8x5x8M-16        18263.12     17176.28     0.94x
BenchmarkEncode_8x6x8M           5075.89      8548.39      1.68x
BenchmarkEncode_8x6x8M-4         14559.83     15370.95     1.06x
BenchmarkEncode_8x6x8M-16        16183.37     15291.98     0.94x
BenchmarkEncode_8x7x8M           4481.18      7015.60      1.57x
BenchmarkEncode_8x7x8M-4         12835.35     13695.90     1.07x
BenchmarkEncode_8x7x8M-16        14246.94     13737.36     0.96x
BenchmarkEncode_8x8x05M          5569.95      7947.70      1.43x
BenchmarkEncode_8x8x05M-4        17334.91     25271.37     1.46x
BenchmarkEncode_8x8x05M-16       29349.42     35043.36     1.19x
BenchmarkEncode_8x8x1M           4830.58      7891.32      1.63x
BenchmarkEncode_8x8x1M-4         17531.36     27371.42     1.56x
BenchmarkEncode_8x8x1M-16        29593.98     39241.09     1.33x
BenchmarkEncode_8x8x8M           3953.66      6584.26      1.67x
BenchmarkEncode_8x8x8M-4         11527.34     12331.23     1.07x
BenchmarkEncode_8x8x8M-16        12718.89     12173.08     0.96x
BenchmarkEncode_8x8x32M          3927.51      6195.91      1.58x
BenchmarkEncode_8x8x32M-4        11490.85     11424.39     0.99x
BenchmarkEncode_8x8x32M-16       12506.09     11888.55     0.95x

benchmark                        old MB/s     new MB/s     speedup
BenchmarkParallel_8x8x64K        5490.24      6959.57      1.27x
BenchmarkParallel_8x8x64K-4      21078.94     29557.51     1.40x
BenchmarkParallel_8x8x64K-16     57508.45     73672.54     1.28x
BenchmarkParallel_8x8x1M         4755.49      7667.84      1.61x
BenchmarkParallel_8x8x1M-4       11818.66     12013.49     1.02x
BenchmarkParallel_8x8x1M-16      12923.12     12109.42     0.94x
BenchmarkParallel_8x8x8M         3973.94      6525.85      1.64x
BenchmarkParallel_8x8x8M-4       11725.68     11312.46     0.96x
BenchmarkParallel_8x8x8M-16      12608.20     11484.98     0.91x
BenchmarkParallel_8x3x1M         14139.71     17993.04     1.27x
BenchmarkParallel_8x3x1M-4       21805.97     23053.92     1.06x
BenchmarkParallel_8x3x1M-16      24673.05     23596.71     0.96x
BenchmarkParallel_8x4x1M         10617.88     14474.54     1.36x
BenchmarkParallel_8x4x1M-4       18635.82     18965.65     1.02x
BenchmarkParallel_8x4x1M-16      21518.12     20171.47     0.94x
BenchmarkParallel_8x5x1M         8669.88      11833.96     1.36x
BenchmarkParallel_8x5x1M-4       16321.00     17500.30     1.07x
BenchmarkParallel_8x5x1M-16      17267.16     17191.04     1.00x
```

master
parent
01b307ec91
commit
7daa20bf74
27
galois.go
27
galois.go
|
@ -900,3 +900,30 @@ func galExp(a byte, n int) byte {
|
|||
}
|
||||
return expTable[logResult]
|
||||
}
|
||||
|
||||
func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte {
|
||||
if !avx2CodeGen {
|
||||
panic("codegen not enabled")
|
||||
}
|
||||
total := inputs * outputs
|
||||
|
||||
// Duplicated in+out
|
||||
wantBytes := total * 32 * 2
|
||||
if cap(dst) < wantBytes {
|
||||
dst = make([]byte, wantBytes)
|
||||
} else {
|
||||
dst = dst[:wantBytes]
|
||||
}
|
||||
for i, row := range matrixRows[:outputs] {
|
||||
for j, idx := range row[:inputs] {
|
||||
dstIdx := (j*outputs + i) * 64
|
||||
lo := mulTableLow[idx][:]
|
||||
hi := mulTableHigh[idx][:]
|
||||
copy(dst[dstIdx:], lo)
|
||||
copy(dst[dstIdx+16:], lo)
|
||||
copy(dst[dstIdx+32:], hi)
|
||||
copy(dst[dstIdx+48:], hi)
|
||||
}
|
||||
}
|
||||
return dst
|
||||
}
|
||||
|
|
|
@ -7,7 +7,9 @@
|
|||
|
||||
package reedsolomon
|
||||
|
||||
import "sync"
|
||||
import (
|
||||
"sync"
|
||||
)
|
||||
|
||||
//go:noescape
|
||||
func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
|
||||
|
@ -224,7 +226,7 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp
|
|||
|
||||
// Perform the same as codeSomeShards, but taking advantage of
|
||||
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
||||
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
// Process using no goroutines
|
||||
start, end := 0, r.o.perRound
|
||||
if end > byteCount {
|
||||
|
@ -271,7 +273,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
|||
|
||||
// Perform the same as codeSomeShards, but taking advantage of
|
||||
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
|
||||
func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
var wg sync.WaitGroup
|
||||
do := byteCount / r.o.maxGoroutines
|
||||
if do < r.o.minSplitSize {
|
||||
|
|
|
@ -0,0 +1,408 @@
|
|||
// Code generated by command: go run gen.go -out galois_gen_amd64.s -stubs galois_gen_amd64.go. DO NOT EDIT.
|
||||
|
||||
// +build !appengine
|
||||
// +build !noasm
|
||||
// +build !nogen
|
||||
// +build gc
|
||||
|
||||
package reedsolomon
|
||||
|
||||
// mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
||||
|
||||
// mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs.
|
||||
// The output is initialized to 0.
|
||||
//go:noescape
|
||||
func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int)
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,11 @@
|
|||
//+build !amd64 noasm appengine gccgo nogen
|
||||
|
||||
package reedsolomon
|
||||
|
||||
const maxAvx2Inputs = 0
|
||||
const maxAvx2Outputs = 0
|
||||
const avx2CodeGen = false
|
||||
|
||||
// galMulSlicesAvx2 is the placeholder used by builds that exclude the
// generated AVX2 kernels. It never returns: any call panics, so callers are
// expected to check avx2CodeGen before dispatching here.
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
	const reason = "avx2 codegen not available"
	panic(reason)
}
|
|
@ -0,0 +1,293 @@
|
|||
// Code generated by command: go generate gen.go. DO NOT EDIT.
|
||||
|
||||
// +build !appengine
|
||||
// +build !noasm
|
||||
// +build gc
|
||||
// +build !nogen
|
||||
|
||||
package reedsolomon
|
||||
|
||||
import "fmt"
|
||||
|
||||
const avx2CodeGen = true
|
||||
const maxAvx2Inputs = 10
|
||||
const maxAvx2Outputs = 8
|
||||
|
||||
// galMulSlicesAvx2 dispatches to the generated kernel mulAvxTwo_<I>x<O>
// matching I = len(in) (1..10) and O = len(out) (1..8).
//
// The byte count stop-start is rounded down to a multiple of 32 before being
// handed to the kernel together with start. The rounded count n is returned;
// presumably the caller is responsible for the remaining stop-start-n bytes —
// verify against the call site.
// NOTE(review): matrix is passed through untouched; it is presumably the
// table produced by genAvx2Matrix — confirm.
//
// It panics if no kernel exists for the given input/output combination.
//
// This function is emitted by gen.go; lasting changes belong in the generator.
func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
	n := stop - start
	// Round down to a whole number of 32-byte blocks.
	n = (n >> 5) << 5

	switch len(in) {
	case 1:
		switch len(out) {
		case 1:
			mulAvxTwo_1x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_1x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_1x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_1x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_1x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_1x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_1x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_1x8(matrix, in, out, start, n)
			return n
		}
	case 2:
		switch len(out) {
		case 1:
			mulAvxTwo_2x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_2x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_2x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_2x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_2x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_2x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_2x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_2x8(matrix, in, out, start, n)
			return n
		}
	case 3:
		switch len(out) {
		case 1:
			mulAvxTwo_3x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_3x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_3x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_3x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_3x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_3x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_3x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_3x8(matrix, in, out, start, n)
			return n
		}
	case 4:
		switch len(out) {
		case 1:
			mulAvxTwo_4x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_4x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_4x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_4x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_4x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_4x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_4x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_4x8(matrix, in, out, start, n)
			return n
		}
	case 5:
		switch len(out) {
		case 1:
			mulAvxTwo_5x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_5x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_5x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_5x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_5x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_5x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_5x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_5x8(matrix, in, out, start, n)
			return n
		}
	case 6:
		switch len(out) {
		case 1:
			mulAvxTwo_6x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_6x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_6x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_6x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_6x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_6x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_6x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_6x8(matrix, in, out, start, n)
			return n
		}
	case 7:
		switch len(out) {
		case 1:
			mulAvxTwo_7x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_7x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_7x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_7x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_7x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_7x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_7x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_7x8(matrix, in, out, start, n)
			return n
		}
	case 8:
		switch len(out) {
		case 1:
			mulAvxTwo_8x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_8x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_8x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_8x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_8x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_8x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_8x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_8x8(matrix, in, out, start, n)
			return n
		}
	case 9:
		switch len(out) {
		case 1:
			mulAvxTwo_9x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_9x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_9x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_9x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_9x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_9x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_9x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_9x8(matrix, in, out, start, n)
			return n
		}
	case 10:
		switch len(out) {
		case 1:
			mulAvxTwo_10x1(matrix, in, out, start, n)
			return n
		case 2:
			mulAvxTwo_10x2(matrix, in, out, start, n)
			return n
		case 3:
			mulAvxTwo_10x3(matrix, in, out, start, n)
			return n
		case 4:
			mulAvxTwo_10x4(matrix, in, out, start, n)
			return n
		case 5:
			mulAvxTwo_10x5(matrix, in, out, start, n)
			return n
		case 6:
			mulAvxTwo_10x6(matrix, in, out, start, n)
			return n
		case 7:
			mulAvxTwo_10x7(matrix, in, out, start, n)
			return n
		case 8:
			mulAvxTwo_10x8(matrix, in, out, start, n)
			return n
		}
	}
	// No generated kernel covers this shape.
	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
|
|
@ -4,10 +4,10 @@
|
|||
|
||||
package reedsolomon
|
||||
|
||||
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
panic("codeSomeShardsAvx512 should not be called if built without asm")
|
||||
}
|
||||
|
||||
func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
panic("codeSomeShardsAvx512P should not be called if built without asm")
|
||||
}
|
||||
|
|
|
@ -0,0 +1,249 @@
|
|||
//+build generate
|
||||
|
||||
//go:generate go run gen.go -out galois_gen_amd64.s -stubs galois_gen_amd64.go
|
||||
//go:generate gofmt -w galois_gen_switch_amd64.go
|
||||
|
||||
package main
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"fmt"
|
||||
"os"
|
||||
|
||||
. "github.com/mmcloughlin/avo/build"
|
||||
"github.com/mmcloughlin/avo/buildtags"
|
||||
. "github.com/mmcloughlin/avo/operand"
|
||||
"github.com/mmcloughlin/avo/reg"
|
||||
)
|
||||
|
||||
// Technically we can do slightly bigger, but we stay reasonable.
|
||||
const inputMax = 10
|
||||
const outputMax = 8
|
||||
|
||||
var switchDefs [inputMax][outputMax]string
|
||||
var switchDefsX [inputMax][outputMax]string
|
||||
|
||||
const perLoopBits = 5
|
||||
const perLoop = 1 << perLoopBits
|
||||
|
||||
// main drives code generation for the AVX2 multiplication kernels.
//
// It performs three steps:
//  1. registers the build constraints for the generated assembly/stub files,
//  2. calls genMulAvx2 for every inputs x outputs combination up to
//     inputMax x outputMax,
//  3. writes galois_gen_switch_amd64.go, the Go dispatcher that selects a
//     kernel by len(in) and len(out), then hands over to avo's Generate.
//
// The switch file is flushed and closed explicitly before Generate is called:
// Generate may terminate the process on failure, in which case deferred
// Flush/Close calls would never run and the file would be left truncated.
// Any error while creating or writing the file aborts generation with a panic.
func main() {
	Constraint(buildtags.Not("appengine").ToConstraint())
	Constraint(buildtags.Not("noasm").ToConstraint())
	Constraint(buildtags.Not("nogen").ToConstraint())
	Constraint(buildtags.Term("gc").ToConstraint())

	for i := 1; i <= inputMax; i++ {
		for j := 1; j <= outputMax; j++ {
			//genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true)
			genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false)
		}
	}

	f, err := os.Create("galois_gen_switch_amd64.go")
	if err != nil {
		panic(err)
	}
	// bufio.Writer remembers the first write error and reports it from every
	// later call, so checking the result of Flush below covers all writes.
	w := bufio.NewWriter(f)

	// GOFILE is set by `go generate`; it is empty when run via plain `go run`.
	w.WriteString(`// Code generated by command: go generate ` + os.Getenv("GOFILE") + `. DO NOT EDIT.

// +build !appengine
// +build !noasm
// +build gc
// +build !nogen

package reedsolomon

import "fmt"

`)

	w.WriteString("const avx2CodeGen = true\n")
	fmt.Fprintf(w, "const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax)
	w.WriteString(`

func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int {
	n := stop-start
`)

	// Round the length down to a whole number of loop iterations.
	fmt.Fprintf(w, "n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits)
	w.WriteString(`switch len(in) {
`)
	// switchDefs holds one case body per (inputs, outputs) pair; presumably
	// filled in by genMulAvx2 above. Indices are zero-based, cases one-based.
	for in, defs := range switchDefs[:] {
		fmt.Fprintf(w, " case %d:\n switch len(out) {\n", in+1)
		for out, def := range defs[:] {
			fmt.Fprintf(w, " case %d:\n", out+1)
			w.WriteString(def)
		}
		w.WriteString("}\n")
	}
	w.WriteString(`}
	panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out)))
}
`)

	if err := w.Flush(); err != nil {
		f.Close()
		panic(err)
	}
	if err := f.Close(); err != nil {
		panic(err)
	}

	Generate()
}
|
||||
|
||||
func genMulAvx2(name string, inputs int, outputs int, xor bool) {
|
||||
total := inputs * outputs
|
||||
|
||||
doc := []string{
|
||||
fmt.Sprintf("%s takes %d inputs and produces %d outputs.", name, inputs, outputs),
|
||||
}
|
||||
if !xor {
|
||||
doc = append(doc, "The output is initialized to 0.")
|
||||
}
|
||||
|
||||
// Load shuffle masks on every use.
|
||||
var loadNone bool
|
||||
// Use registers for destination registers.
|
||||
var regDst = true
|
||||
|
||||
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
|
||||
est := total*2 + outputs + 5
|
||||
if outputs == 1 {
|
||||
// We don't need to keep a copy of the input if only 1 output.
|
||||
est -= 2
|
||||
}
|
||||
|
||||
if est > 16 {
|
||||
loadNone = true
|
||||
// We run out of GP registers first, now.
|
||||
if inputs+outputs > 12 {
|
||||
regDst = false
|
||||
}
|
||||
}
|
||||
|
||||
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
|
||||
|
||||
// SWITCH DEFINITION:
|
||||
s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs)
|
||||
s += fmt.Sprintf("\t\t\t\treturn n\n")
|
||||
switchDefs[inputs-1][outputs-1] = s
|
||||
|
||||
if loadNone {
|
||||
Comment("Loading no tables to registers")
|
||||
} else {
|
||||
// loadNone == false
|
||||
Comment("Loading all tables to registers")
|
||||
}
|
||||
|
||||
Doc(doc...)
|
||||
Pragma("noescape")
|
||||
Commentf("Full registers estimated %d YMM used", est)
|
||||
|
||||
length := Load(Param("n"), GP64())
|
||||
matrixBase := GP64()
|
||||
MOVQ(Param("matrix").Base().MustAddr(), matrixBase)
|
||||
SHRQ(U8(perLoopBits), length)
|
||||
TESTQ(length, length)
|
||||
JZ(LabelRef(name + "_end"))
|
||||
|
||||
dst := make([]reg.VecVirtual, outputs)
|
||||
dstPtr := make([]reg.GPVirtual, outputs)
|
||||
outBase := Param("out").Base().MustAddr()
|
||||
outSlicePtr := GP64()
|
||||
MOVQ(outBase, outSlicePtr)
|
||||
for i := range dst {
|
||||
dst[i] = YMM()
|
||||
if !regDst {
|
||||
continue
|
||||
}
|
||||
ptr := GP64()
|
||||
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
|
||||
dstPtr[i] = ptr
|
||||
}
|
||||
|
||||
inLo := make([]reg.VecVirtual, total)
|
||||
inHi := make([]reg.VecVirtual, total)
|
||||
|
||||
for i := range inLo {
|
||||
if loadNone {
|
||||
break
|
||||
}
|
||||
tableLo := YMM()
|
||||
tableHi := YMM()
|
||||
VMOVDQU(Mem{Base: matrixBase, Disp: i * 64}, tableLo)
|
||||
VMOVDQU(Mem{Base: matrixBase, Disp: i*64 + 32}, tableHi)
|
||||
inLo[i] = tableLo
|
||||
inHi[i] = tableHi
|
||||
}
|
||||
|
||||
inPtrs := make([]reg.GPVirtual, inputs)
|
||||
inSlicePtr := GP64()
|
||||
MOVQ(Param("in").Base().MustAddr(), inSlicePtr)
|
||||
for i := range inPtrs {
|
||||
ptr := GP64()
|
||||
MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr)
|
||||
inPtrs[i] = ptr
|
||||
}
|
||||
|
||||
tmpMask := GP64()
|
||||
MOVQ(U32(15), tmpMask)
|
||||
lowMask := YMM()
|
||||
MOVQ(tmpMask, lowMask.AsX())
|
||||
VPBROADCASTB(lowMask.AsX(), lowMask)
|
||||
|
||||
offset := GP64()
|
||||
MOVQ(Param("start").MustAddr(), offset)
|
||||
Label(name + "_loop")
|
||||
if xor {
|
||||
Commentf("Load %d outputs", outputs)
|
||||
} else {
|
||||
Commentf("Clear %d outputs", outputs)
|
||||
}
|
||||
for i := range dst {
|
||||
if xor {
|
||||
if regDst {
|
||||
VMOVDQU(Mem{Base: dstPtr[i], Index: offset, Scale: 1}, dst[i])
|
||||
continue
|
||||
}
|
||||
ptr := GP64()
|
||||
MOVQ(outBase, ptr)
|
||||
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
|
||||
} else {
|
||||
VPXOR(dst[i], dst[i], dst[i])
|
||||
}
|
||||
}
|
||||
|
||||
lookLow, lookHigh := YMM(), YMM()
|
||||
inLow, inHigh := YMM(), YMM()
|
||||
for i := range inPtrs {
|
||||
Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs)
|
||||
VMOVDQU(Mem{Base: inPtrs[i], Index: offset, Scale: 1}, inLow)
|
||||
VPSRLQ(U8(4), inLow, inHigh)
|
||||
VPAND(lowMask, inLow, inLow)
|
||||
VPAND(lowMask, inHigh, inHigh)
|
||||
for j := range dst {
|
||||
if loadNone {
|
||||
VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow)
|
||||
VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh)
|
||||
VPSHUFB(inLow, lookLow, lookLow)
|
||||
VPSHUFB(inHigh, lookHigh, lookHigh)
|
||||
} else {
|
||||
VPSHUFB(inLow, inLo[i*outputs+j], lookLow)
|
||||
VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh)
|
||||
}
|
||||
VPXOR(lookLow, lookHigh, lookLow)
|
||||
VPXOR(lookLow, dst[j], dst[j])
|
||||
}
|
||||
}
|
||||
Commentf("Store %d outputs", outputs)
|
||||
for i := range dst {
|
||||
if regDst {
|
||||
VMOVDQU(dst[i], Mem{Base: dstPtr[i], Index: offset, Scale: 1})
|
||||
continue
|
||||
}
|
||||
ptr := GP64()
|
||||
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
|
||||
VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1})
|
||||
}
|
||||
Comment("Prepare for next loop")
|
||||
ADDQ(U8(perLoop), offset)
|
||||
DECQ(length)
|
||||
JNZ(LabelRef(name + "_loop"))
|
||||
VZEROUPPER()
|
||||
|
||||
Label(name + "_end")
|
||||
RET()
|
||||
}
|
4
go.mod
4
go.mod
|
@ -2,4 +2,6 @@ module github.com/klauspost/reedsolomon
|
|||
|
||||
go 1.14
|
||||
|
||||
require github.com/klauspost/cpuid v1.2.4
|
||||
require (
|
||||
github.com/klauspost/cpuid v1.2.4
|
||||
)
|
||||
|
|
2
go.sum
2
go.sum
|
@ -1,4 +1,2 @@
|
|||
github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
|
||||
github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
||||
github.com/klauspost/cpuid v1.2.4 h1:EBfaK0SWSwk+fgk6efYFWdzl8MwRWoOO1gkmiaTXPW4=
|
||||
github.com/klauspost/cpuid v1.2.4/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
|
||||
|
|
|
@ -113,6 +113,7 @@ type reedSolomon struct {
|
|||
tree inversionTree
|
||||
parity [][]byte
|
||||
o options
|
||||
mPool sync.Pool
|
||||
}
|
||||
|
||||
// ErrInvShardNum will be returned by New, if you attempt to create
|
||||
|
@ -339,6 +340,11 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
|
|||
r.parity[i] = r.m[dataShards+i]
|
||||
}
|
||||
|
||||
if avx2CodeGen && r.o.useAVX2 {
|
||||
r.mPool.New = func() interface{} {
|
||||
return make([]byte, r.Shards*2*32)
|
||||
}
|
||||
}
|
||||
return &r, err
|
||||
}
|
||||
|
||||
|
@ -353,7 +359,7 @@ var ErrTooFewShards = errors.New("too few shards given")
|
|||
// Each shard is a byte array, and they must all be the same size.
|
||||
// The parity shards will always be overwritten and the data shards
|
||||
// will remain the same.
|
||||
func (r reedSolomon) Encode(shards [][]byte) error {
|
||||
func (r *reedSolomon) Encode(shards [][]byte) error {
|
||||
if len(shards) != r.Shards {
|
||||
return ErrTooFewShards
|
||||
}
|
||||
|
@ -374,7 +380,7 @@ func (r reedSolomon) Encode(shards [][]byte) error {
|
|||
// ErrInvalidInput is returned if invalid input parameter of Update.
|
||||
var ErrInvalidInput = errors.New("invalid input")
|
||||
|
||||
func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
|
||||
func (r *reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
|
||||
if len(shards) != r.Shards {
|
||||
return ErrTooFewShards
|
||||
}
|
||||
|
@ -414,7 +420,7 @@ func (r reedSolomon) Update(shards [][]byte, newDatashards [][]byte) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
|
||||
r.updateParityShardsP(matrixRows, oldinputs, newinputs, outputs, outputCount, byteCount)
|
||||
return
|
||||
|
@ -434,7 +440,7 @@ func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, output
|
|||
}
|
||||
}
|
||||
|
||||
func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
var wg sync.WaitGroup
|
||||
do := byteCount / r.o.maxGoroutines
|
||||
if do < r.o.minSplitSize {
|
||||
|
@ -468,7 +474,7 @@ func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outpu
|
|||
|
||||
// Verify returns true if the parity shards contain the right data.
|
||||
// The data is the same format as Encode. No data is modified.
|
||||
func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
|
||||
func (r *reedSolomon) Verify(shards [][]byte) (bool, error) {
|
||||
if len(shards) != r.Shards {
|
||||
return false, ErrTooFewShards
|
||||
}
|
||||
|
@ -493,7 +499,10 @@ func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
|
|||
// The number of outputs computed, and the
|
||||
// number of matrix rows used, is determined by
|
||||
// outputCount, which is the number of outputs to compute.
|
||||
func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
if len(outputs) == 0 {
|
||||
return
|
||||
}
|
||||
switch {
|
||||
case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2:
|
||||
r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount)
|
||||
|
@ -511,6 +520,13 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
|
|||
if end > len(inputs[0]) {
|
||||
end = len(inputs[0])
|
||||
}
|
||||
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
|
||||
m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
|
||||
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
|
||||
r.mPool.Put(m)
|
||||
end = len(inputs[0])
|
||||
}
|
||||
|
||||
for start < len(inputs[0]) {
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c][start:end]
|
||||
|
@ -532,7 +548,7 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
|
|||
|
||||
// Perform the same as codeSomeShards, but split the workload into
|
||||
// several goroutines.
|
||||
func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
|
||||
var wg sync.WaitGroup
|
||||
do := byteCount / r.o.maxGoroutines
|
||||
if do < r.o.minSplitSize {
|
||||
|
@ -541,6 +557,11 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
// Make sizes divisible by 64
|
||||
do = (do + 63) & (^63)
|
||||
start := 0
|
||||
var avx2Matrix []byte
|
||||
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
|
||||
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
|
||||
defer r.mPool.Put(avx2Matrix)
|
||||
}
|
||||
for start < byteCount {
|
||||
if start+do > byteCount {
|
||||
do = byteCount - start
|
||||
|
@ -548,6 +569,10 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
|
||||
wg.Add(1)
|
||||
go func(start, stop int) {
|
||||
if avx2CodeGen && r.o.useAVX2 && stop-start >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
|
||||
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
|
||||
}
|
||||
|
||||
lstart, lstop := start, start+r.o.perRound
|
||||
if lstop > stop {
|
||||
lstop = stop
|
||||
|
@ -579,7 +604,7 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
// checkSomeShards is mostly the same as codeSomeShards,
|
||||
// except this will check values and return
|
||||
// as soon as a difference is found.
|
||||
func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
|
||||
return r.checkSomeShardsP(matrixRows, inputs, toCheck, outputCount, byteCount)
|
||||
}
|
||||
|
@ -602,7 +627,7 @@ func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outpu
|
|||
return true
|
||||
}
|
||||
|
||||
func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
func (r *reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool {
|
||||
same := true
|
||||
var mu sync.RWMutex // For above
|
||||
|
||||
|
@ -706,7 +731,7 @@ func shardSize(shards [][]byte) int {
|
|||
//
|
||||
// The reconstructed shard set is complete, but integrity is not verified.
|
||||
// Use the Verify function to check if data set is ok.
|
||||
func (r reedSolomon) Reconstruct(shards [][]byte) error {
|
||||
func (r *reedSolomon) Reconstruct(shards [][]byte) error {
|
||||
return r.reconstruct(shards, false)
|
||||
}
|
||||
|
||||
|
@ -725,7 +750,7 @@ func (r reedSolomon) Reconstruct(shards [][]byte) error {
|
|||
//
|
||||
// As the reconstructed shard set may contain missing parity shards,
|
||||
// calling the Verify function is likely to fail.
|
||||
func (r reedSolomon) ReconstructData(shards [][]byte) error {
|
||||
func (r *reedSolomon) ReconstructData(shards [][]byte) error {
|
||||
return r.reconstruct(shards, true)
|
||||
}
|
||||
|
||||
|
@ -737,7 +762,7 @@ func (r reedSolomon) ReconstructData(shards [][]byte) error {
|
|||
//
|
||||
// If there are too few shards to reconstruct the missing
|
||||
// ones, ErrTooFewShards will be returned.
|
||||
func (r reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
|
||||
func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error {
|
||||
if len(shards) != r.Shards {
|
||||
return ErrTooFewShards
|
||||
}
|
||||
|
@ -896,7 +921,7 @@ var ErrShortData = errors.New("not enough data to fill the number of requested s
|
|||
//
|
||||
// The data will not be copied, except for the last shard, so you
|
||||
// should not modify the data of the input slice afterwards.
|
||||
func (r reedSolomon) Split(data []byte) ([][]byte, error) {
|
||||
func (r *reedSolomon) Split(data []byte) ([][]byte, error) {
|
||||
if len(data) == 0 {
|
||||
return nil, ErrShortData
|
||||
}
|
||||
|
@ -945,7 +970,7 @@ var ErrReconstructRequired = errors.New("reconstruction required as one or more
|
|||
// If there are too few shards given, ErrTooFewShards will be returned.
|
||||
// If the total data size is less than outSize, ErrShortData will be returned.
|
||||
// If one or more required data shards are nil, ErrReconstructRequired will be returned.
|
||||
func (r reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
|
||||
func (r *reedSolomon) Join(dst io.Writer, shards [][]byte, outSize int) error {
|
||||
// Do we have enough shards?
|
||||
if len(shards) < r.DataShards {
|
||||
return ErrTooFewShards
|
||||
|
|
|
@ -180,7 +180,7 @@ func TestEncoding(t *testing.T) {
|
|||
|
||||
// matrix sizes to test.
|
||||
// note that par1 matrix will fail on some combinations.
|
||||
var testSizes = [][2]int{{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {14, 7}, {41, 17}, {49, 1}}
|
||||
var testSizes = [][2]int{{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}}
|
||||
var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
|
||||
var testDataSizesShort = []int{10, 10001, 100003}
|
||||
|
||||
|
@ -1546,6 +1546,7 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) {
|
|||
})
|
||||
}
|
||||
|
||||
func BenchmarkParallel_8x8x64K(b *testing.B) { benchmarkParallel(b, 8, 8, 64<<10) }
|
||||
func BenchmarkParallel_8x8x05M(b *testing.B) { benchmarkParallel(b, 8, 8, 512<<10) }
|
||||
func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 512<<10) }
|
||||
func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
|
||||
|
|
Loading…
Reference in New Issue