Avx512 parallel81 (#131)
* AVX512 routine for 8x1 parallel processing (WIP)
* Testing and integration of Parallel81 assembly routine
master
parent
cb7a0b5aef
commit
1b9e129671
|
@ -9,6 +9,9 @@ package reedsolomon
|
|||
|
||||
import "sync"
|
||||
|
||||
// _galMulAVX512Parallel81 is declared without a body; the matching TEXT symbol
// is the AVX512 assembly routine that combines up to 8 input rows into a single
// output row. matrix holds the prepared lookup tables, and addTo selects whether
// the existing contents of out are accumulated into (true) or overwritten (false).
//go:noescape
|
||||
func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
|
||||
|
||||
// _galMulAVX512Parallel82 is the bodiless declaration of the 8-input, 2-output
// routine. Its implementation is outside this view; presumably it follows the
// same in/out/matrix/addTo contract as the x1 routine (TODO confirm against
// the assembly source).
//go:noescape
|
||||
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
|
||||
|
||||
|
@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo
|
|||
|
||||
// Tile dimensions and lookup-table sizes used by the AVX512 x1/x2/x4 routines.
const (
|
||||
dimIn = 8 // Number of input rows processed simultaneously
|
||||
dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine
|
||||
dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
|
||||
dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
|
||||
// Each (input, output) coefficient occupies a 32-byte slot (16 + 16):
// setupMatrix81 places a mulTableLow entry at the start of the slot and a
// mulTableHigh entry 16 bytes in.
matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine
|
||||
matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
|
||||
matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
|
||||
)
|
||||
|
||||
// Construct block of matrix coefficients for 2 outputs rows in parallel
|
||||
// setupMatrix81 constructs the block of matrix coefficients for a single
// output row processed in parallel over dimIn input columns.
//
// For every input column c in [inputOffset, inputOffset+dimIn) and the output
// row at outputOffset, the coefficient matrixRows[row][c] selects one entry of
// mulTableLow and one of mulTableHigh; these are copied into a 32-byte slot of
// matrix (low table at the slot start, high table 16 bytes in). Columns that
// lie beyond the end of the matrix row get an all-zero slot.
|
||||
func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
|
||||
// offset is the index of the 32-byte slot being written.
offset := 0
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
coeff := matrixRows[iRow][c]
|
||||
copy(matrix[offset*32:], mulTableLow[coeff][:])
|
||||
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
|
||||
} else {
|
||||
// coefficients not used for this input shard (so null out)
|
||||
v := matrix[offset*32 : offset*32+32]
|
||||
for i := range v {
|
||||
v[i] = 0
|
||||
}
|
||||
}
|
||||
// Step by dimIn per output row and wrap back to the next input column.
// With dimOut81 == 1 the wrap always triggers, so the net effect is
// offset++ and slots are laid out consecutively by input column.
offset += dimIn
|
||||
if offset >= dimIn*dimOut81 {
|
||||
offset -= dimIn*dimOut81 - 1
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Construct block of matrix coefficients for 2 output rows in parallel
|
||||
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
|
||||
offset := 0
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
|
@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
|
|||
}
|
||||
}
|
||||
|
||||
// Construct block of matrix coefficients for 4 outputs rows in parallel
|
||||
// Construct block of matrix coefficients for 4 output rows in parallel
|
||||
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
|
||||
offset := 0
|
||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||
|
@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
|
|||
}
|
||||
}
|
||||
|
||||
// galMulAVX512Parallel81 invokes the AVX512 routine for a single output row
// in parallel over up to dimIn input rows.
//
// It processes bytes [start, stop) of the inputs in[inputOffset:inputOffset+dimIn]
// (clamped to len(in)) into the output out[outputOffset] (clamped to len(out)).
// matrix81 must already hold the lookup tables for this input/output tile.
// The assembly routine only handles whole 64-byte blocks; any remaining tail
// bytes are passed to galMulAVX512LastInput together with matrixRows.
|
||||
func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
|
||||
// done first holds the byte count of the range; nothing to do when empty.
done := stop - start
|
||||
if done <= 0 {
|
||||
return
|
||||
}
|
||||
|
||||
// Clamp the tile to the rows that actually exist.
inputEnd := inputOffset + dimIn
|
||||
if inputEnd > len(in) {
|
||||
inputEnd = len(in)
|
||||
}
|
||||
outputEnd := outputOffset + dimOut81
|
||||
if outputEnd > len(out) {
|
||||
outputEnd = len(out)
|
||||
}
|
||||
|
||||
// We know the max size, alloc temp array.
|
||||
var inTmp [dimIn][]byte
|
||||
for i, v := range in[inputOffset:inputEnd] {
|
||||
inTmp[i] = v[start:stop]
|
||||
}
|
||||
var outTmp [dimOut81][]byte
|
||||
for i, v := range out[outputOffset:outputEnd] {
|
||||
outTmp[i] = v[start:stop]
|
||||
}
|
||||
|
||||
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||
_galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo)
|
||||
|
||||
// Round the processed length down to a multiple of 64 and convert it to an
// absolute position; bytes in [done, stop) still need to be computed.
done = start + ((done >> 6) << 6)
|
||||
if done < stop {
|
||||
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||
}
|
||||
}
|
||||
|
||||
// Invoke AVX512 routine for 2 output rows in parallel
|
||||
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
|
||||
done := stop - start
|
||||
|
@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset,
|
|||
_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
|
||||
|
||||
done = start + ((done >> 6) << 6)
|
||||
if done == stop {
|
||||
return
|
||||
}
|
||||
|
||||
for c := inputOffset; c < inputEnd; c++ {
|
||||
for iRow := outputOffset; iRow < outputEnd; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
mt := mulTable[matrixRows[iRow][c]][:256]
|
||||
for i := done; i < stop; i++ {
|
||||
if c == 0 { // only set value for first input column
|
||||
out[iRow][i] = mt[in[c][i]]
|
||||
} else { // and add for all others
|
||||
out[iRow][i] ^= mt[in[c][i]]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if done < stop {
|
||||
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset,
|
|||
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||
_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
|
||||
|
||||
done = (done >> 6) << 6
|
||||
done += start
|
||||
if done == stop {
|
||||
return
|
||||
done = start + ((done >> 6) << 6)
|
||||
if done < stop {
|
||||
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||
}
|
||||
}
|
||||
|
||||
func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
|
||||
for c := inputOffset; c < inputEnd; c++ {
|
||||
for iRow := outputOffset; iRow < outputEnd; iRow++ {
|
||||
if c < len(matrixRows[iRow]) {
|
||||
|
@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
|||
for start < byteCount {
|
||||
matrix84 := [matrixSize84]byte{}
|
||||
matrix82 := [matrixSize82]byte{}
|
||||
matrix81 := [matrixSize81]byte{}
|
||||
|
||||
outputRow := 0
|
||||
// First process (multiple) batches of 4 output rows in parallel
|
||||
|
@ -204,12 +255,9 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
|||
}
|
||||
// Lastly, we may have a single output row left (for uneven parity)
|
||||
if outputRow < outputCount {
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
if c == 0 {
|
||||
galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
||||
} else {
|
||||
galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
||||
}
|
||||
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
|
||||
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
|
|||
// Loop for each round.
|
||||
matrix84 := [matrixSize84]byte{}
|
||||
matrix82 := [matrixSize82]byte{}
|
||||
matrix81 := [matrixSize81]byte{}
|
||||
for start < grStop {
|
||||
outputRow := 0
|
||||
// First process (multiple) batches of 4 output rows in parallel
|
||||
|
@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
|
|||
}
|
||||
// Lastly, we may have a single output row left (for uneven parity)
|
||||
if outputRow < outputCount {
|
||||
for c := 0; c < r.DataShards; c++ {
|
||||
in := inputs[c][start:stop]
|
||||
for iRow := 0; iRow < outputCount; iRow++ {
|
||||
if c == 0 {
|
||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
||||
} else {
|
||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
||||
}
|
||||
}
|
||||
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
|
||||
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
|
||||
}
|
||||
}
|
||||
start = stop
|
||||
|
|
|
@ -18,6 +18,97 @@
|
|||
VPTERNLOGD $0x96, LO, HI, OUT
|
||||
|
||||
|
||||
//
|
||||
// Process single output row from a total of 8 input rows
// (fewer inputs are allowed: the loop body exits early after len(in) rows).
// Only whole 64-byte blocks are handled; len(in[0])/64 iterations are run and
// any remaining tail bytes are not touched by this routine.
|
||||
//
|
||||
// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
|
||||
TEXT ·_galMulAVX512Parallel81(SB), 7, $0
|
||||
MOVQ in+0(FP), SI // SI: base of in (array of slice headers)
|
||||
MOVQ 8(SI), R9 // R9: len(in[0]) (length word of the first slice header)
|
||||
SHRQ $6, R9 // R9: len(in[0]) / 64 = number of 64-byte blocks
|
||||
TESTQ R9, R9
|
||||
JZ done_avx512_parallel81
|
||||
|
||||
// Load the 4 x 64 bytes of lookup tables; each ZMM register holds the
// 32-byte slots of two consecutive inputs.
MOVQ matrix+48(FP), SI
|
||||
VMOVDQU64 0x000(SI), Z16
|
||||
VMOVDQU64 0x040(SI), Z17
|
||||
VMOVDQU64 0x080(SI), Z18
|
||||
VMOVDQU64 0x0c0(SI), Z19
|
||||
|
||||
// Z2: 0x0f in every byte (presumably the nibble mask used by the LOAD/GALOIS
// macros, which are defined outside this view).
MOVQ $15, BX
|
||||
VPBROADCASTB BX, Z2
|
||||
|
||||
// K1 = -addTo: a cleared mask when addTo is false, so the zero-masked load at
// the top of the loop starts from 0 instead of the previous output.
MOVB addTo+56(FP), AX
|
||||
IMULQ $-0x1, AX
|
||||
KMOVQ AX, K1
|
||||
MOVQ in+0(FP), SI // SI: &in
|
||||
MOVQ in_len+8(FP), AX // number of inputs
|
||||
XORQ R11, R11 // R11: running byte offset, starts at 0
|
||||
MOVQ out+24(FP), DX
|
||||
MOVQ (DX), DX // DX: &out[0][0]
|
||||
|
||||
loopback_avx512_parallel81:
|
||||
VMOVDQU64.Z (DX), K1, Z4 // Z4: previous output block, or zero when K1 is clear
|
||||
|
||||
// For each input k, LOAD is given the offset of in[k]'s slice header
// (k * 0x18) and GALOIS accumulates into Z4. The 0x00/0x55 and 0xaa/0xff
// argument pairs alternate between the two table slots held in one ZMM
// register (NOTE(review): macro bodies are outside this view; confirm
// the selector semantics there).
LOAD(0x00) // &in[0][0]
|
||||
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $1 // stop after the last supplied input
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x18) // &in[1][0]
|
||||
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $2
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x30) // &in[2][0]
|
||||
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $3
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x48) // &in[3][0]
|
||||
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $4
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x60) // &in[4][0]
|
||||
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $5
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x78) // &in[5][0]
|
||||
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $6
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0x90) // &in[6][0]
|
||||
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
|
||||
|
||||
CMPQ AX, $7
|
||||
JE skip_avx512_parallel81
|
||||
|
||||
LOAD(0xa8) // &in[7][0]
|
||||
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
|
||||
|
||||
skip_avx512_parallel81:
|
||||
VMOVDQU64 Z4, (DX) // store the finished 64-byte output block
|
||||
|
||||
ADDQ $64, R11 // advance the byte offset by one block (presumably consumed by LOAD)
|
||||
|
||||
ADDQ $64, DX // out+=64
|
||||
|
||||
SUBQ $1, R9 // one block done; loop while blocks remain
|
||||
JNZ loopback_avx512_parallel81
|
||||
|
||||
done_avx512_parallel81:
|
||||
VZEROUPPER
|
||||
RET
|
||||
|
||||
//
|
||||
// Process 2 output rows in parallel from a total of 8 input rows
|
||||
//
|
||||
|
|
|
@ -14,6 +14,102 @@ import (
|
|||
"time"
|
||||
)
|
||||
|
||||
// testGaloisAvx512Parallelx1 checks _galMulAVX512Parallel81 against the
// galMulSlice/galMulSliceXor reference for inputSize input rows and a single
// output row. It runs the routine twice: once with addTo=false (overwrite) and
// once with addTo=true on fresh inputs and coefficients (accumulate), comparing
// the full output with the reference after each run. The test is skipped when
// AVX512 is not enabled in defaultOptions.
func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
|
||||
|
||||
if !defaultOptions.useAVX512 {
|
||||
t.Skip("AVX512 not detected")
|
||||
}
|
||||
|
||||
rand.Seed(time.Now().UnixNano())
|
||||
|
||||
// 1 MiB per row by default, 4 KiB in -short mode.
var size = 1024 * 1024
|
||||
if testing.Short() {
|
||||
size = 4096
|
||||
}
|
||||
|
||||
in, out := make([][]byte, inputSize), make([][]byte, dimOut81)
|
||||
|
||||
for i := range in {
|
||||
in[i] = make([]byte, size)
|
||||
rand.Read(in[i])
|
||||
}
|
||||
|
||||
// The output starts with random data so that the addTo=false run must
// overwrite it rather than rely on zeroed memory.
for i := range out {
|
||||
out[i] = make([]byte, size)
|
||||
rand.Read(out[i])
|
||||
}
|
||||
|
||||
// NOTE(review): opts is not referenced again in this function; the reference
// computation below passes &options{} instead.
opts := defaultOptions
|
||||
opts.useSSSE3 = true
|
||||
|
||||
matrix := [(16 + 16) * dimIn * dimOut81]byte{}
|
||||
coeffs := make([]byte, dimIn*len(out))
|
||||
|
||||
// Random coefficient per input; slot i of matrix holds the low table at
// i*32 and the high table at i*32+16.
for i := 0; i < dimIn*len(out); i++ {
|
||||
coeffs[i] = byte(rand.Int31n(256))
|
||||
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
|
||||
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
|
||||
}
|
||||
|
||||
// Do first run with clearing out any existing results
|
||||
_galMulAVX512Parallel81(in, out, &matrix, false)
|
||||
|
||||
expect := make([][]byte, len(out))
|
||||
for i := range expect {
|
||||
expect[i] = make([]byte, size)
|
||||
rand.Read(expect[i])
|
||||
}
|
||||
|
||||
// Reference: the first input sets expect[0], the remaining inputs are XORed in.
for i := range in {
|
||||
if i == 0 {
|
||||
galMulSlice(coeffs[i], in[i], expect[0], &options{})
|
||||
} else {
|
||||
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
|
||||
}
|
||||
}
|
||||
|
||||
for i := range out {
|
||||
if 0 != bytes.Compare(out[i], expect[i]) {
|
||||
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
|
||||
}
|
||||
}
|
||||
|
||||
inToAdd := make([][]byte, len(in))
|
||||
|
||||
for i := range inToAdd {
|
||||
inToAdd[i] = make([]byte, size)
|
||||
rand.Read(inToAdd[i])
|
||||
}
|
||||
|
||||
// Fresh random coefficients and tables for the accumulate run.
for i := 0; i < dimIn*len(out); i++ {
|
||||
coeffs[i] = byte(rand.Int31n(256))
|
||||
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
|
||||
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
|
||||
}
|
||||
|
||||
// Do second run by adding to original run
|
||||
_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
|
||||
|
||||
// Reference: every new input is XORed on top of the first run's result.
for i := range in {
|
||||
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
|
||||
}
|
||||
|
||||
for i := range out {
|
||||
if 0 != bytes.Compare(out[i], expect[i]) {
|
||||
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// TestGaloisAvx512Parallel11 .. TestGaloisAvx512Parallel81 run the x1 routine
// test with 1 through 8 input rows, exercising each early-exit point of the
// assembly loop body.
func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
|
||||
func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
|
||||
func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
|
||||
func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
|
||||
func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
|
||||
func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
|
||||
func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
|
||||
func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
|
||||
|
||||
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
|
||||
|
||||
if !defaultOptions.useAVX512 {
|
||||
|
|
|
@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51
|
|||
// Parallel benchmarks with 8x8 shards at growing sizes.
// NOTE(review): benchmarkParallel is defined outside this view; the arguments
// are presumably (data shards, parity shards, shard size in bytes) — confirm.
func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
|
||||
func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) }
|
||||
func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) }
|
||||
|
||||
// Variants that change only the second argument (3, 4, 5); presumably these
// cover parity counts that leave a remainder after the batches of 4 and 2
// output rows (TODO confirm against benchmarkParallel).
func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) }
|
||||
func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) }
|
||||
func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }
|
||||
|
|
Loading…
Reference in New Issue