Avx512 parallel81 (#131)

* AVX512 routine for 8x1 parallel processing (WIP)

* Testing and integration of Parallel81 assembly routine
master
Frank Wessels 2020-05-06 03:32:31 -07:00 committed by GitHub
parent cb7a0b5aef
commit 1b9e129671
4 changed files with 272 additions and 38 deletions

View File

@@ -9,6 +9,9 @@ package reedsolomon
import "sync"
//go:noescape
func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
//go:noescape
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
@@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo
const (
dimIn = 8 // Number of input rows processed simultaneously
dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine
dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficients passed into x1 routine
matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficients passed into x2 routine
matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficients passed into x4 routine
)
// Construct block of matrix coefficients for 2 outputs rows in parallel
// Construct block of matrix coefficients for single output row in parallel
func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
offset := 0
for c := inputOffset; c < inputOffset+dimIn; c++ {
for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ {
if c < len(matrixRows[iRow]) {
coeff := matrixRows[iRow][c]
copy(matrix[offset*32:], mulTableLow[coeff][:])
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
} else {
// coefficients not used for this input shard (so null out)
v := matrix[offset*32 : offset*32+32]
for i := range v {
v[i] = 0
}
}
offset += dimIn
if offset >= dimIn*dimOut81 {
offset -= dimIn*dimOut81 - 1
}
}
}
}
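
For reference, a minimal standalone sketch (not part of this commit) of the byte layout that setupMatrix81 produces: with dimIn = 8 and dimOut81 = 1 the offset counter simply walks 0 through 7, so the low- and high-nibble multiplication tables for input row r land at bytes r*32 and r*32+16 of the 256-byte block.

package main

import "fmt"

func main() {
	const dimIn, dimOut81 = 8, 1
	const matrixSize81 = (16 + 16) * dimIn * dimOut81 // 256 bytes for the 8x1 kernel

	// Assumed layout, derived from setupMatrix81 above: one 16-byte low-nibble
	// table followed by one 16-byte high-nibble table per input row.
	for r := 0; r < dimIn; r++ {
		fmt.Printf("input row %d: low table at byte %3d, high table at byte %3d\n", r, r*32, r*32+16)
	}
	fmt.Println("total block size:", matrixSize81, "bytes")
}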
// Construct block of matrix coefficients for 2 output rows in parallel
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
offset := 0
for c := inputOffset; c < inputOffset+dimIn; c++ {
@@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
}
}
// Construct block of matrix coefficients for 4 outputs rows in parallel
// Construct block of matrix coefficients for 4 output rows in parallel
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
offset := 0
for c := inputOffset; c < inputOffset+dimIn; c++ {
@@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
}
}
// Invoke AVX512 routine for single output row in parallel
func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
done := stop - start
if done <= 0 {
return
}
inputEnd := inputOffset + dimIn
if inputEnd > len(in) {
inputEnd = len(in)
}
outputEnd := outputOffset + dimOut81
if outputEnd > len(out) {
outputEnd = len(out)
}
// We know the max size, alloc temp array.
var inTmp [dimIn][]byte
for i, v := range in[inputOffset:inputEnd] {
inTmp[i] = v[start:stop]
}
var outTmp [dimOut81][]byte
for i, v := range out[outputOffset:outputEnd] {
outTmp[i] = v[start:stop]
}
addTo := inputOffset != 0 // Except for the first input column, add to previous results
_galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo)
done = start + ((done >> 6) << 6)
if done < stop {
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
}
}
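
The assembly kernel only consumes whole 64-byte (ZMM-sized) blocks, so the wrapper above rounds the requested range down to a multiple of 64 and hands any remaining tail bytes to the table-driven galMulAVX512LastInput fallback. A minimal sketch of that rounding (illustration only, not code from this commit):

package main

import "fmt"

func main() {
	start, stop := 0, 1000
	done := stop - start              // bytes requested
	done = start + ((done >> 6) << 6) // index of the first byte NOT covered by the AVX512 kernel
	fmt.Println(done-start, "bytes vectorized,", stop-done, "tail bytes left for the scalar fallback")
}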
// Invoke AVX512 routine for 2 output rows in parallel
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
done := stop - start
@@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset,
_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
done = start + ((done >> 6) << 6)
if done == stop {
return
}
for c := inputOffset; c < inputEnd; c++ {
for iRow := outputOffset; iRow < outputEnd; iRow++ {
if c < len(matrixRows[iRow]) {
mt := mulTable[matrixRows[iRow][c]][:256]
for i := done; i < stop; i++ {
if c == 0 { // only set value for first input column
out[iRow][i] = mt[in[c][i]]
} else { // and add for all others
out[iRow][i] ^= mt[in[c][i]]
}
}
}
}
if done < stop {
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
}
}
@@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset,
addTo := inputOffset != 0 // Except for the first input column, add to previous results
_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
done = (done >> 6) << 6
done += start
if done == stop {
return
done = start + ((done >> 6) << 6)
if done < stop {
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
}
}
func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
for c := inputOffset; c < inputEnd; c++ {
for iRow := outputOffset; iRow < outputEnd; iRow++ {
if c < len(matrixRows[iRow]) {
@@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
for start < byteCount {
matrix84 := [matrixSize84]byte{}
matrix82 := [matrixSize82]byte{}
matrix81 := [matrixSize81]byte{}
outputRow := 0
// First process (multiple) batches of 4 output rows in parallel
@@ -204,12 +255,9 @@
}
// Lastly, we may have a single output row left (for uneven parity)
if outputRow < outputCount {
for c := 0; c < r.DataShards; c++ {
if c == 0 {
galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
} else {
galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
}
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
}
}
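
A rough sketch (assumption, not code from this commit) of the resulting batching: parity rows are consumed in groups of 4 by the 8x4 kernel, then at most one group of 2 by the 8x2 kernel, and a final odd row now goes through the new 8x1 kernel instead of the generic galMulSlice path.

package main

import "fmt"

func main() {
	const dimOut84, dimOut82 = 4, 2
	for outputCount := 1; outputCount <= 7; outputCount++ {
		outputRow, plan := 0, []string(nil)
		for outputRow+dimOut84 <= outputCount {
			plan = append(plan, "8x4")
			outputRow += dimOut84
		}
		if outputRow+dimOut82 <= outputCount {
			plan = append(plan, "8x2")
			outputRow += dimOut82
		}
		if outputRow < outputCount {
			plan = append(plan, "8x1") // handled by the new Parallel81 routine
		}
		fmt.Println(outputCount, "parity rows ->", plan)
	}
}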
@@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
// Loop for each round.
matrix84 := [matrixSize84]byte{}
matrix82 := [matrixSize82]byte{}
matrix81 := [matrixSize81]byte{}
for start < grStop {
outputRow := 0
// First process (multiple) batches of 4 output rows in parallel
@@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
}
// Lastly, we may have a single output row left (for uneven parity)
if outputRow < outputCount {
for c := 0; c < r.DataShards; c++ {
in := inputs[c][start:stop]
for iRow := 0; iRow < outputCount; iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
}
}
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
}
}
start = stop

View File

@@ -18,6 +18,97 @@
VPTERNLOGD $0x96, LO, HI, OUT
//
// Process single output row from a total of 8 input rows
//
// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
TEXT ·_galMulAVX512Parallel81(SB), 7, $0
MOVQ in+0(FP), SI //
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel81
MOVQ matrix+48(FP), SI
VMOVDQU64 0x000(SI), Z16
VMOVDQU64 0x040(SI), Z17
VMOVDQU64 0x080(SI), Z18
VMOVDQU64 0x0c0(SI), Z19
MOVQ $15, BX
VPBROADCASTB BX, Z2
MOVB addTo+56(FP), AX
IMULQ $-0x1, AX
KMOVQ AX, K1
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel81:
VMOVDQU64.Z (DX), K1, Z4
LOAD(0x00) // &in[0][0]
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
CMPQ AX, $1
JE skip_avx512_parallel81
LOAD(0x18) // &in[1][0]
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
CMPQ AX, $2
JE skip_avx512_parallel81
LOAD(0x30) // &in[2][0]
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
CMPQ AX, $3
JE skip_avx512_parallel81
LOAD(0x48) // &in[3][0]
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
CMPQ AX, $4
JE skip_avx512_parallel81
LOAD(0x60) // &in[4][0]
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
CMPQ AX, $5
JE skip_avx512_parallel81
LOAD(0x78) // &in[5][0]
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
CMPQ AX, $6
JE skip_avx512_parallel81
LOAD(0x90) // &in[6][0]
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
CMPQ AX, $7
JE skip_avx512_parallel81
LOAD(0xa8) // &in[7][0]
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
skip_avx512_parallel81:
VMOVDQU64 Z4, (DX)
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel81
done_avx512_parallel81:
VZEROUPPER
RET
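
The addTo handling above works by multiplying the boolean argument (0 or 1) by -1, which effectively yields an all-zero or all-one bit pattern that is moved into mask register K1; the zero-masked load VMOVDQU64.Z then either re-reads the existing output block or starts from zero before the Galois products are XORed in. A scalar sketch of the same semantics (illustration only, not code from this commit):

package main

import "fmt"

func main() {
	addTo := true
	out := []byte{0x11, 0x22, 0x33, 0x44}
	products := []byte{0xa0, 0xb0, 0xc0, 0xd0} // stand-ins for the GF(2^8) partial products

	for i := range out {
		acc := byte(0)
		if addTo { // all-ones mask: keep the previous output contents
			acc = out[i]
		}
		out[i] = acc ^ products[i] // XOR-accumulate, as the VPTERNLOGD step does per 64-byte block
	}
	fmt.Printf("%#x\n", out)
}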
//
// Process 2 output rows in parallel from a total of 8 input rows
//

View File

@@ -14,6 +14,102 @@ import (
"time"
)
func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
t.Skip("AVX512 not detected")
}
rand.Seed(time.Now().UnixNano())
var size = 1024 * 1024
if testing.Short() {
size = 4096
}
in, out := make([][]byte, inputSize), make([][]byte, dimOut81)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut81]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel81(in, out, &matrix, false)
expect := make([][]byte, len(out))
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
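
These new tests cover the 8x1 kernel for 1 through 8 input shards and check it against the generic galMulSlice/galMulSliceXor reference, both with and without the addTo accumulation; they skip automatically when AVX512 is not detected. One way to run just this group, using standard go test flags (shown only as an example):

	go test -short -v -run TestGaloisAvx512Parallel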
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {

View File

@@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51
func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) }
func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) }
func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) }
func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) }
func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }
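
The added benchmarks use 8 data shards with 3, 4 and 5 parity shards, so the parity counts that are not multiples of 4 exercise the 8x2 and 8x1 tail kernels on top of the 8x4 one. They can be run on their own with standard go test flags, for example:

	go test -run '^$' -bench 'BenchmarkParallel_8x[345]x1M' -benchmem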