From 1b9e1296719a823885ff21fa0c46ea5f3555c709 Mon Sep 17 00:00:00 2001 From: Frank Wessels Date: Wed, 6 May 2020 03:32:31 -0700 Subject: [PATCH] Avx512 parallel81 (#131) * AVX512 routine for 8x1 parallel processing (WIP) * Testing and integration of Parallel81 assembly routine --- galoisAvx512_amd64.go | 119 +++++++++++++++++++++++++------------ galoisAvx512_amd64.s | 91 ++++++++++++++++++++++++++++ galoisAvx512_amd64_test.go | 96 ++++++++++++++++++++++++++++++ reedsolomon_test.go | 4 ++ 4 files changed, 272 insertions(+), 38 deletions(-) diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go index 4f9daa6..d7d87b1 100644 --- a/galoisAvx512_amd64.go +++ b/galoisAvx512_amd64.go @@ -9,6 +9,9 @@ package reedsolomon import "sync" +//go:noescape +func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) + //go:noescape func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool) @@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo const ( dimIn = 8 // Number of input rows processed simultaneously + dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine + matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine ) -// Construct block of matrix coefficients for 2 outputs rows in parallel +// Construct block of matrix coefficients for single output row in parallel +func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) { + offset := 0 + for c := inputOffset; c < 
inputOffset+dimIn; c++ { + for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ { + if c < len(matrixRows[iRow]) { + coeff := matrixRows[iRow][c] + copy(matrix[offset*32:], mulTableLow[coeff][:]) + copy(matrix[offset*32+16:], mulTableHigh[coeff][:]) + } else { + // coefficients not used for this input shard (so null out) + v := matrix[offset*32 : offset*32+32] + for i := range v { + v[i] = 0 + } + } + offset += dimIn + if offset >= dimIn*dimOut81 { + offset -= dimIn*dimOut81 - 1 + } + } + } +} + +// Construct block of matrix coefficients for 2 output rows in parallel func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { @@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[ } } -// Construct block of matrix coefficients for 4 outputs rows in parallel +// Construct block of matrix coefficients for 4 output rows in parallel func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) { offset := 0 for c := inputOffset; c < inputOffset+dimIn; c++ { @@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[ } } +// Invoke AVX512 routine for single output row in parallel +func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) { + done := stop - start + if done <= 0 { + return + } + + inputEnd := inputOffset + dimIn + if inputEnd > len(in) { + inputEnd = len(in) + } + outputEnd := outputOffset + dimOut81 + if outputEnd > len(out) { + outputEnd = len(out) + } + + // We know the max size, alloc temp array. 
+ var inTmp [dimIn][]byte + for i, v := range in[inputOffset:inputEnd] { + inTmp[i] = v[start:stop] + } + var outTmp [dimOut81][]byte + for i, v := range out[outputOffset:outputEnd] { + outTmp[i] = v[start:stop] + } + + addTo := inputOffset != 0 // Except for the first input column, add to previous results + _galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo) + + done = start + ((done >> 6) << 6) + if done < stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) + } +} + // Invoke AVX512 routine for 2 output rows in parallel func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) { done := stop - start @@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, _galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo) done = start + ((done >> 6) << 6) - if done == stop { - return - } - - for c := inputOffset; c < inputEnd; c++ { - for iRow := outputOffset; iRow < outputEnd; iRow++ { - if c < len(matrixRows[iRow]) { - mt := mulTable[matrixRows[iRow][c]][:256] - for i := done; i < stop; i++ { - if c == 0 { // only set value for first input column - out[iRow][i] = mt[in[c][i]] - } else { // and add for all others - out[iRow][i] ^= mt[in[c][i]] - } - } - } - } + if done < stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } } @@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, addTo := inputOffset != 0 // Except for the first input column, add to previous results _galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo) - done = (done >> 6) << 6 - done += start - if done == stop { - return + done = start + ((done >> 6) << 6) + if done < 
stop { + galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) } +} +func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) { for c := inputOffset; c < inputEnd; c++ { for iRow := outputOffset; iRow < outputEnd; iRow++ { if c < len(matrixRows[iRow]) { @@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, for start < byteCount { matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} + matrix81 := [matrixSize81]byte{} outputRow := 0 // First process (multiple) batches of 4 output rows in parallel @@ -204,12 +255,9 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { - for c := 0; c < r.DataShards; c++ { - if c == 0 { - galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o) - } else { - galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o) - } + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) + galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81) } } @@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, // Loop for each round. 
matrix84 := [matrixSize84]byte{} matrix82 := [matrixSize82]byte{} + matrix81 := [matrixSize81]byte{} for start < grStop { outputRow := 0 // First process (multiple) batches of 4 output rows in parallel @@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, } // Lastly, we may have a single output row left (for uneven parity) if outputRow < outputCount { - for c := 0; c < r.DataShards; c++ { - in := inputs[c][start:stop] - for iRow := 0; iRow < outputCount; iRow++ { - if c == 0 { - galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o) - } else { - galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o) - } - } + for inputRow := 0; inputRow < len(inputs); inputRow += dimIn { + setupMatrix81(matrixRows, inputRow, outputRow, &matrix81) + galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81) } } start = stop diff --git a/galoisAvx512_amd64.s b/galoisAvx512_amd64.s index f1494ac..566c548 100644 --- a/galoisAvx512_amd64.s +++ b/galoisAvx512_amd64.s @@ -18,6 +18,97 @@ VPTERNLOGD $0x96, LO, HI, OUT +// +// Process single output row from a total of 8 input rows +// +// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool) +TEXT ·_galMulAVX512Parallel81(SB), 7, $0 + MOVQ in+0(FP), SI // + MOVQ 8(SI), R9 // R9: len(in) + SHRQ $6, R9 // len(in) / 64 + TESTQ R9, R9 + JZ done_avx512_parallel81 + + MOVQ matrix+48(FP), SI + VMOVDQU64 0x000(SI), Z16 + VMOVDQU64 0x040(SI), Z17 + VMOVDQU64 0x080(SI), Z18 + VMOVDQU64 0x0c0(SI), Z19 + + MOVQ $15, BX + VPBROADCASTB BX, Z2 + + MOVB addTo+56(FP), AX + IMULQ $-0x1, AX + KMOVQ AX, K1 + MOVQ in+0(FP), SI // SI: &in + MOVQ in_len+8(FP), AX // number of inputs + XORQ R11, R11 + MOVQ out+24(FP), DX + MOVQ (DX), DX // DX: &out[0][0] + +loopback_avx512_parallel81: + VMOVDQU64.Z (DX), K1, Z4 + + LOAD(0x00) // &in[0][0] + GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4) + + CMPQ AX, $1 + JE 
skip_avx512_parallel81 + + LOAD(0x18) // &in[1][0] + GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4) + + CMPQ AX, $2 + JE skip_avx512_parallel81 + + LOAD(0x30) // &in[2][0] + GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4) + + CMPQ AX, $3 + JE skip_avx512_parallel81 + + LOAD(0x48) // &in[3][0] + GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4) + + CMPQ AX, $4 + JE skip_avx512_parallel81 + + LOAD(0x60) // &in[4][0] + GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4) + + CMPQ AX, $5 + JE skip_avx512_parallel81 + + LOAD(0x78) // &in[5][0] + GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4) + + CMPQ AX, $6 + JE skip_avx512_parallel81 + + LOAD(0x90) // &in[6][0] + GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4) + + CMPQ AX, $7 + JE skip_avx512_parallel81 + + LOAD(0xa8) // &in[7][0] + GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4) + +skip_avx512_parallel81: + VMOVDQU64 Z4, (DX) + + ADDQ $64, R11 // in4+=64 + + ADDQ $64, DX // out+=64 + + SUBQ $1, R9 + JNZ loopback_avx512_parallel81 + +done_avx512_parallel81: + VZEROUPPER + RET + // // Process 2 output rows in parallel from a total of 8 input rows // diff --git a/galoisAvx512_amd64_test.go b/galoisAvx512_amd64_test.go index b3de486..24d6846 100644 --- a/galoisAvx512_amd64_test.go +++ b/galoisAvx512_amd64_test.go @@ -14,6 +14,102 @@ import ( "time" ) +func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) { + + if !defaultOptions.useAVX512 { + t.Skip("AVX512 not detected") + } + + rand.Seed(time.Now().UnixNano()) + + var size = 1024 * 1024 + if testing.Short() { + size = 4096 + } + + in, out := make([][]byte, inputSize), make([][]byte, dimOut81) + + for i := range in { + in[i] = make([]byte, size) + rand.Read(in[i]) + } + + for i := range out { + out[i] = make([]byte, size) + rand.Read(out[i]) + } + + opts := defaultOptions + opts.useSSSE3 = true + + matrix := [(16 + 16) * dimIn * dimOut81]byte{} + coeffs := make([]byte, dimIn*len(out)) + + for i := 0; i < dimIn*len(out); i++ { + coeffs[i] = byte(rand.Int31n(256)) + copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) + copy(matrix[i*32+16:], 
mulTableHigh[coeffs[i]][:]) + } + + // Do first run with clearing out any existing results + _galMulAVX512Parallel81(in, out, &matrix, false) + + expect := make([][]byte, len(out)) + for i := range expect { + expect[i] = make([]byte, size) + rand.Read(expect[i]) + } + + for i := range in { + if i == 0 { + galMulSlice(coeffs[i], in[i], expect[0], &options{}) + } else { + galMulSliceXor(coeffs[i], in[i], expect[0], &options{}) + } + } + + for i := range out { + if 0 != bytes.Compare(out[i], expect[i]) { + t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) + } + } + + inToAdd := make([][]byte, len(in)) + + for i := range inToAdd { + inToAdd[i] = make([]byte, size) + rand.Read(inToAdd[i]) + } + + for i := 0; i < dimIn*len(out); i++ { + coeffs[i] = byte(rand.Int31n(256)) + copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) + copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:]) + } + + // Do second run by adding to original run + _galMulAVX512Parallel81(inToAdd, out, &matrix, true) + + for i := range in { + galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{}) + } + + for i := range out { + if 0 != bytes.Compare(out[i], expect[i]) { + t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8]) + } + } +} + +func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) } +func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) } +func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) } +func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) } +func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) } +func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) } +func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) } +func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) } + func testGaloisAvx512Parallelx2(t *testing.T, 
inputSize int) { if !defaultOptions.useAVX512 { diff --git a/reedsolomon_test.go b/reedsolomon_test.go index cf6daea..6ba213b 100644 --- a/reedsolomon_test.go +++ b/reedsolomon_test.go @@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51 func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) } func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) } func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) } + +func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) } +func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) } +func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }