Avx512 parallel81 (#131)

* AVX512 routine for 8x1 parallel processing (WIP)
* Testing and integration of Parallel81 assembly routine
2020-05-06 03:32:31 -07:00 · 2020-05-06 03:32:31 -07:00 · 1b9e129671
parent cb7a0b5aef
commit 1b9e129671
4 changed files with 272 additions and 38 deletions
--- a/galoisAvx512_amd64.go
+++ b/galoisAvx512_amd64.go
@ -9,6 +9,9 @@ package reedsolomon
 import "sync"
 // _galMulAVX512Parallel81 is implemented in galoisAvx512_amd64.s. It produces a
 // single output row from up to 8 input rows, working on whole 64-byte blocks
 // only. matrix holds the pre-expanded coefficient tables; when addTo is true
 // the products are combined with the bytes already in out, otherwise out is
 // overwritten.
 //go:noescape
 func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
 // _galMulAVX512Parallel82 is the assembly kernel for 2 output rows in parallel.
 // It takes the same arguments as the x1 and x4 kernels, with a coefficient
 // block of matrixSize82 bytes.
 //go:noescape
 func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo
 // Tile dimensions of the AVX512 kernels and the byte size of the coefficient
 // block each kernel receives. Every (input row, output row) coefficient takes
 // 16+16 bytes: setupMatrix81 stores the 16-byte mulTableLow entry followed by
 // the 16-byte mulTableHigh entry for that coefficient.
 const (
 	dimIn        = 8                            // Number of input rows processed simultaneously
 	dimOut81     = 1                            // Number of output rows processed simultaneously for x1 routine
 	dimOut82     = 2                            // Number of output rows processed simultaneously for x2 routine
 	dimOut84     = 4                            // Number of output rows processed simultaneously for x4 routine
 	matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine
 	matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
 	matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
 )
-// Construct block of matrix coefficients for 2 outputs rows in parallel
+// Construct block of matrix coefficients for single output row in parallel
 // For each of the dimIn input columns starting at inputOffset, and each of the
 // dimOut81 output rows starting at outputOffset, one 32-byte entry is written
 // into matrix: the 16-byte low table of the coefficient followed by its 16-byte
 // high table. Columns beyond the end of a matrix row get an all-zero entry.
 func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
 	const (
 		entryLen = 32               // bytes per coefficient entry
 		wrap     = dimIn * dimOut81 // number of entries in the block
 	)
 	slot := 0
 	for col := inputOffset; col < inputOffset+dimIn; col++ {
 		for row := outputOffset; row < outputOffset+dimOut81; row++ {
 			if col >= len(matrixRows[row]) {
 				// No coefficient for this input shard: clear the entry.
 				entry := matrix[slot*entryLen : slot*entryLen+entryLen]
 				for i := range entry {
 					entry[i] = 0
 				}
 			} else {
 				coeff := matrixRows[row][col]
 				copy(matrix[slot*entryLen:], mulTableLow[coeff][:])
 				copy(matrix[slot*entryLen+16:], mulTableHigh[coeff][:])
 			}
 			// Step by dimIn and wrap around so that the next entry lands in
 			// the following slot once the end of the block is passed.
 			slot += dimIn
 			if slot >= wrap {
 				slot -= wrap - 1
 			}
 		}
 	}
 }
 // Construct block of matrix coefficients for 2 output rows in parallel
 func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
 	offset := 0
 	for c := inputOffset; c < inputOffset+dimIn; c++ {
@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
 	}
 }
-// Construct block of matrix coefficients for 4 outputs rows in parallel
+// Construct block of matrix coefficients for 4 output rows in parallel
 func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
 	offset := 0
 	for c := inputOffset; c < inputOffset+dimIn; c++ {
@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
 	}
 }
 // galMulAVX512Parallel81 runs the single-output-row AVX512 kernel on the byte
 // range [start, stop) of up to dimIn input rows beginning at inputOffset and
 // the output row at outputOffset. Whatever is left after the last whole
 // 64-byte block is handed to galMulAVX512LastInput.
 func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
 	length := stop - start
 	if length <= 0 {
 		return
 	}
 	// Clamp the batch to the rows that actually exist.
 	inputEnd, outputEnd := inputOffset+dimIn, outputOffset+dimOut81
 	if len(in) < inputEnd {
 		inputEnd = len(in)
 	}
 	if len(out) < outputEnd {
 		outputEnd = len(out)
 	}
 	// The maximum number of rows is known, so fixed-size arrays hold the
 	// per-row windows.
 	var (
 		inRows  [dimIn][]byte
 		outRows [dimOut81][]byte
 	)
 	for i, row := range in[inputOffset:inputEnd] {
 		inRows[i] = row[start:stop]
 	}
 	for i, row := range out[outputOffset:outputEnd] {
 		outRows[i] = row[start:stop]
 	}
 	// Only the first batch of input columns overwrites the output; later
 	// batches add to the previous results.
 	_galMulAVX512Parallel81(inRows[:inputEnd-inputOffset], outRows[:outputEnd-outputOffset], matrix81, inputOffset != 0)
 	// Position just past the last whole 64-byte block.
 	tail := start + (length &^ 63)
 	if tail < stop {
 		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, tail, stop, out, in)
 	}
 }
 // Invoke AVX512 routine for 2 output rows in parallel
 func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
 	done := stop - start
@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset,
 	_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
 	done = start + ((done >> 6) << 6)
-	if done == stop {
+	if done < stop {
-		return
+		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
 	}
 	for c := inputOffset; c < inputEnd; c++ {
 		for iRow := outputOffset; iRow < outputEnd; iRow++ {
 			if c < len(matrixRows[iRow]) {
 				mt := mulTable[matrixRows[iRow][c]][:256]
 				for i := done; i < stop; i++ {
 					if c == 0 { // only set value for first input column
 						out[iRow][i] = mt[in[c][i]]
 					} else { // and add for all others
 						out[iRow][i] ^= mt[in[c][i]]
 					}
 				}
 			}
 		}
 	}
 }
@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset,
 	addTo := inputOffset != 0 // Except for the first input column, add to previous results
 	_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
-	done = (done >> 6) << 6
+	done = start + ((done >> 6) << 6)
-	done += start
+	if done < stop {
-	if done == stop {
+		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
 		return
 	}
 }
 func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
 	for c := inputOffset; c < inputEnd; c++ {
 		for iRow := outputOffset; iRow < outputEnd; iRow++ {
 			if c < len(matrixRows[iRow]) {
@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
 	for start < byteCount {
 		matrix84 := [matrixSize84]byte{}
 		matrix82 := [matrixSize82]byte{}
 		matrix81 := [matrixSize81]byte{}
 		outputRow := 0
 		// First process (multiple) batches of 4 output rows in parallel
@ -204,12 +255,9 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
 		}
 		// Lastly, we may have a single output row left (for uneven parity)
 		if outputRow < outputCount {
-			for c := 0; c < r.DataShards; c++ {
+			for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
-				if c == 0 {
+				setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
-					galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
+				galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
 				} else {
 					galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
 				}
 			}
 		}
@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
 			// Loop for each round.
 			matrix84 := [matrixSize84]byte{}
 			matrix82 := [matrixSize82]byte{}
 			matrix81 := [matrixSize81]byte{}
 			for start < grStop {
 				outputRow := 0
 				// First process (multiple) batches of 4 output rows in parallel
@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
 				}
 				// Lastly, we may have a single output row left (for uneven parity)
 				if outputRow < outputCount {
-					for c := 0; c < r.DataShards; c++ {
+					for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
-						in := inputs[c][start:stop]
+						setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
-						for iRow := 0; iRow < outputCount; iRow++ {
+						galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
 							if c == 0 {
 								galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
 							} else {
 								galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
 							}
 						}
 					}
 				}
 				start = stop
--- a/galoisAvx512_amd64.s
+++ b/galoisAvx512_amd64.s
@ -18,6 +18,97 @@
 	VPTERNLOGD $0x96, LO, HI, OUT
 //
 // Process single output row from a total of 8 input rows
 //
 // Only whole 64-byte blocks are handled: the loop count is len(in[0]) >> 6,
 // so any remaining tail bytes are left untouched by this routine.
 //
 // Register use in this routine:
 //   SI  - matrix pointer during setup, then pointer to the in slice headers
 //   R9  - number of 64-byte blocks still to process
 //   R11 - running byte offset, advanced by 64 per block
 //         (presumably the index LOAD applies to each input row - LOAD is
 //         defined outside this view, confirm there)
 //   AX  - addTo flag during setup, then the number of input rows
 //   DX  - write pointer into out[0]
 //   Z16-Z19 - the 256-byte coefficient block
 //   Z4  - accumulator for the output block
 //   K1  - load mask derived from addTo
 //
 // func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
 TEXT ·_galMulAVX512Parallel81(SB), 7, $0
 	MOVQ  in+0(FP), SI     // SI: pointer to the first slice header of in
 	MOVQ  8(SI), R9        // R9: len(in[0]) (length word of the first slice header)
 	SHRQ  $6, R9           // len(in[0]) / 64 = number of 64-byte blocks
 	TESTQ R9, R9
 	JZ    done_avx512_parallel81 // less than one full block: nothing to do
 	// Load the whole coefficient block (4 x 64 bytes). Each register is used
 	// for two consecutive input rows below.
 	MOVQ matrix+48(FP), SI
 	VMOVDQU64 0x000(SI), Z16
 	VMOVDQU64 0x040(SI), Z17
 	VMOVDQU64 0x080(SI), Z18
 	VMOVDQU64 0x0c0(SI), Z19
 	// Z2 = 0x0f in every byte (presumably the nibble mask used by LOAD - confirm
 	// against the macro definition).
 	MOVQ         $15, BX
 	VPBROADCASTB BX, Z2
 	// Negate the addTo byte so the low mask bits are all ones when addTo is
 	// set and all zeros otherwise.
 	MOVB addTo+56(FP), AX
 	IMULQ $-0x1, AX
 	KMOVQ AX, K1
 	MOVQ in+0(FP), SI  //  SI: &in[0] (array of slice headers)
 	MOVQ in_len+8(FP), AX  // number of inputs
 	XORQ R11, R11      // start at byte offset 0
 	MOVQ out+24(FP), DX
 	MOVQ (DX), DX      //  DX: &out[0][0]
 loopback_avx512_parallel81:
 	// With zeroing-masking, Z4 starts as the existing output block when addTo
 	// is set and as zero otherwise.
 	VMOVDQU64.Z (DX), K1, Z4
 	// One LOAD/GALOIS pair per input row; the offsets step by 0x18, the size
 	// of a slice header. After each row the input count is checked so that
 	// fewer than 8 inputs exit early.
 	LOAD(0x00) // &in[0][0]
 	GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
    CMPQ AX, $1
    JE skip_avx512_parallel81
 	LOAD(0x18) // &in[1][0]
 	GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
    CMPQ AX, $2
    JE skip_avx512_parallel81
 	LOAD(0x30) // &in[2][0]
 	GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
    CMPQ AX, $3
    JE skip_avx512_parallel81
 	LOAD(0x48) // &in[3][0]
 	GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
    CMPQ AX, $4
    JE skip_avx512_parallel81
 	LOAD(0x60) // &in[4][0]
 	GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
    CMPQ AX, $5
    JE skip_avx512_parallel81
 	LOAD(0x78) // &in[5][0]
 	GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
    CMPQ AX, $6
    JE skip_avx512_parallel81
 	LOAD(0x90) // &in[6][0]
 	GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
    CMPQ AX, $7
    JE skip_avx512_parallel81
 	LOAD(0xa8) // &in[7][0]
 	GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
 skip_avx512_parallel81:
 	VMOVDQU64 Z4, (DX) // store the finished 64-byte output block
 	ADDQ $64, R11 // input offset += 64
 	ADDQ $64, DX  // out+=64
 	SUBQ $1, R9
 	JNZ  loopback_avx512_parallel81
 done_avx512_parallel81:
 	VZEROUPPER
 	RET
 //
 // Process 2 output rows in parallel from a total of 8 input rows
 //
--- a/galoisAvx512_amd64_test.go
+++ b/galoisAvx512_amd64_test.go
@ -14,6 +14,102 @@ import (
 	"time"
 )
 // testGaloisAvx512Parallelx1 checks the raw _galMulAVX512Parallel81 kernel
 // with inputSize input rows and a single output row against the reference
 // results of galMulSlice / galMulSliceXor. The kernel is run twice: once
 // overwriting the output (addTo=false) and once adding a second set of random
 // inputs and coefficients onto that result (addTo=true). The test is skipped
 // when AVX512 is not available.
 func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
 	if !defaultOptions.useAVX512 {
 		t.Skip("AVX512 not detected")
 	}
 	rand.Seed(time.Now().UnixNano())
 	size := 1024 * 1024
 	if testing.Short() {
 		size = 4096
 	}
 	// randomRows allocates n rows of size random bytes each.
 	randomRows := func(n int) [][]byte {
 		rows := make([][]byte, n)
 		for i := range rows {
 			rows[i] = make([]byte, size)
 			rand.Read(rows[i])
 		}
 		return rows
 	}
 	in, out := randomRows(inputSize), randomRows(dimOut81)
 	var matrix [matrixSize81]byte
 	coeffs := make([]byte, dimIn*len(out))
 	// randomizeCoeffs draws a fresh coefficient per input row and expands it
 	// into the low/high table layout the kernel expects.
 	randomizeCoeffs := func() {
 		for i := range coeffs {
 			coeffs[i] = byte(rand.Int31n(256))
 			copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
 			copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
 		}
 	}
 	// verify reports every output row that differs from the expectation.
 	verify := func(expect [][]byte) {
 		t.Helper()
 		for i := range out {
 			if !bytes.Equal(out[i], expect[i]) {
 				t.Errorf("got [%d]%#v...,\n                  expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
 			}
 		}
 	}
 	randomizeCoeffs()
 	// Do first run with clearing out any existing results
 	_galMulAVX512Parallel81(in, out, &matrix, false)
 	// expect starts out random so the reference path has to overwrite it too.
 	expect := randomRows(len(out))
 	for i := range in {
 		if i == 0 {
 			galMulSlice(coeffs[i], in[i], expect[0], &options{})
 		} else {
 			galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
 		}
 	}
 	verify(expect)
 	inToAdd := randomRows(len(in))
 	randomizeCoeffs()
 	// Do second run by adding to original run
 	_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
 	for i := range in {
 		galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
 	}
 	verify(expect)
 }
 // The x1 kernel test is run once for every input-row count from 1 through 8;
 // the first digit of each name is the number of input rows, the second the
 // number of output rows.
 func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
 func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
 func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
 func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
 func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
 func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
 func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
 func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
 func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
 	if !defaultOptions.useAVX512 {
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51
 func BenchmarkParallel_8x8x1M(b *testing.B)    { benchmarkParallel(b, 8, 8, 1<<20) }
 func BenchmarkParallel_8x8x8M(b *testing.B)    { benchmarkParallel(b, 8, 8, 8<<20) }
 func BenchmarkParallel_8x8x32M(b *testing.B)   { benchmarkParallel(b, 8, 8, 32<<20) }
 // Additional parallel benchmarks with second arguments 3, 4 and 5.
 // NOTE(review): presumably 8 data shards with 3/4/5 parity shards of 1 MiB,
 // which would exercise the odd-row-count paths - confirm against benchmarkParallel.
 func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) }
 func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) }
 func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }