From 1b9e1296719a823885ff21fa0c46ea5f3555c709 Mon Sep 17 00:00:00 2001
From: Frank Wessels <fwessels@xs4all.nl>
Date: Wed, 6 May 2020 03:32:31 -0700
Subject: [PATCH] Avx512 parallel81 (#131)

* AVX512 routine for 8x1 parallel processing (WIP)

* Testing and integration of Parallel81 assembly routine
---
 galoisAvx512_amd64.go      | 119 +++++++++++++++++++++++++------------
 galoisAvx512_amd64.s       |  91 ++++++++++++++++++++++++++++
 galoisAvx512_amd64_test.go |  96 ++++++++++++++++++++++++++++++
 reedsolomon_test.go        |   4 ++
 4 files changed, 272 insertions(+), 38 deletions(-)

diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go
index 4f9daa6..d7d87b1 100644
--- a/galoisAvx512_amd64.go
+++ b/galoisAvx512_amd64.go
@@ -9,6 +9,9 @@ package reedsolomon
 
 import "sync"
 
+//go:noescape
+func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
+
 //go:noescape
 func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
 
@@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo
 
 const (
 	dimIn        = 8                            // Number of input rows processed simultaneously
+	dimOut81     = 1                            // Number of output rows processed simultaneously for x1 routine
 	dimOut82     = 2                            // Number of output rows processed simultaneously for x2 routine
 	dimOut84     = 4                            // Number of output rows processed simultaneously for x4 routine
+	matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine
 	matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
 	matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
 )
 
-// Construct block of matrix coefficients for 2 outputs rows in parallel
+// setupMatrix81 fills matrix with the coefficient tables for dimIn input columns (starting at inputOffset) of the single output row at outputOffset
+func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
+	offset := 0 // index of the 32-byte slot being written (16 bytes from mulTableLow followed by 16 bytes from mulTableHigh)
+	for c := inputOffset; c < inputOffset+dimIn; c++ {
+		for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ {
+			if c < len(matrixRows[iRow]) {
+				coeff := matrixRows[iRow][c]
+				copy(matrix[offset*32:], mulTableLow[coeff][:])
+				copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
+			} else {
+				// input column c lies beyond this matrix row, so zero out its 32-byte slot
+				v := matrix[offset*32 : offset*32+32]
+				for i := range v {
+					v[i] = 0
+				}
+			}
+			offset += dimIn // step to the slot of the next output row for the same input column
+			if offset >= dimIn*dimOut81 {
+				offset -= dimIn*dimOut81 - 1 // wrap back to the first output row of the next input column; with dimOut81 == 1 the net effect is offset++
+			}
+		}
+	}
+}
+
+// Construct block of matrix coefficients for 2 output rows in parallel
 func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
 	offset := 0
 	for c := inputOffset; c < inputOffset+dimIn; c++ {
@@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
 	}
 }
 
-// Construct block of matrix coefficients for 4 outputs rows in parallel
+// Construct block of matrix coefficients for 4 output rows in parallel
 func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
 	offset := 0
 	for c := inputOffset; c < inputOffset+dimIn; c++ {
@@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
 	}
 }
 
+// galMulAVX512Parallel81 invokes the AVX512 routine for a single output row, covering up to dimIn inputs from inputOffset and the bytes [start, stop) of each shard
+func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
+	done := stop - start // number of bytes to process per shard
+	if done <= 0 {
+		return
+	}
+
+	inputEnd := inputOffset + dimIn
+	if inputEnd > len(in) { // clamp: the last batch may hold fewer than dimIn inputs
+		inputEnd = len(in)
+	}
+	outputEnd := outputOffset + dimOut81
+	if outputEnd > len(out) {
+		outputEnd = len(out)
+	}
+
+	// The maximum batch size is known, so fixed-size arrays hold the [start:stop] views of every shard.
+	var inTmp [dimIn][]byte
+	for i, v := range in[inputOffset:inputEnd] {
+		inTmp[i] = v[start:stop]
+	}
+	var outTmp [dimOut81][]byte
+	for i, v := range out[outputOffset:outputEnd] {
+		outTmp[i] = v[start:stop]
+	}
+
+	addTo := inputOffset != 0 // Except for the first input column, add to previous results
+	_galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo)
+
+	done = start + ((done >> 6) << 6) // first byte not covered by the assembly routine, which only processes whole 64-byte blocks
+	if done < stop {
+		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in) // handle the remaining tail bytes [done, stop)
+	}
+}
+
 // Invoke AVX512 routine for 2 output rows in parallel
 func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
 	done := stop - start
@@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset,
 	_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
 
 	done = start + ((done >> 6) << 6)
-	if done == stop {
-		return
-	}
-
-	for c := inputOffset; c < inputEnd; c++ {
-		for iRow := outputOffset; iRow < outputEnd; iRow++ {
-			if c < len(matrixRows[iRow]) {
-				mt := mulTable[matrixRows[iRow][c]][:256]
-				for i := done; i < stop; i++ {
-					if c == 0 { // only set value for first input column
-						out[iRow][i] = mt[in[c][i]]
-					} else { // and add for all others
-						out[iRow][i] ^= mt[in[c][i]]
-					}
-				}
-			}
-		}
+	if done < stop {
+		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
 	}
 }
 
@@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset,
 	addTo := inputOffset != 0 // Except for the first input column, add to previous results
 	_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
 
-	done = (done >> 6) << 6
-	done += start
-	if done == stop {
-		return
+	done = start + ((done >> 6) << 6)
+	if done < stop {
+		galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
 	}
+}
 
+func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
 	for c := inputOffset; c < inputEnd; c++ {
 		for iRow := outputOffset; iRow < outputEnd; iRow++ {
 			if c < len(matrixRows[iRow]) {
@@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
 	for start < byteCount {
 		matrix84 := [matrixSize84]byte{}
 		matrix82 := [matrixSize82]byte{}
+		matrix81 := [matrixSize81]byte{}
 
 		outputRow := 0
 		// First process (multiple) batches of 4 output rows in parallel
@@ -204,12 +255,9 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
 		}
 		// Lastly, we may have a single output row left (for uneven parity)
 		if outputRow < outputCount {
-			for c := 0; c < r.DataShards; c++ {
-				if c == 0 {
-					galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
-				} else {
-					galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
-				}
+			for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+				setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
+				galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
 			}
 		}
 
@@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
 			// Loop for each round.
 			matrix84 := [matrixSize84]byte{}
 			matrix82 := [matrixSize82]byte{}
+			matrix81 := [matrixSize81]byte{}
 			for start < grStop {
 				outputRow := 0
 				// First process (multiple) batches of 4 output rows in parallel
@@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
 				}
 				// Lastly, we may have a single output row left (for uneven parity)
 				if outputRow < outputCount {
-					for c := 0; c < r.DataShards; c++ {
-						in := inputs[c][start:stop]
-						for iRow := 0; iRow < outputCount; iRow++ {
-							if c == 0 {
-								galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
-							} else {
-								galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
-							}
-						}
+					for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
+						setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
+						galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
 					}
 				}
 				start = stop
diff --git a/galoisAvx512_amd64.s b/galoisAvx512_amd64.s
index f1494ac..566c548 100644
--- a/galoisAvx512_amd64.s
+++ b/galoisAvx512_amd64.s
@@ -18,6 +18,97 @@
 	VPTERNLOGD $0x96, LO, HI, OUT
 
 
+//
+// Process a single output row from up to 8 input rows, 64 bytes per loop iteration
+//
+// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
+TEXT ·_galMulAVX512Parallel81(SB), 7, $0
+	MOVQ  in+0(FP), SI     // SI: &in[0] (slice header of the first input)
+	MOVQ  8(SI), R9        // R9: len(in[0])
+	SHRQ  $6, R9           // R9: number of 64-byte blocks, len(in[0]) / 64
+	TESTQ R9, R9
+	JZ    done_avx512_parallel81 // less than one full block: nothing to do here
+
+	MOVQ matrix+48(FP), SI // SI: matrix (32 bytes of low/high tables per input)
+	VMOVDQU64 0x000(SI), Z16 // Z16: tables for in[0] and in[1]
+	VMOVDQU64 0x040(SI), Z17 // Z17: tables for in[2] and in[3]
+	VMOVDQU64 0x080(SI), Z18 // Z18: tables for in[4] and in[5]
+	VMOVDQU64 0x0c0(SI), Z19 // Z19: tables for in[6] and in[7]
+
+	MOVQ         $15, BX
+	VPBROADCASTB BX, Z2 // Z2: 0x0f in every byte (presumably the nibble mask used by the macros; not shown here)
+
+	MOVB addTo+56(FP), AX
+	IMULQ $-0x1, AX // negate: addTo == 1 turns the low mask bits into all ones, addTo == 0 leaves them zero
+	KMOVQ AX, K1 // K1: load mask for the existing output (only the low 8 bits matter for a 64-bit-element ZMM load)
+	MOVQ in+0(FP), SI  //  SI: &in[0]
+	MOVQ in_len+8(FP), AX  // number of inputs
+	XORQ R11, R11 // R11: 0, advanced by 64 at the end of every iteration
+	MOVQ out+24(FP), DX
+	MOVQ (DX), DX      //  DX: &out[0][0]
+
+loopback_avx512_parallel81:
+	VMOVDQU64.Z (DX), K1, Z4 // Z4: current output block when adding, zero otherwise
+
+	LOAD(0x00) // &in[0][0]
+	GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
+
+    CMPQ AX, $1 // after each input: stop early when no further inputs were passed
+    JE skip_avx512_parallel81
+
+	LOAD(0x18) // &in[1][0]
+	GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
+
+    CMPQ AX, $2
+    JE skip_avx512_parallel81
+
+	LOAD(0x30) // &in[2][0]
+	GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
+
+    CMPQ AX, $3
+    JE skip_avx512_parallel81
+
+	LOAD(0x48) // &in[3][0]
+	GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
+
+    CMPQ AX, $4
+    JE skip_avx512_parallel81
+
+	LOAD(0x60) // &in[4][0]
+	GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
+
+    CMPQ AX, $5
+    JE skip_avx512_parallel81
+
+	LOAD(0x78) // &in[5][0]
+	GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
+
+    CMPQ AX, $6
+    JE skip_avx512_parallel81
+
+	LOAD(0x90) // &in[6][0]
+	GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
+
+    CMPQ AX, $7
+    JE skip_avx512_parallel81
+
+	LOAD(0xa8) // &in[7][0]
+	GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
+
+skip_avx512_parallel81:
+	VMOVDQU64 Z4, (DX) // store the 64 result bytes
+
+	ADDQ $64, R11 // R11 += 64 (presumably the input byte offset used by LOAD; macro not shown)
+
+	ADDQ $64, DX  // out+=64
+
+	SUBQ $1, R9 // one 64-byte block done
+	JNZ  loopback_avx512_parallel81
+
+done_avx512_parallel81:
+	VZEROUPPER
+	RET
+
 //
 // Process 2 output rows in parallel from a total of 8 input rows
 //
diff --git a/galoisAvx512_amd64_test.go b/galoisAvx512_amd64_test.go
index b3de486..24d6846 100644
--- a/galoisAvx512_amd64_test.go
+++ b/galoisAvx512_amd64_test.go
@@ -14,6 +14,102 @@ import (
 	"time"
 )
 
+func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
+	// Checks _galMulAVX512Parallel81 with inputSize inputs against galMulSlice/galMulSliceXor: first with addTo=false, then with addTo=true.
+	if !defaultOptions.useAVX512 {
+		t.Skip("AVX512 not detected")
+	}
+
+	rand.Seed(time.Now().UnixNano())
+
+	var size = 1024 * 1024
+	if testing.Short() {
+		size = 4096
+	}
+
+	in, out := make([][]byte, inputSize), make([][]byte, dimOut81)
+
+	for i := range in {
+		in[i] = make([]byte, size)
+		rand.Read(in[i])
+	}
+
+	for i := range out {
+		out[i] = make([]byte, size)
+		rand.Read(out[i]) // pre-fill with random data so the addTo=false run has to overwrite it
+	}
+
+	opts := defaultOptions
+	opts.useSSSE3 = true
+
+	matrix := [(16 + 16) * dimIn * dimOut81]byte{}
+	coeffs := make([]byte, dimIn*len(out))
+
+	for i := 0; i < dimIn*len(out); i++ {
+		coeffs[i] = byte(rand.Int31n(256))
+		copy(matrix[i*32:], mulTableLow[coeffs[i]][:]) // slot i: 16 bytes low table followed by 16 bytes high table
+		copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
+	}
+
+	// First run: addTo is false, so any existing output contents must be overwritten
+	_galMulAVX512Parallel81(in, out, &matrix, false)
+
+	expect := make([][]byte, len(out))
+	for i := range expect {
+		expect[i] = make([]byte, size)
+		rand.Read(expect[i])
+	}
+
+	for i := range in {
+		if i == 0 {
+			galMulSlice(coeffs[i], in[i], expect[0], &options{})
+		} else {
+			galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
+		}
+	}
+
+	for i := range out {
+		if 0 != bytes.Compare(out[i], expect[i]) {
+			t.Errorf("got [%d]%#v...,\n                  expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
+		}
+	}
+
+	inToAdd := make([][]byte, len(in))
+
+	for i := range inToAdd {
+		inToAdd[i] = make([]byte, size)
+		rand.Read(inToAdd[i])
+	}
+
+	for i := 0; i < dimIn*len(out); i++ {
+		coeffs[i] = byte(rand.Int31n(256)) // fresh coefficients for the second run
+		copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
+		copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
+	}
+
+	// Second run: addTo is true, so the new products are accumulated onto the output of the first run
+	_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
+
+	for i := range in {
+		galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
+	}
+
+	for i := range out {
+		if 0 != bytes.Compare(out[i], expect[i]) {
+			t.Errorf("got [%d]%#v...,\n                  expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
+		}
+	}
+}
+
+func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
+func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
+func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
+func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
+func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
+func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
+func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
+func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
+
 func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
 
 	if !defaultOptions.useAVX512 {
diff --git a/reedsolomon_test.go b/reedsolomon_test.go
index cf6daea..6ba213b 100644
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51
 func BenchmarkParallel_8x8x1M(b *testing.B)    { benchmarkParallel(b, 8, 8, 1<<20) }
 func BenchmarkParallel_8x8x8M(b *testing.B)    { benchmarkParallel(b, 8, 8, 8<<20) }
 func BenchmarkParallel_8x8x32M(b *testing.B)   { benchmarkParallel(b, 8, 8, 32<<20) }
+
+func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) }
+func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) }
+func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }