Avx512 parallel81 (#131)
* AVX512 routine for 8x1 parallel processing (WIP)
* Testing and integration of Parallel81 assembly routine
master
parent
cb7a0b5aef
commit
1b9e129671
|
@ -9,6 +9,9 @@ package reedsolomon
|
||||||
|
|
||||||
import "sync"
|
import "sync"
|
||||||
|
|
||||||
|
//go:noescape
|
||||||
|
func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
|
||||||
|
|
||||||
//go:noescape
|
//go:noescape
|
||||||
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
|
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
|
||||||
|
|
||||||
|
@ -17,13 +20,39 @@ func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo
|
||||||
|
|
||||||
const (
|
const (
|
||||||
dimIn = 8 // Number of input rows processed simultaneously
|
dimIn = 8 // Number of input rows processed simultaneously
|
||||||
|
dimOut81 = 1 // Number of output rows processed simultaneously for x1 routine
|
||||||
dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
|
dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
|
||||||
dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
|
dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
|
||||||
|
matrixSize81 = (16 + 16) * dimIn * dimOut81 // Dimension of slice of matrix coefficient passed into x1 routine
|
||||||
matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
|
matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
|
||||||
matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
|
matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
|
||||||
)
|
)
|
||||||
|
|
||||||
// Construct block of matrix coefficients for 2 outputs rows in parallel
|
// setupMatrix81 constructs the block of matrix coefficients for a single
// output row, covering dimIn input columns starting at inputOffset.
//
// For every input column c in [inputOffset, inputOffset+dimIn) and the output
// row at outputOffset, one 32-byte slot of matrix is written: the
// mulTableLow entry of coefficient matrixRows[iRow][c] at slot offset 0 and
// the mulTableHigh entry at slot offset 16. Columns at or beyond
// len(matrixRows[iRow]) get an all-zero slot instead.
|
||||||
|
func setupMatrix81(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize81]byte) {
|
||||||
|
// offset is the index of the 32-byte slot written next.
offset := 0
|
||||||
|
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||||
|
// dimOut81 output rows per batch, so this inner loop handles one row.
for iRow := outputOffset; iRow < outputOffset+dimOut81; iRow++ {
|
||||||
|
if c < len(matrixRows[iRow]) {
|
||||||
|
coeff := matrixRows[iRow][c]
|
||||||
|
copy(matrix[offset*32:], mulTableLow[coeff][:])
|
||||||
|
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
|
||||||
|
} else {
|
||||||
|
// coefficients not used for this input shard (so null out)
|
||||||
|
v := matrix[offset*32 : offset*32+32]
|
||||||
|
for i := range v {
|
||||||
|
v[i] = 0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Step by dimIn slots, then wrap back by dimIn*dimOut81-1 once past the
// end of the block; with dimOut81 == 1 this nets out to advancing one
// slot per input column.
offset += dimIn
|
||||||
|
if offset >= dimIn*dimOut81 {
|
||||||
|
offset -= dimIn*dimOut81 - 1
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Construct block of matrix coefficients for 2 output rows in parallel
|
||||||
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
|
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
|
||||||
offset := 0
|
offset := 0
|
||||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||||
|
@ -47,7 +76,7 @@ func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Construct block of matrix coefficients for 4 outputs rows in parallel
|
// Construct block of matrix coefficients for 4 output rows in parallel
|
||||||
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
|
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
|
||||||
offset := 0
|
offset := 0
|
||||||
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
for c := inputOffset; c < inputOffset+dimIn; c++ {
|
||||||
|
@ -71,6 +100,41 @@ func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// galMulAVX512Parallel81 invokes the AVX512 routine for a single output row,
// fed by up to dimIn input rows starting at inputOffset, over the byte range
// [start, stop) of every shard.
//
// The input and output windows are clamped to len(in) and len(out). The
// callers visible in this file fill matrix81 via setupMatrix81 for the same
// inputOffset/outputOffset before calling. Bytes beyond the last whole
// 64-byte block of the range are handed to galMulAVX512LastInput.
|
||||||
|
func galMulAVX512Parallel81(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix81 *[matrixSize81]byte) {
|
||||||
|
// done starts out as the length of the range; nothing to do when empty.
done := stop - start
|
||||||
|
if done <= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
|
||||||
|
// Clamp the batch to the shards that actually exist.
inputEnd := inputOffset + dimIn
|
||||||
|
if inputEnd > len(in) {
|
||||||
|
inputEnd = len(in)
|
||||||
|
}
|
||||||
|
outputEnd := outputOffset + dimOut81
|
||||||
|
if outputEnd > len(out) {
|
||||||
|
outputEnd = len(out)
|
||||||
|
}
|
||||||
|
|
||||||
|
// We know the max size, alloc temp array.
|
||||||
|
var inTmp [dimIn][]byte
|
||||||
|
for i, v := range in[inputOffset:inputEnd] {
|
||||||
|
inTmp[i] = v[start:stop]
|
||||||
|
}
|
||||||
|
var outTmp [dimOut81][]byte
|
||||||
|
for i, v := range out[outputOffset:outputEnd] {
|
||||||
|
outTmp[i] = v[start:stop]
|
||||||
|
}
|
||||||
|
|
||||||
|
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||||
|
_galMulAVX512Parallel81(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix81, addTo)
|
||||||
|
|
||||||
|
// Round the length down to a multiple of 64 and turn it into an absolute
// position: [start, done) is the part made of whole 64-byte blocks.
done = start + ((done >> 6) << 6)
|
||||||
|
if done < stop {
|
||||||
|
// Finish the remaining [done, stop) bytes outside the assembly routine.
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Invoke AVX512 routine for 2 output rows in parallel
|
// Invoke AVX512 routine for 2 output rows in parallel
|
||||||
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
|
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset, start, stop int, matrix82 *[matrixSize82]byte) {
|
||||||
done := stop - start
|
done := stop - start
|
||||||
|
@ -101,23 +165,8 @@ func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset,
|
||||||
_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
|
_galMulAVX512Parallel82(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix82, addTo)
|
||||||
|
|
||||||
done = start + ((done >> 6) << 6)
|
done = start + ((done >> 6) << 6)
|
||||||
if done == stop {
|
if done < stop {
|
||||||
return
|
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||||
}
|
|
||||||
|
|
||||||
for c := inputOffset; c < inputEnd; c++ {
|
|
||||||
for iRow := outputOffset; iRow < outputEnd; iRow++ {
|
|
||||||
if c < len(matrixRows[iRow]) {
|
|
||||||
mt := mulTable[matrixRows[iRow][c]][:256]
|
|
||||||
for i := done; i < stop; i++ {
|
|
||||||
if c == 0 { // only set value for first input column
|
|
||||||
out[iRow][i] = mt[in[c][i]]
|
|
||||||
} else { // and add for all others
|
|
||||||
out[iRow][i] ^= mt[in[c][i]]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -150,12 +199,13 @@ func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset,
|
||||||
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
addTo := inputOffset != 0 // Except for the first input column, add to previous results
|
||||||
_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
|
_galMulAVX512Parallel84(inTmp[:inputEnd-inputOffset], outTmp[:outputEnd-outputOffset], matrix84, addTo)
|
||||||
|
|
||||||
done = (done >> 6) << 6
|
done = start + ((done >> 6) << 6)
|
||||||
done += start
|
if done < stop {
|
||||||
if done == stop {
|
galMulAVX512LastInput(inputOffset, inputEnd, outputOffset, outputEnd, matrixRows, done, stop, out, in)
|
||||||
return
|
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outputEnd int, matrixRows [][]byte, done int, stop int, out [][]byte, in [][]byte) {
|
||||||
for c := inputOffset; c < inputEnd; c++ {
|
for c := inputOffset; c < inputEnd; c++ {
|
||||||
for iRow := outputOffset; iRow < outputEnd; iRow++ {
|
for iRow := outputOffset; iRow < outputEnd; iRow++ {
|
||||||
if c < len(matrixRows[iRow]) {
|
if c < len(matrixRows[iRow]) {
|
||||||
|
@ -183,6 +233,7 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
||||||
for start < byteCount {
|
for start < byteCount {
|
||||||
matrix84 := [matrixSize84]byte{}
|
matrix84 := [matrixSize84]byte{}
|
||||||
matrix82 := [matrixSize82]byte{}
|
matrix82 := [matrixSize82]byte{}
|
||||||
|
matrix81 := [matrixSize81]byte{}
|
||||||
|
|
||||||
outputRow := 0
|
outputRow := 0
|
||||||
// First process (multiple) batches of 4 output rows in parallel
|
// First process (multiple) batches of 4 output rows in parallel
|
||||||
|
@ -204,12 +255,9 @@ func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte,
|
||||||
}
|
}
|
||||||
// Lastly, we may have a single output row left (for uneven parity)
|
// Lastly, we may have a single output row left (for uneven parity)
|
||||||
if outputRow < outputCount {
|
if outputRow < outputCount {
|
||||||
for c := 0; c < r.DataShards; c++ {
|
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||||
if c == 0 {
|
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
|
||||||
galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, end, &matrix81)
|
||||||
} else {
|
|
||||||
galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -245,6 +293,7 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
|
||||||
// Loop for each round.
|
// Loop for each round.
|
||||||
matrix84 := [matrixSize84]byte{}
|
matrix84 := [matrixSize84]byte{}
|
||||||
matrix82 := [matrixSize82]byte{}
|
matrix82 := [matrixSize82]byte{}
|
||||||
|
matrix81 := [matrixSize81]byte{}
|
||||||
for start < grStop {
|
for start < grStop {
|
||||||
outputRow := 0
|
outputRow := 0
|
||||||
// First process (multiple) batches of 4 output rows in parallel
|
// First process (multiple) batches of 4 output rows in parallel
|
||||||
|
@ -268,15 +317,9 @@ func (r reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte,
|
||||||
}
|
}
|
||||||
// Lastly, we may have a single output row left (for uneven parity)
|
// Lastly, we may have a single output row left (for uneven parity)
|
||||||
if outputRow < outputCount {
|
if outputRow < outputCount {
|
||||||
for c := 0; c < r.DataShards; c++ {
|
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
|
||||||
in := inputs[c][start:stop]
|
setupMatrix81(matrixRows, inputRow, outputRow, &matrix81)
|
||||||
for iRow := 0; iRow < outputCount; iRow++ {
|
galMulAVX512Parallel81(inputs, outputs, matrixRows, inputRow, outputRow, start, stop, &matrix81)
|
||||||
if c == 0 {
|
|
||||||
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
|
||||||
} else {
|
|
||||||
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
start = stop
|
start = stop
|
||||||
|
|
|
@ -18,6 +18,97 @@
|
||||||
VPTERNLOGD $0x96, LO, HI, OUT
|
VPTERNLOGD $0x96, LO, HI, OUT
|
||||||
|
|
||||||
|
|
||||||
|
//
|
||||||
|
// Process single output row from a total of 8 input rows
// (fewer inputs are supported: the loop body jumps to the store as soon as
// in_len inputs have been consumed). Data is handled in whole 64-byte blocks;
// the block count is taken from the length of the first input.
// NOTE(review): the LOAD and GALOIS macros are defined outside this view;
// comments about what they consume are assumptions to be confirmed there.
|
||||||
|
//
|
||||||
|
// func _galMulAVX512Parallel81(in, out [][]byte, matrix *[matrixSize81]byte, addTo bool)
|
||||||
|
TEXT ·_galMulAVX512Parallel81(SB), 7, $0
|
||||||
|
MOVQ in+0(FP), SI // SI: base of in (array of 24-byte slice headers)
|
||||||
|
MOVQ 8(SI), R9 // R9: len(in[0]) (length word of the first slice header)
|
||||||
|
SHRQ $6, R9 // len(in[0]) / 64 = number of whole 64-byte blocks
|
||||||
|
TESTQ R9, R9
|
||||||
|
JZ done_avx512_parallel81 // less than one full block: nothing to do here
|
||||||
|
|
||||||
|
// Load the 256-byte coefficient block (4 x 64 bytes) into Z16-Z19. With 32
// bytes of tables per input, each register covers two consecutive inputs.
MOVQ matrix+48(FP), SI
|
||||||
|
VMOVDQU64 0x000(SI), Z16
|
||||||
|
VMOVDQU64 0x040(SI), Z17
|
||||||
|
VMOVDQU64 0x080(SI), Z18
|
||||||
|
VMOVDQU64 0x0c0(SI), Z19
|
||||||
|
|
||||||
|
// Z2: the byte value 15 broadcast to every lane (presumably the low-nibble
// mask used by GALOIS; confirm against the macro).
MOVQ $15, BX
|
||||||
|
VPBROADCASTB BX, Z2
|
||||||
|
|
||||||
|
// K1 = -addTo: the low 8 mask bits (one per 64-bit lane) are all ones when
// addTo is true and all zeroes when it is false.
MOVB addTo+56(FP), AX
|
||||||
|
IMULQ $-0x1, AX
|
||||||
|
KMOVQ AX, K1
|
||||||
|
MOVQ in+0(FP), SI // SI: &in
|
||||||
|
MOVQ in_len+8(FP), AX // number of inputs
|
||||||
|
// R11 starts at zero and grows by 64 per block (presumably the byte offset
// into each input that LOAD applies; confirm against the macro).
XORQ R11, R11
|
||||||
|
MOVQ out+24(FP), DX
|
||||||
|
MOVQ (DX), DX // DX: &out[0][0]
|
||||||
|
|
||||||
|
loopback_avx512_parallel81:
|
||||||
|
// Zeroing-masked load: Z4 is the current output block when addTo is set
// (K1 all ones) and zero otherwise.
VMOVDQU64.Z (DX), K1, Z4
|
||||||
|
|
||||||
|
LOAD(0x00) // &in[0][0]
|
||||||
|
GALOIS(0x00, 0x55, Z16, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
// AX holds the number of inputs; stop accumulating once they are used up.
CMPQ AX, $1
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x18) // &in[1][0]
|
||||||
|
GALOIS(0xaa, 0xff, Z16, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $2
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x30) // &in[2][0]
|
||||||
|
GALOIS(0x00, 0x55, Z17, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $3
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x48) // &in[3][0]
|
||||||
|
GALOIS(0xaa, 0xff, Z17, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $4
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x60) // &in[4][0]
|
||||||
|
GALOIS(0x00, 0x55, Z18, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $5
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x78) // &in[5][0]
|
||||||
|
GALOIS(0xaa, 0xff, Z18, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $6
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0x90) // &in[6][0]
|
||||||
|
GALOIS(0x00, 0x55, Z19, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
CMPQ AX, $7
|
||||||
|
JE skip_avx512_parallel81
|
||||||
|
|
||||||
|
LOAD(0xa8) // &in[7][0]
|
||||||
|
GALOIS(0xaa, 0xff, Z19, Z14, Z15, Z4)
|
||||||
|
|
||||||
|
skip_avx512_parallel81:
|
||||||
|
// Write the accumulated 64-byte result back to the output row.
VMOVDQU64 Z4, (DX)
|
||||||
|
|
||||||
|
ADDQ $64, R11 // in+=64 (R11 is shared by all inputs, not specific to one)
|
||||||
|
|
||||||
|
ADDQ $64, DX // out+=64
|
||||||
|
|
||||||
|
SUBQ $1, R9 // one 64-byte block done
|
||||||
|
JNZ loopback_avx512_parallel81
|
||||||
|
|
||||||
|
done_avx512_parallel81:
|
||||||
|
VZEROUPPER
|
||||||
|
RET
|
||||||
|
|
||||||
//
|
//
|
||||||
// Process 2 output rows in parallel from a total of 8 input rows
|
// Process 2 output rows in parallel from a total of 8 input rows
|
||||||
//
|
//
|
||||||
|
|
|
@ -14,6 +14,102 @@ import (
|
||||||
"time"
|
"time"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
// testGaloisAvx512Parallelx1 checks _galMulAVX512Parallel81 against
// galMulSlice/galMulSliceXor for inputSize input rows and one output row.
// It runs the routine twice: first overwriting the output (addTo == false),
// then accumulating a second set of inputs into it (addTo == true). The test
// is skipped when AVX512 is not enabled in defaultOptions.
func testGaloisAvx512Parallelx1(t *testing.T, inputSize int) {
|
||||||
|
|
||||||
|
if !defaultOptions.useAVX512 {
|
||||||
|
t.Skip("AVX512 not detected")
|
||||||
|
}
|
||||||
|
|
||||||
|
rand.Seed(time.Now().UnixNano())
|
||||||
|
|
||||||
|
// Shard size in bytes; reduced under -short.
var size = 1024 * 1024
|
||||||
|
if testing.Short() {
|
||||||
|
size = 4096
|
||||||
|
}
|
||||||
|
|
||||||
|
in, out := make([][]byte, inputSize), make([][]byte, dimOut81)
|
||||||
|
|
||||||
|
for i := range in {
|
||||||
|
in[i] = make([]byte, size)
|
||||||
|
rand.Read(in[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
// The output starts out random so the addTo == false run has to overwrite it.
for i := range out {
|
||||||
|
out[i] = make([]byte, size)
|
||||||
|
rand.Read(out[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
// NOTE(review): opts is configured here, but the reference computations
// below pass a fresh &options{} instead; confirm whether opts was meant
// to be used.
opts := defaultOptions
|
||||||
|
opts.useSSSE3 = true
|
||||||
|
|
||||||
|
matrix := [(16 + 16) * dimIn * dimOut81]byte{}
|
||||||
|
coeffs := make([]byte, dimIn*len(out))
|
||||||
|
|
||||||
|
// Random coefficient for each of the dimIn slots (even when inputSize is
// smaller), stored as a low table followed by a high table per 32-byte slot.
for i := 0; i < dimIn*len(out); i++ {
|
||||||
|
coeffs[i] = byte(rand.Int31n(256))
|
||||||
|
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
|
||||||
|
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do first run with clearing out any existing results
|
||||||
|
_galMulAVX512Parallel81(in, out, &matrix, false)
|
||||||
|
|
||||||
|
expect := make([][]byte, len(out))
|
||||||
|
for i := range expect {
|
||||||
|
expect[i] = make([]byte, size)
|
||||||
|
rand.Read(expect[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Reference result: the first input overwrites expect[0], the rest are
// accumulated with the Xor variant.
for i := range in {
|
||||||
|
if i == 0 {
|
||||||
|
galMulSlice(coeffs[i], in[i], expect[0], &options{})
|
||||||
|
} else {
|
||||||
|
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range out {
|
||||||
|
if 0 != bytes.Compare(out[i], expect[i]) {
|
||||||
|
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Second set of random inputs, same count as the first.
inToAdd := make([][]byte, len(in))
|
||||||
|
|
||||||
|
for i := range inToAdd {
|
||||||
|
inToAdd[i] = make([]byte, size)
|
||||||
|
rand.Read(inToAdd[i])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Fresh random coefficients for the second run.
for i := 0; i < dimIn*len(out); i++ {
|
||||||
|
coeffs[i] = byte(rand.Int31n(256))
|
||||||
|
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
|
||||||
|
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
|
||||||
|
}
|
||||||
|
|
||||||
|
// Do second run by adding to original run
|
||||||
|
_galMulAVX512Parallel81(inToAdd, out, &matrix, true)
|
||||||
|
|
||||||
|
// Every input, including the first, is accumulated onto the earlier result.
for i := range in {
|
||||||
|
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
|
||||||
|
}
|
||||||
|
|
||||||
|
for i := range out {
|
||||||
|
if 0 != bytes.Compare(out[i], expect[i]) {
|
||||||
|
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGaloisAvx512Parallel11 through TestGaloisAvx512Parallel81 run
// testGaloisAvx512Parallelx1 with 1 to 8 input rows (one output row each).
func TestGaloisAvx512Parallel11(t *testing.T) { testGaloisAvx512Parallelx1(t, 1) }
|
||||||
|
func TestGaloisAvx512Parallel21(t *testing.T) { testGaloisAvx512Parallelx1(t, 2) }
|
||||||
|
func TestGaloisAvx512Parallel31(t *testing.T) { testGaloisAvx512Parallelx1(t, 3) }
|
||||||
|
func TestGaloisAvx512Parallel41(t *testing.T) { testGaloisAvx512Parallelx1(t, 4) }
|
||||||
|
func TestGaloisAvx512Parallel51(t *testing.T) { testGaloisAvx512Parallelx1(t, 5) }
|
||||||
|
func TestGaloisAvx512Parallel61(t *testing.T) { testGaloisAvx512Parallelx1(t, 6) }
|
||||||
|
func TestGaloisAvx512Parallel71(t *testing.T) { testGaloisAvx512Parallelx1(t, 7) }
|
||||||
|
func TestGaloisAvx512Parallel81(t *testing.T) { testGaloisAvx512Parallelx1(t, 8) }
|
||||||
|
|
||||||
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
|
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
|
||||||
|
|
||||||
if !defaultOptions.useAVX512 {
|
if !defaultOptions.useAVX512 {
|
||||||
|
|
|
@ -1486,3 +1486,7 @@ func BenchmarkParallel_20x10x05M(b *testing.B) { benchmarkParallel(b, 20, 10, 51
|
||||||
func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
|
func BenchmarkParallel_8x8x1M(b *testing.B) { benchmarkParallel(b, 8, 8, 1<<20) }
|
||||||
func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) }
|
func BenchmarkParallel_8x8x8M(b *testing.B) { benchmarkParallel(b, 8, 8, 8<<20) }
|
||||||
func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) }
|
func BenchmarkParallel_8x8x32M(b *testing.B) { benchmarkParallel(b, 8, 8, 32<<20) }
|
||||||
|
|
||||||
|
// Benchmarks added with the 8x1 routine. Judging by the names, the arguments
// are presumably data shards, parity shards and shard size (8 data shards,
// 3/4/5 parity shards, 1 MiB) — confirm against benchmarkParallel, which is
// defined outside this view.
func BenchmarkParallel_8x3x1M(b *testing.B) { benchmarkParallel(b, 8, 3, 1<<20) }
|
||||||
|
func BenchmarkParallel_8x4x1M(b *testing.B) { benchmarkParallel(b, 8, 4, 1<<20) }
|
||||||
|
func BenchmarkParallel_8x5x1M(b *testing.B) { benchmarkParallel(b, 8, 5, 1<<20) }
|
||||||
|
|
Loading…
Reference in New Issue