AVX512 accelerated version resulting in a 4x speed improvement over AVX2 (#91)

The performance on AVX512 has been accelerated for Intel CPUs. This gives speedups on a per-core basis of up to 4x compared to AVX2 as can be seen in the following table:

```
$ benchcmp avx2.txt avx512.txt
benchmark                      AVX2 MB/s    AVX512 MB/s   speedup
BenchmarkEncode8x8x1M-72       1681.35      4125.64       2.45x
BenchmarkEncode8x4x8M-72       1529.36      5507.97       3.60x
BenchmarkEncode8x8x8M-72        791.16      2952.29       3.73x
BenchmarkEncode8x8x32M-72       573.26      2168.61       3.78x
BenchmarkEncode12x4x12M-72     1234.41      4912.37       3.98x
BenchmarkEncode16x4x16M-72     1189.59      5138.01       4.32x
BenchmarkEncode24x8x24M-72      690.68      2583.70       3.74x
BenchmarkEncode24x8x48M-72      674.20      2643.31       3.92x
```
master
Frank Wessels 2019-02-10 02:17:23 -08:00 committed by Klaus Post
parent 8885f3a1c7
commit 79aee05119
11 changed files with 1211 additions and 39 deletions

View File

@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon
# Changes
## February 8, 2019
AVX512 accelerated version added for Intel Skylake CPUs. This can give up to a 4x speed improvement as compared to AVX2. See [here](https://github.com/klauspost/reedsolomon#performance-on-avx512) for more details.
## December 18, 2018
Assembly code for ppc64le has been contributed, this boosts performance by about 10x on this platform.
@ -253,6 +257,25 @@ BenchmarkReconstruct50x20x1M-8 1364.35 4189.79 3.07x
BenchmarkReconstruct10x4x16M-8 1484.35 5779.53 3.89x
```
# Performance on AVX512
The performance on AVX512 has been accelerated for Intel CPUs. This gives speedups on a per-core basis of up to 4x compared to AVX2 as can be seen in the following table:
```
$ benchcmp avx2.txt avx512.txt
benchmark AVX2 MB/s AVX512 MB/s speedup
BenchmarkEncode8x8x1M-72 1681.35 4125.64 2.45x
BenchmarkEncode8x4x8M-72 1529.36 5507.97 3.60x
BenchmarkEncode8x8x8M-72 791.16 2952.29 3.73x
BenchmarkEncode8x8x32M-72 573.26 2168.61 3.78x
BenchmarkEncode12x4x12M-72 1234.41 4912.37 3.98x
BenchmarkEncode16x4x16M-72 1189.59 5138.01 4.32x
BenchmarkEncode24x8x24M-72 690.68 2583.70 3.74x
BenchmarkEncode24x8x48M-72 674.20 2643.31 3.92x
```
This speedup has been achieved by computing multiple parity blocks in parallel as opposed to one after the other. In doing so it is possible to minimize the memory bandwidth required for loading all data shards. At the same time the calculations are performed in the 512-bit wide ZMM registers and the surplus of ZMM registers (32 in total) is used to keep more data around (most notably the matrix coefficients).
# Performance on ARM64 NEON
By exploiting NEON instructions the performance for ARM has been accelerated. Below are the performance numbers for a single core on an ARM Cortex-A53 CPU @ 1.2GHz (Debian 8.0 Jessie running Go: 1.7.4):

184
galoisAvx512_amd64.go Normal file
View File

@ -0,0 +1,184 @@
//+build !noasm
//+build !appengine
//+build !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
package reedsolomon
//go:noescape
func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
//go:noescape
func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
const (
dimIn = 8 // Number of input rows processed simultaneously
dimOut82 = 2 // Number of output rows processed simultaneously for x2 routine
dimOut84 = 4 // Number of output rows processed simultaneously for x4 routine
matrixSize82 = (16 + 16) * dimIn * dimOut82 // Dimension of slice of matrix coefficient passed into x2 routine
matrixSize84 = (16 + 16) * dimIn * dimOut84 // Dimension of slice of matrix coefficient passed into x4 routine
)
// Construct block of matrix coefficients for 2 outputs rows in parallel
func setupMatrix82(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize82]byte) {
offset := 0
for c := inputOffset; c < inputOffset+dimIn; c++ {
for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
if c < len(matrixRows[iRow]) {
coeff := matrixRows[iRow][c]
copy(matrix[offset*32:], mulTableLow[coeff][:])
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
} else {
// coefficients not used for this input shard (so null out)
v := matrix[offset*32 : offset*32+32]
for i := range v {
v[i] = 0
}
}
offset += dimIn
if offset >= dimIn*dimOut82 {
offset -= dimIn*dimOut82 - 1
}
}
}
}
// Construct block of matrix coefficients for 4 outputs rows in parallel
func setupMatrix84(matrixRows [][]byte, inputOffset, outputOffset int, matrix *[matrixSize84]byte) {
offset := 0
for c := inputOffset; c < inputOffset+dimIn; c++ {
for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
if c < len(matrixRows[iRow]) {
coeff := matrixRows[iRow][c]
copy(matrix[offset*32:], mulTableLow[coeff][:])
copy(matrix[offset*32+16:], mulTableHigh[coeff][:])
} else {
// coefficients not used for this input shard (so null out)
v := matrix[offset*32 : offset*32+32]
for i := range v {
v[i] = 0
}
}
offset += dimIn
if offset >= dimIn*dimOut84 {
offset -= dimIn*dimOut84 - 1
}
}
}
}
// Invoke AVX512 routine for 2 output rows in parallel
func galMulAVX512Parallel82(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
done := len(in[0])
if done == 0 {
return
}
inputEnd := inputOffset + dimIn
if inputEnd > len(in) {
inputEnd = len(in)
}
outputEnd := outputOffset + dimOut82
if outputEnd > len(out) {
outputEnd = len(out)
}
matrix82 := [matrixSize82]byte{}
setupMatrix82(matrixRows, inputOffset, outputOffset, &matrix82)
addTo := inputOffset != 0 // Except for the first input column, add to previous results
_galMulAVX512Parallel82(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix82, addTo)
done = (done >> 6) << 6
if len(in[0])-done == 0 {
return
}
for c := inputOffset; c < inputOffset+dimIn; c++ {
for iRow := outputOffset; iRow < outputOffset+dimOut82; iRow++ {
if c < len(matrixRows[iRow]) {
mt := mulTable[matrixRows[iRow][c]]
for i := done; i < len(in[0]); i++ {
if c == 0 { // only set value for first input column
out[iRow][i] = mt[in[c][i]]
} else { // and add for all others
out[iRow][i] ^= mt[in[c][i]]
}
}
}
}
}
}
// Invoke AVX512 routine for 4 output rows in parallel
func galMulAVX512Parallel84(in, out [][]byte, matrixRows [][]byte, inputOffset, outputOffset int) {
done := len(in[0])
if done == 0 {
return
}
inputEnd := inputOffset + dimIn
if inputEnd > len(in) {
inputEnd = len(in)
}
outputEnd := outputOffset + dimOut84
if outputEnd > len(out) {
outputEnd = len(out)
}
matrix84 := [matrixSize84]byte{}
setupMatrix84(matrixRows, inputOffset, outputOffset, &matrix84)
addTo := inputOffset != 0 // Except for the first input column, add to previous results
_galMulAVX512Parallel84(in[inputOffset:inputEnd], out[outputOffset:outputEnd], &matrix84, addTo)
done = (done >> 6) << 6
if len(in[0])-done == 0 {
return
}
for c := inputOffset; c < inputOffset+dimIn; c++ {
for iRow := outputOffset; iRow < outputOffset+dimOut84; iRow++ {
if c < len(matrixRows[iRow]) {
mt := mulTable[matrixRows[iRow][c]]
for i := done; i < len(in[0]); i++ {
if c == 0 { // only set value for first input column
out[iRow][i] = mt[in[c][i]]
} else { // and add for all others
out[iRow][i] ^= mt[in[c][i]]
}
}
}
}
}
}
// Perform the same as codeSomeShards, but taking advantage of
// AVX512 parallelism for up to 4x faster execution as compared to AVX2
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
outputRow := 0
// First process (multiple) batches of 4 output rows in parallel
for ; outputRow+dimOut84 <= len(outputs); outputRow += dimOut84 {
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
galMulAVX512Parallel84(inputs, outputs, matrixRows, inputRow, outputRow)
}
}
// Then process a (single) batch of 2 output rows in parallel
if outputRow+dimOut82 <= len(outputs) {
// fmt.Println(outputRow, len(outputs))
for inputRow := 0; inputRow < len(inputs); inputRow += dimIn {
galMulAVX512Parallel82(inputs, outputs, matrixRows, inputRow, outputRow)
}
outputRow += dimOut82
}
// Lastly, we may have a single output row left (for uneven parity)
if outputRow < len(outputs) {
for c := 0; c < r.DataShards; c++ {
if c == 0 {
galMulSlice(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
} else {
galMulSliceXor(matrixRows[outputRow][c], inputs[c], outputs[outputRow], &r.o)
}
}
}
}

590
galoisAvx512_amd64.s Normal file
View File

@ -0,0 +1,590 @@
//+build !noasm !appengine !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
//
// Process 2 output rows in parallel from a total of 8 input rows
//
// func _galMulAVX512Parallel82(in, out [][]byte, matrix *[matrixSize82]byte, addTo bool)
TEXT ·_galMulAVX512Parallel82(SB), 7, $0
MOVQ in+0(FP), SI //
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel82
MOVQ matrix+48(FP), SI
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
MOVQ $15, BX
MOVQ BX, X5
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
MOVB addTo+56(FP), AX
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
WORD $0xf749; BYTE $0xe0 // mul r8
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel82:
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
MOVQ (SI), BX // BX: &in[0][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $1
JE skip_avx512_parallel82
MOVQ 24(SI), BX // BX: &in[1][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $2
JE skip_avx512_parallel82
MOVQ 48(SI), BX // BX: &in[2][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $3
JE skip_avx512_parallel82
MOVQ 72(SI), BX // BX: &in[3][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $4
JE skip_avx512_parallel82
MOVQ 96(SI), BX // BX: &in[4][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $5
JE skip_avx512_parallel82
MOVQ 120(SI), BX // BX: &in[5][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $6
JE skip_avx512_parallel82
MOVQ 144(SI), BX // BX: &in[6][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
CMPQ AX, $7
JE skip_avx512_parallel82
MOVQ 168(SI), BX // BX: &in[7][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
skip_avx512_parallel82:
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
ADDQ $64, CX // out2+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel82
done_avx512_parallel82:
VZEROUPPER
RET
//
// Process 4 output rows in parallel from a total of 8 input rows
//
// func _galMulAVX512Parallel84(in, out [][]byte, matrix *[matrixSize84]byte, addTo bool)
TEXT ·_galMulAVX512Parallel84(SB), 7, $0
MOVQ in+0(FP), SI //
MOVQ 8(SI), R9 // R9: len(in)
SHRQ $6, R9 // len(in) / 64
TESTQ R9, R9
JZ done_avx512_parallel84
MOVQ matrix+48(FP), SI
LONG $0x48fee162; WORD $0x066f // VMOVDQU64 ZMM16, 0x000[rsi]
LONG $0x48fee162; WORD $0x4e6f; BYTE $0x01 // VMOVDQU64 ZMM17, 0x040[rsi]
LONG $0x48fee162; WORD $0x566f; BYTE $0x02 // VMOVDQU64 ZMM18, 0x080[rsi]
LONG $0x48fee162; WORD $0x5e6f; BYTE $0x03 // VMOVDQU64 ZMM19, 0x0c0[rsi]
LONG $0x48fee162; WORD $0x666f; BYTE $0x04 // VMOVDQU64 ZMM20, 0x100[rsi]
LONG $0x48fee162; WORD $0x6e6f; BYTE $0x05 // VMOVDQU64 ZMM21, 0x140[rsi]
LONG $0x48fee162; WORD $0x766f; BYTE $0x06 // VMOVDQU64 ZMM22, 0x180[rsi]
LONG $0x48fee162; WORD $0x7e6f; BYTE $0x07 // VMOVDQU64 ZMM23, 0x1c0[rsi]
LONG $0x48fe6162; WORD $0x466f; BYTE $0x08 // VMOVDQU64 ZMM24, 0x200[rsi]
LONG $0x48fe6162; WORD $0x4e6f; BYTE $0x09 // VMOVDQU64 ZMM25, 0x240[rsi]
LONG $0x48fe6162; WORD $0x566f; BYTE $0x0a // VMOVDQU64 ZMM26, 0x280[rsi]
LONG $0x48fe6162; WORD $0x5e6f; BYTE $0x0b // VMOVDQU64 ZMM27, 0x2c0[rsi]
LONG $0x48fe6162; WORD $0x666f; BYTE $0x0c // VMOVDQU64 ZMM28, 0x300[rsi]
LONG $0x48fe6162; WORD $0x6e6f; BYTE $0x0d // VMOVDQU64 ZMM29, 0x340[rsi]
LONG $0x48fe6162; WORD $0x766f; BYTE $0x0e // VMOVDQU64 ZMM30, 0x380[rsi]
LONG $0x48fe6162; WORD $0x7e6f; BYTE $0x0f // VMOVDQU64 ZMM31, 0x3c0[rsi]
MOVQ $15, BX
MOVQ BX, X5
LONG $0x487df262; WORD $0xd578 // VPBROADCASTB ZMM2, XMM5
MOVB addTo+56(FP), AX
LONG $0xffc0c749; WORD $0xffff; BYTE $0xff // mov r8, -1
WORD $0xf749; BYTE $0xe0 // mul r8
LONG $0x92fbe1c4; BYTE $0xc8 // kmovq k1, rax
MOVQ in+0(FP), SI // SI: &in
MOVQ in_len+8(FP), AX // number of inputs
XORQ R11, R11
MOVQ out+24(FP), DX
MOVQ 24(DX), CX // CX: &out[1][0]
MOVQ 48(DX), R10 // R10: &out[2][0]
MOVQ 72(DX), R12 // R12: &out[3][0]
MOVQ (DX), DX // DX: &out[0][0]
loopback_avx512_parallel84:
LONG $0xc9fef162; WORD $0x226f // VMOVDQU64 ZMM4{k1}{z}, [rdx]
LONG $0xc9fef162; WORD $0x296f // VMOVDQU64 ZMM5{k1}{z}, [rcx]
LONG $0xc9fed162; WORD $0x326f // VMOVDQU64 ZMM6{k1}{z}, [r10]
LONG $0xc9fed162; WORD $0x3c6f; BYTE $0x24 // VMOVDQU64 ZMM7{k1}{z}, [r12]
MOVQ (SI), BX // BX: &in[0][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0x00
LONG $0x40fd3362; WORD $0xf843; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0x00
LONG $0x40dd3362; WORD $0xec43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40bd1362; WORD $0xd043; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0x00
LONG $0x40bd1362; WORD $0xd843; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x409d1362; WORD $0xc443; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0x00
LONG $0x409d1362; WORD $0xcc43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $1
JE skip_avx512_parallel84
MOVQ 24(SI), BX // BX: &in[1][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40fd3362; WORD $0xf043; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM16, ZMM16, 0xaa
LONG $0x40fd3362; WORD $0xf843; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM16, ZMM16, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40dd3362; WORD $0xe443; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM20, ZMM20, 0xaa
LONG $0x40dd3362; WORD $0xec43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM20, ZMM20, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40bd1362; WORD $0xd043; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM24, ZMM24, 0xaa
LONG $0x40bd1362; WORD $0xd843; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM24, ZMM24, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x409d1362; WORD $0xc443; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM28, ZMM28, 0xaa
LONG $0x409d1362; WORD $0xcc43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM28, ZMM28, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $2
JE skip_avx512_parallel84
MOVQ 48(SI), BX // BX: &in[2][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0x00
LONG $0x40f53362; WORD $0xf943; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0x00
LONG $0x40d53362; WORD $0xed43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40b51362; WORD $0xd143; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0x00
LONG $0x40b51362; WORD $0xd943; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40951362; WORD $0xc543; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0x00
LONG $0x40951362; WORD $0xcd43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $3
JE skip_avx512_parallel84
MOVQ 72(SI), BX // BX: &in[3][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40f53362; WORD $0xf143; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM17, ZMM17, 0xaa
LONG $0x40f53362; WORD $0xf943; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM17, ZMM17, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40d53362; WORD $0xe543; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM21, ZMM21, 0xaa
LONG $0x40d53362; WORD $0xed43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM21, ZMM21, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40b51362; WORD $0xd143; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM25, ZMM25, 0xaa
LONG $0x40b51362; WORD $0xd943; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM25, ZMM25, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40951362; WORD $0xc543; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM29, ZMM29, 0xaa
LONG $0x40951362; WORD $0xcd43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM29, ZMM29, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $4
JE skip_avx512_parallel84
MOVQ 96(SI), BX // BX: &in[4][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0x00
LONG $0x40ed3362; WORD $0xfa43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0x00
LONG $0x40cd3362; WORD $0xee43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40ad1362; WORD $0xd243; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0x00
LONG $0x40ad1362; WORD $0xda43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x408d1362; WORD $0xc643; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0x00
LONG $0x408d1362; WORD $0xce43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $5
JE skip_avx512_parallel84
MOVQ 120(SI), BX // BX: &in[5][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40ed3362; WORD $0xf243; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM18, ZMM18, 0xaa
LONG $0x40ed3362; WORD $0xfa43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM18, ZMM18, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40cd3362; WORD $0xe643; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM22, ZMM22, 0xaa
LONG $0x40cd3362; WORD $0xee43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM22, ZMM22, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40ad1362; WORD $0xd243; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM26, ZMM26, 0xaa
LONG $0x40ad1362; WORD $0xda43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM26, ZMM26, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x408d1362; WORD $0xc643; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM30, ZMM30, 0xaa
LONG $0x408d1362; WORD $0xce43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM30, ZMM30, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $6
JE skip_avx512_parallel84
MOVQ 144(SI), BX // BX: &in[6][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0x00 // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0x00
LONG $0x40e53362; WORD $0xfb43; BYTE $0x55 // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0x55
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0x00 // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0x00
LONG $0x40c53362; WORD $0xef43; BYTE $0x55 // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0x55
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40a51362; WORD $0xd343; BYTE $0x00 // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0x00
LONG $0x40a51362; WORD $0xdb43; BYTE $0x55 // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0x55
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40851362; WORD $0xc743; BYTE $0x00 // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0x00
LONG $0x40851362; WORD $0xcf43; BYTE $0x55 // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0x55
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
CMPQ AX, $7
JE skip_avx512_parallel84
MOVQ 168(SI), BX // BX: &in[7][0]
LONG $0x48feb162; WORD $0x046f; BYTE $0x1b // VMOVDQU64 ZMM0, [rbx+r11]
LONG $0x40e53362; WORD $0xf343; BYTE $0xaa // VSHUFI64x2 ZMM14, ZMM19, ZMM19, 0xaa
LONG $0x40e53362; WORD $0xfb43; BYTE $0xff // VSHUFI64x2 ZMM15, ZMM19, ZMM19, 0xff
LONG $0x48f5f162; WORD $0xd073; BYTE $0x04 // VPSRLQ ZMM1, ZMM0, 4 ; high input
LONG $0x48fdf162; WORD $0xc2db // VPANDQ ZMM0, ZMM0, ZMM2 ; low input
LONG $0x48f5f162; WORD $0xcadb // VPANDQ ZMM1, ZMM1, ZMM2 ; high input
LONG $0x480d7262; WORD $0xf000 // VPSHUFB ZMM14, ZMM14, ZMM0 ; mul low part
LONG $0x48057262; WORD $0xf900 // VPSHUFB ZMM15, ZMM15, ZMM1 ; mul high part
LONG $0x488d5162; WORD $0xf7ef // VPXORQ ZMM14, ZMM14, ZMM15 ; result
LONG $0x48ddd162; WORD $0xe6ef // VPXORQ ZMM4, ZMM4, ZMM14
LONG $0x40c53362; WORD $0xe743; BYTE $0xaa // VSHUFI64x2 ZMM12, ZMM23, ZMM23, 0xaa
LONG $0x40c53362; WORD $0xef43; BYTE $0xff // VSHUFI64x2 ZMM13, ZMM23, ZMM23, 0xff
LONG $0x481d7262; WORD $0xe000 // VPSHUFB ZMM12, ZMM12, ZMM0 ; mul low part
LONG $0x48157262; WORD $0xe900 // VPSHUFB ZMM13, ZMM13, ZMM1 ; mul high part
LONG $0x489d5162; WORD $0xe5ef // VPXORQ ZMM12, ZMM12, ZMM13 ; result
LONG $0x48d5d162; WORD $0xecef // VPXORQ ZMM5, ZMM5, ZMM12
LONG $0x40a51362; WORD $0xd343; BYTE $0xaa // VSHUFI64x2 ZMM10, ZMM27, ZMM27, 0xaa
LONG $0x40a51362; WORD $0xdb43; BYTE $0xff // VSHUFI64x2 ZMM11, ZMM27, ZMM27, 0xff
LONG $0x482d7262; WORD $0xd000 // VPSHUFB ZMM10, ZMM10, ZMM0 ; mul low part
LONG $0x48257262; WORD $0xd900 // VPSHUFB ZMM11, ZMM11, ZMM1 ; mul high part
LONG $0x48ad5162; WORD $0xd3ef // VPXORQ ZMM10, ZMM10, ZMM11 ; result
LONG $0x48cdd162; WORD $0xf2ef // VPXORQ ZMM6, ZMM6, ZMM10
LONG $0x40851362; WORD $0xc743; BYTE $0xaa // VSHUFI64x2 ZMM8, ZMM31, ZMM31, 0xaa
LONG $0x40851362; WORD $0xcf43; BYTE $0xff // VSHUFI64x2 ZMM9, ZMM31, ZMM31, 0xff
LONG $0x483d7262; WORD $0xc000 // VPSHUFB ZMM8, ZMM8, ZMM0 ; mul low part
LONG $0x48357262; WORD $0xc900 // VPSHUFB ZMM9, ZMM9, ZMM1 ; mul high part
LONG $0x48bd5162; WORD $0xc1ef // VPXORQ ZMM8, ZMM8, ZMM9 ; result
LONG $0x48c5d162; WORD $0xf8ef // VPXORQ ZMM7, ZMM7, ZMM8
skip_avx512_parallel84:
LONG $0x48fef162; WORD $0x227f // VMOVDQU64 [rdx], ZMM4
LONG $0x48fef162; WORD $0x297f // VMOVDQU64 [rcx], ZMM5
LONG $0x48fed162; WORD $0x327f // VMOVDQU64 [r10], ZMM6
LONG $0x48fed162; WORD $0x3c7f; BYTE $0x24 // VMOVDQU64 [r12], ZMM7
ADDQ $64, R11 // in4+=64
ADDQ $64, DX // out+=64
ADDQ $64, CX // out2+=64
ADDQ $64, R10 // out3+=64
ADDQ $64, R12 // out4+=64
SUBQ $1, R9
JNZ loopback_avx512_parallel84
done_avx512_parallel84:
VZEROUPPER
RET

346
galoisAvx512_amd64_test.go Normal file
View File

@ -0,0 +1,346 @@
//+build !noasm
//+build !appengine
//+build !gccgo
// Copyright 2015, Klaus Post, see LICENSE for details.
// Copyright 2019, Minio, Inc.
package reedsolomon
import (
"bytes"
"math/rand"
"testing"
"time"
)
func testGaloisAvx512Parallelx2(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
return
}
rand.Seed(time.Now().UnixNano())
const size = 1024 * 1024
in, out := make([][]byte, inputSize), make([][]byte, dimOut82)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut82]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel82(in, out, &matrix, false)
expect := make([][]byte, len(out))
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel82(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel12(t *testing.T) { testGaloisAvx512Parallelx2(t, 1) }
func TestGaloisAvx512Parallel22(t *testing.T) { testGaloisAvx512Parallelx2(t, 2) }
func TestGaloisAvx512Parallel32(t *testing.T) { testGaloisAvx512Parallelx2(t, 3) }
func TestGaloisAvx512Parallel42(t *testing.T) { testGaloisAvx512Parallelx2(t, 4) }
func TestGaloisAvx512Parallel52(t *testing.T) { testGaloisAvx512Parallelx2(t, 5) }
func TestGaloisAvx512Parallel62(t *testing.T) { testGaloisAvx512Parallelx2(t, 6) }
func TestGaloisAvx512Parallel72(t *testing.T) { testGaloisAvx512Parallelx2(t, 7) }
func TestGaloisAvx512Parallel82(t *testing.T) { testGaloisAvx512Parallelx2(t, 8) }
func testGaloisAvx512Parallelx4(t *testing.T, inputSize int) {
if !defaultOptions.useAVX512 {
return
}
rand.Seed(time.Now().UnixNano())
const size = 1024 * 1024
in, out := make([][]byte, inputSize), make([][]byte, dimOut84)
for i := range in {
in[i] = make([]byte, size)
rand.Read(in[i])
}
for i := range out {
out[i] = make([]byte, size)
rand.Read(out[i])
}
opts := defaultOptions
opts.useSSSE3 = true
matrix := [(16 + 16) * dimIn * dimOut84]byte{}
coeffs := make([]byte, dimIn*len(out))
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do first run with clearing out any existing results
_galMulAVX512Parallel84(in, out, &matrix, false)
expect := make([][]byte, 4)
for i := range expect {
expect[i] = make([]byte, size)
rand.Read(expect[i])
}
for i := range in {
if i == 0 {
galMulSlice(coeffs[i], in[i], expect[0], &options{})
galMulSlice(coeffs[dimIn+i], in[i], expect[1], &options{})
galMulSlice(coeffs[dimIn*2+i], in[i], expect[2], &options{})
galMulSlice(coeffs[dimIn*3+i], in[i], expect[3], &options{})
} else {
galMulSliceXor(coeffs[i], in[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], in[i], expect[1], &options{})
galMulSliceXor(coeffs[dimIn*2+i], in[i], expect[2], &options{})
galMulSliceXor(coeffs[dimIn*3+i], in[i], expect[3], &options{})
}
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
inToAdd := make([][]byte, len(in))
for i := range inToAdd {
inToAdd[i] = make([]byte, size)
rand.Read(inToAdd[i])
}
for i := 0; i < dimIn*len(out); i++ {
coeffs[i] = byte(rand.Int31n(256))
copy(matrix[i*32:], mulTableLow[coeffs[i]][:])
copy(matrix[i*32+16:], mulTableHigh[coeffs[i]][:])
}
// Do second run by adding to original run
_galMulAVX512Parallel84(inToAdd, out, &matrix, true)
for i := range in {
galMulSliceXor(coeffs[i], inToAdd[i], expect[0], &options{})
galMulSliceXor(coeffs[dimIn+i], inToAdd[i], expect[1], &options{})
galMulSliceXor(coeffs[dimIn*2+i], inToAdd[i], expect[2], &options{})
galMulSliceXor(coeffs[dimIn*3+i], inToAdd[i], expect[3], &options{})
}
for i := range out {
if 0 != bytes.Compare(out[i], expect[i]) {
t.Errorf("got [%d]%#v...,\n expected [%d]%#v...", i, out[i][:8], i, expect[i][:8])
}
}
}
func TestGaloisAvx512Parallel14(t *testing.T) { testGaloisAvx512Parallelx4(t, 1) }
func TestGaloisAvx512Parallel24(t *testing.T) { testGaloisAvx512Parallelx4(t, 2) }
func TestGaloisAvx512Parallel34(t *testing.T) { testGaloisAvx512Parallelx4(t, 3) }
func TestGaloisAvx512Parallel44(t *testing.T) { testGaloisAvx512Parallelx4(t, 4) }
func TestGaloisAvx512Parallel54(t *testing.T) { testGaloisAvx512Parallelx4(t, 5) }
func TestGaloisAvx512Parallel64(t *testing.T) { testGaloisAvx512Parallelx4(t, 6) }
func TestGaloisAvx512Parallel74(t *testing.T) { testGaloisAvx512Parallelx4(t, 7) }
func TestGaloisAvx512Parallel84(t *testing.T) { testGaloisAvx512Parallelx4(t, 8) }
func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int) {
if !defaultOptions.useAVX512 {
return
}
var data = make([]byte, l)
fillRandom(data)
enc, _ := New(ds, ps)
r := enc.(*reedSolomon) // need to access private methods
shards, _ := enc.Split(data)
// Fill shards to encode with garbage
for i := r.DataShards; i < r.DataShards+r.ParityShards; i++ {
rand.Read(shards[i])
}
r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
correct, _ := r.Verify(shards)
if !correct {
t.Errorf("Verification of encoded shards failed")
}
}
func testCodeSomeShardsAvx512(t *testing.T, ds, ps int) {
if !defaultOptions.useAVX512 {
return
}
for l := 1; l <= 8192; l++ {
testCodeSomeShardsAvx512WithLength(t, ds, ps, l)
}
}
func TestCodeSomeShardsAvx512_8x2(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 2) }
func TestCodeSomeShardsAvx512_1x4(t *testing.T) { testCodeSomeShardsAvx512(t, 1, 4) }
func TestCodeSomeShardsAvx512_2x4(t *testing.T) { testCodeSomeShardsAvx512(t, 2, 4) }
func TestCodeSomeShardsAvx512_3x4(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 4) }
func TestCodeSomeShardsAvx512_4x4(t *testing.T) { testCodeSomeShardsAvx512(t, 4, 4) }
func TestCodeSomeShardsAvx512_5x4(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 4) }
func TestCodeSomeShardsAvx512_6x4(t *testing.T) { testCodeSomeShardsAvx512(t, 6, 4) }
func TestCodeSomeShardsAvx512_7x4(t *testing.T) { testCodeSomeShardsAvx512(t, 7, 4) }
func TestCodeSomeShardsAvx512_8x4(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 4) }
func TestCodeSomeShardsAvx512_9x4(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 4) }
func TestCodeSomeShardsAvx512_10x4(t *testing.T) { testCodeSomeShardsAvx512(t, 10, 4) }
func TestCodeSomeShardsAvx512_12x4(t *testing.T) { testCodeSomeShardsAvx512(t, 12, 4) }
func TestCodeSomeShardsAvx512_16x4(t *testing.T) { testCodeSomeShardsAvx512(t, 16, 4) }
func TestCodeSomeShardsAvx512_3x6(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 6) }
func TestCodeSomeShardsAvx512_8x6(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 6) }
func TestCodeSomeShardsAvx512_8x7(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 7) }
func TestCodeSomeShardsAvx512_3x8(t *testing.T) { testCodeSomeShardsAvx512(t, 3, 8) }
func TestCodeSomeShardsAvx512_8x8(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 8) }
func TestCodeSomeShardsAvx512_5x10(t *testing.T) { testCodeSomeShardsAvx512(t, 5, 10) }
func TestCodeSomeShardsAvx512_8x10(t *testing.T) { testCodeSomeShardsAvx512(t, 8, 10) }
func TestCodeSomeShardsAvx512_9x10(t *testing.T) { testCodeSomeShardsAvx512(t, 9, 10) }
func TestCodeSomeShardsAvx512_Manyx4(t *testing.T) {
if !defaultOptions.useAVX512 {
return
}
for inputs := 1; inputs <= 200; inputs++ {
testCodeSomeShardsAvx512WithLength(t, inputs, 4, 1024+33)
}
}
func TestCodeSomeShardsAvx512_ManyxMany(t *testing.T) {
if !defaultOptions.useAVX512 {
return
}
for outputs := 1; outputs <= 32; outputs++ {
for inputs := 1; inputs <= 32; inputs++ {
testCodeSomeShardsAvx512WithLength(t, inputs, outputs, 1024+33)
}
}
}
func benchmarkAvx512Encode(b *testing.B, dataShards, parityShards, shardSize int) {
if !defaultOptions.useAVX512 {
return
}
enc, err := New(dataShards, parityShards)
if err != nil {
b.Fatal(err)
}
r := enc.(*reedSolomon) // need to access private methods
shards := make([][]byte, dataShards+parityShards)
for s := range shards {
shards[s] = make([]byte, shardSize)
}
rand.Seed(0)
for s := 0; s < dataShards; s++ {
fillRandom(shards[s])
}
b.SetBytes(int64(shardSize * dataShards))
b.ResetTimer()
for i := 0; i < b.N; i++ {
// Do the coding.
r.codeSomeShardsAvx512(r.parity, shards[0:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0]))
}
}
// Benchmark various combination of data shards and parity shards for AVX512 accelerated code
func BenchmarkEncodeAvx512_8x4x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 4, 8*1024*1024) }
func BenchmarkEncodeAvx512_12x4x12M(b *testing.B) { benchmarkAvx512Encode(b, 12, 4, 12*1024*1024) }
func BenchmarkEncodeAvx512_16x4x16M(b *testing.B) { benchmarkAvx512Encode(b, 16, 4, 16*1024*1024) }
func BenchmarkEncodeAvx512_16x4x32M(b *testing.B) { benchmarkAvx512Encode(b, 16, 4, 32*1024*1024) }
func BenchmarkEncodeAvx512_16x4x64M(b *testing.B) { benchmarkAvx512Encode(b, 16, 4, 64*1024*1024) }
func BenchmarkEncodeAvx512_8x5x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 5, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x6x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 6, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x7x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 7, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x9x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 9, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x10x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 10, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x11x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 11, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x8x05M(b *testing.B) { benchmarkAvx512Encode(b, 8, 8, 1*1024*1024/2) }
func BenchmarkEncodeAvx512_8x8x1M(b *testing.B) { benchmarkAvx512Encode(b, 8, 8, 1*1024*1024) }
func BenchmarkEncodeAvx512_8x8x8M(b *testing.B) { benchmarkAvx512Encode(b, 8, 8, 8*1024*1024) }
func BenchmarkEncodeAvx512_8x8x32M(b *testing.B) { benchmarkAvx512Encode(b, 8, 8, 32*1024*1024) }
func BenchmarkEncodeAvx512_24x8x24M(b *testing.B) { benchmarkAvx512Encode(b, 24, 8, 24*1024*1024) }
func BenchmarkEncodeAvx512_24x8x48M(b *testing.B) { benchmarkAvx512Encode(b, 24, 8, 48*1024*1024) }

View File

@ -40,12 +40,12 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
}
*/
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSlice(c byte, in, out []byte, o *options) {
var done int
if avx2 {
if o.useAVX2 {
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 5) << 5
} else if ssse3 {
} else if o.useSSSE3 {
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}
@ -58,12 +58,12 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
}
}
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSliceXor(c byte, in, out []byte, o *options) {
var done int
if avx2 {
if o.useAVX2 {
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 5) << 5
} else if ssse3 {
} else if o.useSSSE3 {
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}

View File

@ -13,7 +13,7 @@ func galMulNEON(c uint64, in, out []byte)
//go:noescape
func galMulXorNEON(c uint64, in, out []byte)
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSlice(c byte, in, out []byte, o *options) {
var done int
galMulNEON(uint64(c), in, out)
done = (len(in) >> 5) << 5
@ -27,7 +27,7 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
}
}
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSliceXor(c byte, in, out []byte, o *options) {
var done int
galMulXorNEON(uint64(c), in, out)
done = (len(in) >> 5) << 5
@ -47,3 +47,6 @@ func sliceXor(in, out []byte, sse2 bool) {
out[n] ^= input
}
}
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
}

View File

@ -6,14 +6,14 @@
package reedsolomon
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSlice(c byte, in, out []byte, o *options) {
mt := mulTable[c]
for n, input := range in {
out[n] = mt[input]
}
}
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSliceXor(c byte, in, out []byte, o *options) {
mt := mulTable[c]
for n, input := range in {
out[n] ^= mt[input]
@ -26,3 +26,6 @@ func sliceXor(in, out []byte, sse2 bool) {
out[n] ^= input
}
}
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
}

View File

@ -31,7 +31,7 @@ func galMulPpcXor(low, high, in, out []byte) {
}
*/
func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSlice(c byte, in, out []byte, o *options) {
done := (len(in) >> 4) << 4
if done > 0 {
galMulPpc(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
@ -45,7 +45,7 @@ func galMulSlice(c byte, in, out []byte, ssse3, avx2 bool) {
}
}
func galMulSliceXor(c byte, in, out []byte, ssse3, avx2 bool) {
func galMulSliceXor(c byte, in, out []byte, o *options) {
done := (len(in) >> 4) << 4
if done > 0 {
galMulPpcXor(mulTableLow[c][:], mulTableHigh[c][:], in[:done], out)
@ -65,3 +65,6 @@ func sliceXor(in, out []byte, sse2 bool) {
out[n] ^= input
}
}
func (r reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
}

View File

@ -116,7 +116,7 @@ func TestExp(t *testing.T) {
}
}
func testGalois(t *testing.T, ssse3, avx2 bool) {
func testGalois(t *testing.T, o *options) {
// These values were copied output of the Python code.
if galMultiply(3, 4) != 12 {
t.Fatal("galMultiply(3, 4) != 12")
@ -131,25 +131,25 @@ func testGalois(t *testing.T, ssse3, avx2 bool) {
// Test slices (>32 entries to test assembler -- AVX2 & NEON)
in := []byte{0, 1, 2, 3, 4, 5, 6, 10, 50, 100, 150, 174, 201, 255, 99, 32, 67, 85, 200, 199, 198, 197, 196, 195, 194, 193, 192, 191, 190, 189, 188, 187, 186, 185}
out := make([]byte, len(in))
galMulSlice(25, in, out, ssse3, avx2)
galMulSlice(25, in, out, o)
expect := []byte{0x0, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0xfa, 0xb8, 0x6d, 0xc7, 0x85, 0xc3, 0x1f, 0x22, 0x7, 0x25, 0xfe, 0xda, 0x5d, 0x44, 0x6f, 0x76, 0x39, 0x20, 0xb, 0x12, 0x11, 0x8, 0x23, 0x3a, 0x75, 0x6c, 0x47}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}
expectXor := []byte{0x0, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0x2f, 0x79, 0xf2, 0x7, 0x51, 0xd4, 0x19, 0x31, 0xc9, 0xf8, 0xfc, 0xf9, 0x4f, 0x62, 0x15, 0x38, 0xfb, 0xd6, 0xa1, 0x8c, 0x96, 0xbb, 0xcc, 0xe1, 0x22, 0xf, 0x78}
galMulSliceXor(52, in, out, ssse3, avx2)
galMulSliceXor(52, in, out, o)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}
galMulSlice(177, in, out, ssse3, avx2)
galMulSlice(177, in, out, o)
expect = []byte{0x0, 0xb1, 0x7f, 0xce, 0xfe, 0x4f, 0x81, 0x9e, 0x3, 0x6, 0xe8, 0x75, 0xbd, 0x40, 0x36, 0xa3, 0x95, 0xcb, 0xc, 0xdd, 0x6c, 0xa2, 0x13, 0x23, 0x92, 0x5c, 0xed, 0x1b, 0xaa, 0x64, 0xd5, 0xe5, 0x54, 0x9a}
if 0 != bytes.Compare(out, expect) {
t.Errorf("got %#v, expected %#v", out, expect)
}
expectXor = []byte{0x0, 0xc4, 0x95, 0x51, 0x37, 0xf3, 0xa2, 0xfb, 0xec, 0xc5, 0xd0, 0xc7, 0x53, 0x88, 0xa3, 0xa5, 0x6, 0x78, 0x97, 0x9f, 0x5b, 0xa, 0xce, 0xa8, 0x6c, 0x3d, 0xf9, 0xdf, 0x1b, 0x4a, 0x8e, 0xe8, 0x2c, 0x7d}
galMulSliceXor(117, in, out, ssse3, avx2)
galMulSliceXor(117, in, out, o)
if 0 != bytes.Compare(out, expectXor) {
t.Errorf("got %#v, expected %#v", out, expectXor)
}
@ -167,14 +167,18 @@ func testGalois(t *testing.T, ssse3, avx2 bool) {
func TestGalois(t *testing.T) {
// invoke with all combinations of asm instructions
testGalois(t, false, false)
testGalois(t, true, false)
o := options{}
o.useSSSE3, o.useAVX2 = false, false
testGalois(t, &o)
o.useSSSE3, o.useAVX2 = true, false
testGalois(t, &o)
if defaultOptions.useAVX2 {
testGalois(t, false, true)
o.useSSSE3, o.useAVX2 = false, true
testGalois(t, &o)
}
}
func TestSliceGalADD(t *testing.T) {
func TestSliceGalAdd(t *testing.T) {
lengthList := []int{16, 32, 34}
for _, length := range lengthList {
@ -225,10 +229,13 @@ func benchmarkGalois(b *testing.B, size int) {
in := make([]byte, size)
out := make([]byte, size)
o := options{}
o.useSSSE3, o.useAVX2 = false, true
b.SetBytes(int64(size))
b.ResetTimer()
for i := 0; i < b.N; i++ {
galMulSlice(25, in[:], out[:], true, false)
galMulSlice(25, in[:], out[:], &o)
}
}
@ -244,10 +251,13 @@ func benchmarkGaloisXor(b *testing.B, size int) {
in := make([]byte, size)
out := make([]byte, size)
o := options{}
o.useSSSE3, o.useAVX2 = true, false
b.SetBytes(int64(size))
b.ResetTimer()
for i := 0; i < b.N; i++ {
galMulSliceXor(177, in[:], out[:], true, false)
galMulSliceXor(177, in[:], out[:], &o)
}
}

View File

@ -10,12 +10,12 @@ import (
type Option func(*options)
type options struct {
maxGoroutines int
minSplitSize int
useAVX2, useSSSE3, useSSE2 bool
usePAR1Matrix bool
useCauchy bool
shardSize int
maxGoroutines int
minSplitSize int
useAVX512, useAVX2, useSSSE3, useSSE2 bool
usePAR1Matrix bool
useCauchy bool
shardSize int
}
var defaultOptions = options{
@ -29,8 +29,9 @@ func init() {
}
// Detect CPU capabilities.
defaultOptions.useSSSE3 = cpuid.CPU.SSSE3()
defaultOptions.useAVX2 = cpuid.CPU.AVX2()
defaultOptions.useSSE2 = cpuid.CPU.SSE2()
defaultOptions.useAVX2 = cpuid.CPU.AVX2()
defaultOptions.useAVX512 = cpuid.CPU.AVX512F() && cpuid.CPU.AVX512BW()
}
// WithMaxGoroutines is the maximum number of goroutines number for encoding & decoding.
@ -88,6 +89,12 @@ func withSSE2(enabled bool) Option {
}
}
func withAVX512(enabled bool) Option {
return func(o *options) {
o.useAVX512 = enabled
}
}
// WithPAR1Matrix causes the encoder to build the matrix how PARv1
// does. Note that the method they use is buggy, and may lead to cases
// where recovery is impossible, even if there are enough parity

View File

@ -372,7 +372,7 @@ func (r reedSolomon) updateParityShards(matrixRows, oldinputs, newinputs, output
// oldinputs data will be change
sliceXor(in, oldin, r.o.useSSE2)
for iRow := 0; iRow < outputCount; iRow++ {
galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], oldin, outputs[iRow], &r.o)
}
}
}
@ -399,7 +399,7 @@ func (r reedSolomon) updateParityShardsP(matrixRows, oldinputs, newinputs, outpu
// oldinputs data will be change
sliceXor(in[start:stop], oldin[start:stop], r.o.useSSE2)
for iRow := 0; iRow < outputCount; iRow++ {
galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], oldin[start:stop], outputs[iRow][start:stop], &r.o)
}
}
wg.Done()
@ -437,7 +437,10 @@ func (r reedSolomon) Verify(shards [][]byte) (bool, error) {
// number of matrix rows used, is determined by
// outputCount, which is the number of outputs to compute.
func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
if r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2 {
r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount)
return
} else if r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize {
r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount)
return
}
@ -445,9 +448,9 @@ func (r reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, output
in := inputs[c]
for iRow := 0; iRow < outputCount; iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
galMulSlice(matrixRows[iRow][c], in, outputs[iRow], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
}
}
}
@ -474,9 +477,9 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
in := inputs[c][start:stop]
for iRow := 0; iRow < outputCount; iRow++ {
if c == 0 {
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
} else {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][start:stop], &r.o)
}
}
}
@ -501,7 +504,7 @@ func (r reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outpu
for c := 0; c < r.DataShards; c++ {
in := inputs[c]
for iRow := 0; iRow < outputCount; iRow++ {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
}
}
@ -545,7 +548,7 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
mu.RUnlock()
in := inputs[c][start : start+do]
for iRow := 0; iRow < outputCount; iRow++ {
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], r.o.useSSSE3, r.o.useAVX2)
galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow], &r.o)
}
}