PSHUFB is S(upplemental)-SSE3, not plain SSE3.
parent
d31049df42
commit
dc9cd67c8c
12
README.md
12
README.md
|
@ -156,12 +156,12 @@ Performance depends mainly on the number of parity shards. In rough terms, doubl
|
|||
|
||||
Here are the throughput numbers for several combinations of data and parity shards. For reference, each shard is 1 MB of random data, and 2 CPU cores are used for encoding.
|
||||
|
||||
| Data | Parity | Parity | MB/s | SSE3 MB/s | SSE3 Speed | Rel. Speed |
|
||||
|------|--------|--------|--------|------------|------------|------------|
|
||||
| 5 | 2 | 40% | 576,11 | 2599,2 | 451% | 100,00% |
|
||||
| 10 | 2 | 20% | 587,73 | 3100,28 | 528% | 102,02% |
|
||||
| 10 | 4 | 40% | 298,38 | 2470,97 | 828% | 51,79% |
|
||||
| 50 | 20 | 40% | 59,81 | 713,28 | 1193% | 10,38% |
|
||||
| Data | Parity | Parity | MB/s | SSSE3 MB/s | SSSE3 Speed | Rel. Speed |
|
||||
|------|--------|--------|--------|-------------|-------------|------------|
|
||||
| 5 | 2 | 40% | 576,11 | 2599,2 | 451% | 100,00% |
|
||||
| 10 | 2 | 20% | 587,73 | 3100,28 | 528% | 102,02% |
|
||||
| 10 | 4 | 40% | 298,38 | 2470,97 | 828% | 51,79% |
|
||||
| 50 | 20 | 40% | 59,81 | 713,28 | 1193% | 10,38% |
|
||||
|
||||
If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.
|
||||
|
||||
|
|
|
@ -9,12 +9,12 @@ import (
|
|||
"github.com/klauspost/cpuid"
|
||||
)
|
||||
|
||||
func galMulSSE3(low, high, in, out []byte)
|
||||
func galMulSSE3Xor(low, high, in, out []byte)
|
||||
func galMulSSSE3(low, high, in, out []byte)
|
||||
func galMulSSSE3Xor(low, high, in, out []byte)
|
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/*
|
||||
func galMulSSE3(low, high, in, out []byte) {
|
||||
func galMulSSSE3(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
|
@ -22,7 +22,7 @@ func galMulSSE3(low, high, in, out []byte) {
|
|||
}
|
||||
}
|
||||
|
||||
func galMulSSE3Xor(low, high, in, out []byte) {
|
||||
func galMulSSSE3Xor(low, high, in, out []byte) {
|
||||
for n, input := range in {
|
||||
l := input & 0xf
|
||||
h := input >> 4
|
||||
|
@ -33,8 +33,8 @@ func galMulSSE3Xor(low, high, in, out []byte) {
|
|||
|
||||
func galMulSlice(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSE3() {
|
||||
galMulSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
if cpuid.CPU.SSSE3() {
|
||||
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
remain := len(in) - done
|
||||
|
@ -48,8 +48,8 @@ func galMulSlice(c byte, in, out []byte) {
|
|||
|
||||
func galMulSliceXor(c byte, in, out []byte) {
|
||||
var done int
|
||||
if cpuid.CPU.SSE3() {
|
||||
galMulSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
if cpuid.CPU.SSSE3() {
|
||||
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
}
|
||||
remain := len(in) - done
|
||||
|
|
|
@ -5,8 +5,8 @@
|
|||
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
|
||||
// and http://jerasure.org/jerasure/gf-complete/tree/master
|
||||
|
||||
// func galMulSSE3Xor(low, high, in, out []byte)
|
||||
TEXT ·galMulSSE3Xor(SB), 7, $0
|
||||
// func galMulSSSE3Xor(low, high, in, out []byte)
|
||||
TEXT ·galMulSSSE3Xor(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVOU (SI), X6 // X6 low
|
||||
|
@ -42,8 +42,8 @@ loopback_xor:
|
|||
done_xor:
|
||||
RET
|
||||
|
||||
// func galMulSSE3(low, high, in, out []byte)
|
||||
TEXT ·galMulSSE3(SB), 7, $0
|
||||
// func galMulSSSE3(low, high, in, out []byte)
|
||||
TEXT ·galMulSSSE3(SB), 7, $0
|
||||
MOVQ low+0(FP),SI // SI: &low
|
||||
MOVQ high+24(FP),DX // DX: &high
|
||||
MOVOU (SI), X6 // X6 low
|
||||
|
|
Loading…
Reference in New Issue