PSHUFB is S(upplemental)-SSE3, not plain SSE3.

master
klauspost 2015-06-24 16:57:38 +02:00
parent d31049df42
commit dc9cd67c8c
3 changed files with 18 additions and 18 deletions

View File

@ -156,12 +156,12 @@ Performance depends mainly on the number of parity shards. In rough terms, doubl
Here are the throughput numbers with some different selections of data and parity shards. For reference, each shard is 1MB of random data, and 2 CPU cores are used for encoding.
| Data | Parity | Parity | MB/s | SSE3 MB/s | SSE3 Speed | Rel. Speed |
|------|--------|--------|--------|------------|------------|------------|
| 5 | 2 | 40% | 576,11 | 2599,2 | 451% | 100,00% |
| 10 | 2 | 20% | 587,73 | 3100,28 | 528% | 102,02% |
| 10 | 4 | 40% | 298,38 | 2470,97 | 828% | 51,79% |
| 50 | 20 | 40% | 59,81 | 713,28 | 1193% | 10,38% |
| Data | Parity | Parity | MB/s | SSSE3 MB/s | SSSE3 Speed | Rel. Speed |
|------|--------|--------|--------|-------------|-------------|------------|
| 5 | 2 | 40% | 576,11 | 2599,2 | 451% | 100,00% |
| 10 | 2 | 20% | 587,73 | 3100,28 | 528% | 102,02% |
| 10 | 4 | 40% | 298,38 | 2470,97 | 828% | 51,79% |
| 50 | 20 | 40% | 59,81 | 713,28 | 1193% | 10,38% |
If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.

View File

@ -9,12 +9,12 @@ import (
"github.com/klauspost/cpuid"
)
func galMulSSE3(low, high, in, out []byte)
func galMulSSE3Xor(low, high, in, out []byte)
func galMulSSSE3(low, high, in, out []byte)
func galMulSSSE3Xor(low, high, in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
/*
func galMulSSE3(low, high, in, out []byte) {
func galMulSSSE3(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
@ -22,7 +22,7 @@ func galMulSSE3(low, high, in, out []byte) {
}
}
func galMulSSE3Xor(low, high, in, out []byte) {
func galMulSSSE3Xor(low, high, in, out []byte) {
for n, input := range in {
l := input & 0xf
h := input >> 4
@ -33,8 +33,8 @@ func galMulSSE3Xor(low, high, in, out []byte) {
func galMulSlice(c byte, in, out []byte) {
var done int
if cpuid.CPU.SSE3() {
galMulSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
if cpuid.CPU.SSSE3() {
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}
remain := len(in) - done
@ -48,8 +48,8 @@ func galMulSlice(c byte, in, out []byte) {
func galMulSliceXor(c byte, in, out []byte) {
var done int
if cpuid.CPU.SSE3() {
galMulSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
if cpuid.CPU.SSSE3() {
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done = (len(in) >> 4) << 4
}
remain := len(in) - done

View File

@ -5,8 +5,8 @@
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master
// func galMulSSE3Xor(low, high, in, out []byte)
TEXT ·galMulSSE3Xor(SB), 7, $0
// func galMulSSSE3Xor(low, high, in, out []byte)
TEXT ·galMulSSSE3Xor(SB), 7, $0
MOVQ low+0(FP),SI // SI: &low
MOVQ high+24(FP),DX // DX: &high
MOVOU (SI), X6 // X6 low
@ -42,8 +42,8 @@ loopback_xor:
done_xor:
RET
// func galMulSSE3(low, high, in, out []byte)
TEXT ·galMulSSE3(SB), 7, $0
// func galMulSSSE3(low, high, in, out []byte)
TEXT ·galMulSSSE3(SB), 7, $0
MOVQ low+0(FP),SI // SI: &low
MOVQ high+24(FP),DX // DX: &high
MOVOU (SI), X6 // X6 low