PSHUFB is S(upplemental)-SSE3, not plain SSE3.

2015-06-24 16:57:38 +02:00 · 2015-06-24 16:57:38 +02:00 · dc9cd67c8c
parent d31049df42
commit dc9cd67c8c
3 changed files with 18 additions and 18 deletions
--- a/README.md
+++ b/README.md
@ -156,12 +156,12 @@ Performance depends mainly on the number of parity shards. In rough terms, doubl

 Here are the throughput numbers with some different selections of data and parity shards. For reference each shard is 1MB random data, and 2 CPU cores are used for encoding.

-| Data | Parity | Parity | MB/s   | SSE3 MB/s  | SSE3 Speed | Rel. Speed |
-|------|--------|--------|--------|------------|------------|------------|
-| 5    | 2      | 40%    | 576,11 | 2599,2     | 451%       | 100,00%    |
-| 10   | 2      | 20%    | 587,73 | 3100,28    | 528%       | 102,02%    |
-| 10   | 4      | 40%    | 298,38 | 2470,97    | 828%       | 51,79%     |
-| 50   | 20     | 40%    | 59,81  | 713,28     | 1193%      | 10,38%     |
+| Data | Parity | Parity | MB/s   | SSSE3 MB/s  | SSSE3 Speed | Rel. Speed |
+|------|--------|--------|--------|-------------|-------------|------------|
+| 5    | 2      | 40%    | 576,11 | 2599,2      | 451%        | 100,00%    |
+| 10   | 2      | 20%    | 587,73 | 3100,28     | 528%        | 102,02%    |
+| 10   | 4      | 40%    | 298,38 | 2470,97     | 828%        | 51,79%     |
+| 50   | 20     | 40%    | 59,81  | 713,28      | 1193%       | 10,38%     |

 If `runtime.GOMAXPROCS()` is set to a value higher than 1, the encoder will use multiple goroutines to perform the calculations in `Verify`, `Encode` and `Reconstruct`.

--- a/galois_amd64.go
+++ b/galois_amd64.go
@ -9,12 +9,12 @@ import (
 	"github.com/klauspost/cpuid"
 )

-func galMulSSE3(low, high, in, out []byte)
-func galMulSSE3Xor(low, high, in, out []byte)
+func galMulSSSE3(low, high, in, out []byte)
+func galMulSSSE3Xor(low, high, in, out []byte)

 // This is what the assembler routines do in blocks of 16 bytes:
 /*
-func galMulSSE3(low, high, in, out []byte) {
+func galMulSSSE3(low, high, in, out []byte) {
 	for n, input := range in {
 		l := input & 0xf
 		h := input >> 4
@ -22,7 +22,7 @@ func galMulSSE3(low, high, in, out []byte) {
 	}
 }

-func galMulSSE3Xor(low, high, in, out []byte) {
+func galMulSSSE3Xor(low, high, in, out []byte) {
 	for n, input := range in {
 		l := input & 0xf
 		h := input >> 4
@ -33,8 +33,8 @@ func galMulSSE3Xor(low, high, in, out []byte) {

 func galMulSlice(c byte, in, out []byte) {
 	var done int
-	if cpuid.CPU.SSE3() {
-		galMulSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+	if cpuid.CPU.SSSE3() {
+		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		done = (len(in) >> 4) << 4
 	}
 	remain := len(in) - done
@ -48,8 +48,8 @@ func galMulSlice(c byte, in, out []byte) {

 func galMulSliceXor(c byte, in, out []byte) {
 	var done int
-	if cpuid.CPU.SSE3() {
-		galMulSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+	if cpuid.CPU.SSSE3() {
+		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
 		done = (len(in) >> 4) << 4
 	}
 	remain := len(in) - done
--- a/galois_amd64.s
+++ b/galois_amd64.s
@ -5,8 +5,8 @@
 // Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
 // and http://jerasure.org/jerasure/gf-complete/tree/master

-// func galMulSSE3Xor(low, high, in, out []byte)
-TEXT ·galMulSSE3Xor(SB), 7, $0
+// func galMulSSSE3Xor(low, high, in, out []byte)
+TEXT ·galMulSSSE3Xor(SB), 7, $0
    MOVQ    low+0(FP),SI        // SI: &low
    MOVQ    high+24(FP),DX      // DX: &high
    MOVOU  (SI), X6             // X6 low
@ -42,8 +42,8 @@ loopback_xor:
 done_xor:
    RET

-// func galMulSSE3(low, high, in, out []byte)
-TEXT ·galMulSSE3(SB), 7, $0
+// func galMulSSSE3(low, high, in, out []byte)
+TEXT ·galMulSSSE3(SB), 7, $0
    MOVQ    low+0(FP),SI        // SI: &low
    MOVQ    high+24(FP),DX      // DX: &high
    MOVOU   (SI), X6            // X6 low