Split blocks into size divisible by 16

Older systems (typically without AVX2) are more sensitive to misaligned load+stores. Add parameter to automatically set the number of goroutines. name old time/op new time/op delta Encode10x2x10000-8 18.4µs ± 1% 16.1µs ± 1% -12.43% (p=0.000 n=9+9) Encode100x20x10000-8 692µs ± 1% 608µs ± 1% -12.10% (p=0.000 n=10+10) Encode17x3x1M-8 1.78ms ± 5% 1.49ms ± 1% -16.63% (p=0.000 n=10+10) Encode10x4x16M-8 21.5ms ± 5% 19.6ms ± 4% -8.74% (p=0.000 n=10+9) Encode5x2x1M-8 343µs ± 2% 267µs ± 2% -22.22% (p=0.000 n=9+10) Encode10x2x1M-8 858µs ± 5% 701µs ± 5% -18.34% (p=0.000 n=10+10) Encode10x4x1M-8 1.34ms ± 1% 1.16ms ± 1% -13.19% (p=0.000 n=9+9) Encode50x20x1M-8 30.3ms ± 4% 25.0ms ± 2% -17.51% (p=0.000 n=10+8) Encode17x3x16M-8 26.9ms ± 1% 24.5ms ± 4% -9.13% (p=0.000 n=8+10) name old speed new speed delta Encode10x2x10000-8 5.45GB/s ± 1% 6.22GB/s ± 1% +14.20% (p=0.000 n=9+9) Encode100x20x10000-8 1.44GB/s ± 1% 1.64GB/s ± 1% +13.77% (p=0.000 n=10+10) Encode17x3x1M-8 10.0GB/s ± 5% 12.0GB/s ± 1% +19.88% (p=0.000 n=10+10) Encode10x4x16M-8 7.81GB/s ± 5% 8.56GB/s ± 5% +9.58% (p=0.000 n=10+9) Encode5x2x1M-8 15.3GB/s ± 2% 19.6GB/s ± 2% +28.57% (p=0.000 n=9+10) Encode10x2x1M-8 12.2GB/s ± 5% 15.0GB/s ± 5% +22.45% (p=0.000 n=10+10) Encode10x4x1M-8 7.84GB/s ± 1% 9.03GB/s ± 1% +15.19% (p=0.000 n=9+9) Encode50x20x1M-8 1.73GB/s ± 4% 2.09GB/s ± 4% +20.59% (p=0.000 n=10+9) Encode17x3x16M-8 10.6GB/s ± 1% 11.7GB/s ± 4% +10.12% (p=0.000 n=8+10)
2017-11-18 17:37:40 +01:00 · 2017-11-18 17:37:40 +01:00 · f5e73dcfe2
parent e52c150f96
commit f5e73dcfe2
5 changed files with 112 additions and 8 deletions
--- a/README.md
+++ b/README.md
@ -24,6 +24,10 @@ go get -u github.com/klauspost/reedsolomon

 # Changes

+## November 18, 2017
+
+Added [WithAutoGoroutines](https://godoc.org/github.com/klauspost/reedsolomon#WithAutoGoroutines) which will attempt to calculate the optimal number og goroutines to use based on your expected shard size and detected CPU.
+
 ## October 1, 2017

 * [Cauchy Matrix](https://godoc.org/github.com/klauspost/reedsolomon#WithCauchyMatrix) is now an option. Thanks to [templexxx](https://github.com/templexxx) for the basis of this.
--- a/galois_amd64.s
+++ b/galois_amd64.s
@ -19,8 +19,35 @@ TEXT ·galMulSSSE3Xor(SB), 7, $0
 	MOVQ   out+72(FP), DX    // DX: &out
 	PSHUFB X5, X8            // X8: lomask (unpacked)
 	SHRQ   $4, R9            // len(in) / 16
+	MOVQ   SI, AX
+	MOVQ   DX, BX
+	ANDQ   $15, AX
+	ANDQ   $15, BX
 	CMPQ   R9, $0
 	JEQ    done_xor
+	ORQ    AX, BX
+	CMPQ   BX, $0
+	JNZ    loopback_xor
+
+loopback_xor_aligned:
+	MOVOA  (SI), X0             // in[x]
+	MOVOA  (DX), X4             // out[x]
+	MOVOA  X0, X1               // in[x]
+	MOVOA  X6, X2               // low copy
+	MOVOA  X7, X3               // high copy
+	PSRLQ  $4, X1               // X1: high input
+	PAND   X8, X0               // X0: low input
+	PAND   X8, X1               // X0: high input
+	PSHUFB X0, X2               // X2: mul low part
+	PSHUFB X1, X3               // X3: mul high part
+	PXOR   X2, X3               // X3: Result
+	PXOR   X4, X3               // X3: Result xor existing out
+	MOVOA  X3, (DX)             // Store
+	ADDQ   $16, SI              // in+=16
+	ADDQ   $16, DX              // out+=16
+	SUBQ   $1, R9
+	JNZ    loopback_xor_aligned
+	JMP    done_xor

 loopback_xor:
 	MOVOU  (SI), X0     // in[x]
@ -57,15 +84,40 @@ TEXT ·galMulSSSE3(SB), 7, $0
 	MOVQ   in_len+56(FP), R9 // R9: len(in)
 	MOVQ   out+72(FP), DX    // DX: &out
 	PSHUFB X5, X8            // X8: lomask (unpacked)
+	MOVQ   SI, AX
+	MOVQ   DX, BX
 	SHRQ   $4, R9            // len(in) / 16
+	ANDQ   $15, AX
+	ANDQ   $15, BX
 	CMPQ   R9, $0
 	JEQ    done
+	ORQ    AX, BX
+	CMPQ   BX, $0
+	JNZ    loopback
+
+loopback_aligned:
+	MOVOA  (SI), X0         // in[x]
+	MOVOA  X0, X1           // in[x]
+	MOVOA  X6, X2           // low copy
+	MOVOA  X7, X3           // high copy
+	PSRLQ  $4, X1           // X1: high input
+	PAND   X8, X0           // X0: low input
+	PAND   X8, X1           // X0: high input
+	PSHUFB X0, X2           // X2: mul low part
+	PSHUFB X1, X3           // X3: mul high part
+	PXOR   X2, X3           // X3: Result
+	MOVOA  X3, (DX)         // Store
+	ADDQ   $16, SI          // in+=16
+	ADDQ   $16, DX          // out+=16
+	SUBQ   $1, R9
+	JNZ    loopback_aligned
+	JMP    done

 loopback:
 	MOVOU  (SI), X0 // in[x]
 	MOVOU  X0, X1   // in[x]
-	MOVOU  X6, X2   // low copy
-	MOVOU  X7, X3   // high copy
+	MOVOA  X6, X2   // low copy
+	MOVOA  X7, X3   // high copy
 	PSRLQ  $4, X1   // X1: high input
 	PAND   X8, X0   // X0: low input
 	PAND   X8, X1   // X0: high input
--- a/options.go
+++ b/options.go
@ -15,11 +15,12 @@ type options struct {
 	useAVX2, useSSSE3, useSSE2 bool
 	usePAR1Matrix              bool
 	useCauchy                  bool
+	shardSize                  int
 }

 var defaultOptions = options{
 	maxGoroutines: 384,
-	minSplitSize:  512,
+	minSplitSize:  1024,
 }

 func init() {
@ -46,6 +47,18 @@ func WithMaxGoroutines(n int) Option {
 	}
 }

+// WithAutoGoroutines will adjust the number of goroutines for optimal speed with a
+// specific shard size.
+// Send in the shard size you expect to send. Other shard sizes will work, but may not
+// run at the optimal speed.
+// Overwrites WithMaxGoroutines.
+// If shardSize <= 0, it is ignored.
+func WithAutoGoroutines(shardSize int) Option {
+	return func(o *options) {
+		o.shardSize = shardSize
+	}
+}
+
 // WithMinSplitSize is the minimum encoding size in bytes per goroutine.
 // See WithMaxGoroutines on how jobs are split.
 // If n <= 0, it is ignored.
--- a/reedsolomon.go
+++ b/reedsolomon.go
@ -15,7 +15,10 @@ import (
 	"bytes"
 	"errors"
 	"io"
+	"runtime"
 	"sync"
+
+	"github.com/klauspost/cpuid"
 )

 // Encoder is an interface to encode Reed-Salomon parity sets for your data.
@ -239,6 +242,33 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) {
 	if err != nil {
 		return nil, err
 	}
+	if r.o.shardSize > 0 {
+		cacheSize := cpuid.CPU.Cache.L2
+		if cacheSize <= 0 {
+			// Set to 128K if undetectable.
+			cacheSize = 128 << 10
+		}
+		p := runtime.NumCPU()
+
+		// 1 input + parity must fit in cache, and we add one more to be safer.
+		shards := 1 + parityShards
+		g := (r.o.shardSize * shards) / (cacheSize - (cacheSize >> 4))
+
+		if cpuid.CPU.ThreadsPerCore > 1 {
+			// If multiple threads per core, make sure they don't contend for cache.
+			g *= cpuid.CPU.ThreadsPerCore
+		}
+		g *= 2
+		if g < p {
+			g = p
+		}
+
+		// Have g be multiple of p
+		g += p - 1
+		g -= g % p
+
+		r.o.maxGoroutines = g
+	}

 	// Inverted matrices are cached in a tree keyed by the indices
 	// of the invalid rows of the data to reconstruct.
@ -431,6 +461,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
+	// Make sizes divisible by 16
+	do = (do + 15) & (^15)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
@ -490,6 +522,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
+	// Make sizes divisible by 16
+	do = (do + 15) & (^15)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@ -113,6 +113,7 @@ func testOpts() [][]Option {
 		{WithMaxGoroutines(5000), WithMinSplitSize(50), withSSE3(false), withAVX2(false)},
 		{WithMaxGoroutines(5000), WithMinSplitSize(500000), withSSE3(false), withAVX2(false)},
 		{WithMaxGoroutines(1), WithMinSplitSize(500000), withSSE3(false), withAVX2(false)},
+		{WithAutoGoroutines(50000), WithMinSplitSize(500)},
 	}
 	for _, o := range opts[:] {
 		if defaultOptions.useSSSE3 {
@ -655,7 +656,7 @@ func fillRandom(p []byte) {
 }

 func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) {
-	r, err := New(dataShards, parityShards)
+	r, err := New(dataShards, parityShards, WithAutoGoroutines(shardSize))
 	if err != nil {
 		b.Fatal(err)
 	}
@ -722,7 +723,7 @@ func BenchmarkEncode17x3x16M(b *testing.B) {
 }

 func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) {
-	r, err := New(dataShards, parityShards)
+	r, err := New(dataShards, parityShards, WithAutoGoroutines(shardSize))
 	if err != nil {
 		b.Fatal(err)
 	}
@ -793,7 +794,7 @@ func corruptRandom(shards [][]byte, dataShards, parityShards int) {
 }

 func benchmarkReconstruct(b *testing.B, dataShards, parityShards, shardSize int) {
-	r, err := New(dataShards, parityShards)
+	r, err := New(dataShards, parityShards, WithAutoGoroutines(shardSize))
 	if err != nil {
 		b.Fatal(err)
 	}
@ -873,7 +874,7 @@ func corruptRandomData(shards [][]byte, dataShards, parityShards int) {
 }

 func benchmarkReconstructData(b *testing.B, dataShards, parityShards, shardSize int) {
-	r, err := New(dataShards, parityShards)
+	r, err := New(dataShards, parityShards, WithAutoGoroutines(shardSize))
 	if err != nil {
 		b.Fatal(err)
 	}
@ -939,7 +940,7 @@ func BenchmarkReconstructData10x4x16M(b *testing.B) {
 }

 func benchmarkReconstructP(b *testing.B, dataShards, parityShards, shardSize int) {
-	r, err := New(dataShards, parityShards)
+	r, err := New(dataShards, parityShards, WithAutoGoroutines(shardSize))
 	if err != nil {
 		b.Fatal(err)
 	}