Faster AVX2 encoding (#153)

* Remove 50% of bounds checks when copying.
* Use RIP-only addressing, freeing one register.

```
benchmark                                 old MB/s      new MB/s      speedup
BenchmarkGalois128K-32                    57663.49      58005.87      1.01x
BenchmarkGalois1M-32                      49479.31      49848.29      1.01x
BenchmarkGaloisXor128K-32                 46310.69      46501.88      1.00x
BenchmarkGaloisXor1M-32                   43804.86      43984.39      1.00x
BenchmarkEncode10x2x10000-32              25926.93      27457.75      1.06x
BenchmarkEncode100x20x10000-32            2635.82       2818.95       1.07x
BenchmarkEncode17x3x1M-32                 63215.11      61576.76      0.97x
BenchmarkEncode10x4x16M-32                19551.54      19505.07      1.00x
BenchmarkEncode5x2x1M-32                  79612.06      81985.14      1.03x
BenchmarkEncode10x2x1M-32                 121478.29     127739.41     1.05x
BenchmarkEncode10x4x1M-32                 70757.61      74423.67      1.05x
BenchmarkEncode50x20x1M-32                19811.96      20103.32      1.01x
BenchmarkEncode17x3x16M-32                27202.10      27825.34      1.02x
BenchmarkEncode_8x4x8M-32                 19029.04      19701.31      1.04x
BenchmarkEncode_12x4x12M-32               22449.87      22480.51      1.00x
BenchmarkEncode_16x4x16M-32               24536.74      24672.24      1.01x
BenchmarkEncode_16x4x32M-32               24381.34      24981.99      1.02x
BenchmarkEncode_16x4x64M-32               24717.69      25086.94      1.01x
BenchmarkEncode_8x5x8M-32                 16763.51      17154.04      1.02x
BenchmarkEncode_8x6x8M-32                 15067.22      15205.87      1.01x
BenchmarkEncode_8x7x8M-32                 13156.38      13589.40      1.03x
BenchmarkEncode_8x9x8M-32                 11363.74      11523.70      1.01x
BenchmarkEncode_8x10x8M-32                10359.37      10474.91      1.01x
BenchmarkEncode_8x11x8M-32                9627.07       9463.24       0.98x
BenchmarkEncode_8x8x05M-32                30104.80      32634.89      1.08x
BenchmarkEncode_8x8x1M-32                 36497.28      36425.88      1.00x
BenchmarkEncode_8x8x8M-32                 12186.19      11602.41      0.95x
BenchmarkEncode_8x8x32M-32                11670.72      11413.71      0.98x
BenchmarkEncode_24x8x24M-32               21709.83      21652.50      1.00x
BenchmarkEncode_24x8x48M-32               22494.40      22280.59      0.99x
BenchmarkVerify10x2x10000-32              10567.56      10483.91      0.99x
BenchmarkVerify50x5x50000-32              28102.84      27923.63      0.99x
BenchmarkVerify10x2x1M-32                 30298.33      30106.18      0.99x
BenchmarkVerify5x2x1M-32                  16115.91      15847.03      0.98x
BenchmarkVerify10x4x1M-32                 15382.13      14852.68      0.97x
BenchmarkVerify50x20x1M-32                8476.02       8466.24       1.00x
BenchmarkVerify10x4x16M-32                15101.03      15434.71      1.02x
BenchmarkReconstruct10x2x10000-32         26228.18      26960.19      1.03x
BenchmarkReconstruct50x5x50000-32         31091.42      30975.82      1.00x
BenchmarkReconstruct10x2x1M-32            58548.87      60281.92      1.03x
BenchmarkReconstruct5x2x1M-32             39499.23      41791.80      1.06x
BenchmarkReconstruct10x4x1M-32            41448.60      43053.15      1.04x
BenchmarkReconstruct50x20x1M-32           17185.99      17354.67      1.01x
BenchmarkReconstruct10x4x16M-32           18798.60      18847.43      1.00x
BenchmarkReconstructData10x2x10000-32     27208.48      27538.38      1.01x
BenchmarkReconstructData50x5x50000-32     32135.65      32078.91      1.00x
BenchmarkReconstructData10x2x1M-32        63180.19      67332.17      1.07x
BenchmarkReconstructData5x2x1M-32         47532.85      49932.17      1.05x
BenchmarkReconstructData10x4x1M-32        50059.14      52323.15      1.05x
BenchmarkReconstructData50x20x1M-32       26679.75      26714.11      1.00x
BenchmarkReconstructData10x4x16M-32       24854.99      24527.23      0.99x
BenchmarkReconstructP10x2x10000-32        115089.87     113229.75     0.98x
BenchmarkReconstructP10x5x20000-32        129838.75     132871.10     1.02x
BenchmarkParallel_8x8x64K-32              69951.43      69980.44      1.00x
BenchmarkParallel_8x8x05M-32              11752.94      11724.35      1.00x
BenchmarkParallel_20x10x05M-32            18553.93      18613.33      1.00x
BenchmarkParallel_8x8x1M-32               11639.19      11746.86      1.01x
BenchmarkParallel_8x8x8M-32               11799.36      11685.63      0.99x
BenchmarkParallel_8x8x32M-32              11510.94      11791.72      1.02x
BenchmarkParallel_8x3x1M-32               20268.92      20678.21      1.02x
BenchmarkParallel_8x4x1M-32               17616.05      17856.17      1.01x
BenchmarkParallel_8x5x1M-32               15590.87      15872.42      1.02x
BenchmarkStreamEncode10x2x10000-32        14917.08      15408.39      1.03x
BenchmarkStreamEncode100x20x10000-32      2014.81       2077.31       1.03x
BenchmarkStreamEncode17x3x1M-32           11839.37      12434.80      1.05x
BenchmarkStreamEncode10x4x16M-32          9151.14       9206.98       1.01x
BenchmarkStreamEncode5x2x1M-32            13598.55      13663.56      1.00x
BenchmarkStreamEncode10x2x1M-32           13192.91      13453.41      1.02x
BenchmarkStreamEncode10x4x1M-32           12109.90      12050.68      1.00x
BenchmarkStreamEncode50x20x1M-32          8640.73       8370.10       0.97x
BenchmarkStreamEncode17x3x16M-32          10473.17      10527.04      1.01x
BenchmarkStreamVerify10x2x10000-32        7032.23       7128.82       1.01x
BenchmarkStreamVerify50x5x50000-32        13023.46      13109.31      1.01x
BenchmarkStreamVerify10x2x1M-32           11941.63      11949.91      1.00x
BenchmarkStreamVerify5x2x1M-32            8029.93       8263.39       1.03x
BenchmarkStreamVerify10x4x1M-32           8137.82       8271.11       1.02x
BenchmarkStreamVerify50x20x1M-32          7378.87       7708.81       1.04x
BenchmarkStreamVerify10x4x16M-32          8973.18       8955.29       1.00x
```
master
Klaus Post 2020-11-10 05:39:23 -08:00 committed by GitHub
parent 04d4482b55
commit 653e76aa26
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 4226 additions and 2450 deletions

View File

@ -917,12 +917,14 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte
for i, row := range matrixRows[:outputs] {
for j, idx := range row[:inputs] {
dstIdx := (j*outputs + i) * 64
dstPart := dst[dstIdx:]
dstPart = dstPart[:64]
lo := mulTableLow[idx][:]
hi := mulTableHigh[idx][:]
copy(dst[dstIdx:], lo)
copy(dst[dstIdx+16:], lo)
copy(dst[dstIdx+32:], hi)
copy(dst[dstIdx+48:], hi)
copy(dstPart[:16], lo)
copy(dstPart[16:32], lo)
copy(dstPart[32:48], hi)
copy(dstPart[48:64], hi)
}
}
return dst

File diff suppressed because it is too large Load Diff

95
gen.go
View File

@ -26,6 +26,11 @@ var switchDefsX [inputMax][outputMax]string
const perLoopBits = 5
const perLoop = 1 << perLoopBits
// Prefetch offsets, set to 0 to disable.
// Disabled since they appear to be consistently slower.
const prefetchSrc = 0
const prefetchDst = 0
func main() {
Constraint(buildtags.Not("appengine").ToConstraint())
Constraint(buildtags.Not("noasm").ToConstraint())
@ -98,6 +103,7 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
var loadNone bool
// Use registers for destination registers.
var regDst = true
var reloadLength = false
// lo, hi, 1 in, 1 out, 2 tmp, 1 mask
est := total*2 + outputs + 5
@ -109,9 +115,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
if est > 16 {
loadNone = true
// We run out of GP registers first, now.
if inputs+outputs > 12 {
if inputs+outputs > 13 {
regDst = false
}
// Save one register by reloading length.
if inputs+outputs > 12 && regDst {
reloadLength = true
}
}
TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)"))
@ -127,6 +137,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
// loadNone == false
Comment("Loading all tables to registers")
}
if regDst {
Comment("Destination kept in GP registers")
} else {
Comment("Destination kept on stack")
}
Doc(doc...)
Pragma("noescape")
@ -139,21 +154,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
TESTQ(length, length)
JZ(LabelRef(name + "_end"))
dst := make([]reg.VecVirtual, outputs)
dstPtr := make([]reg.GPVirtual, outputs)
outBase := Param("out").Base().MustAddr()
outSlicePtr := GP64()
MOVQ(outBase, outSlicePtr)
for i := range dst {
dst[i] = YMM()
if !regDst {
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
dstPtr[i] = ptr
}
inLo := make([]reg.VecVirtual, total)
inHi := make([]reg.VecVirtual, total)
@ -177,6 +177,36 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
MOVQ(Mem{Base: inSlicePtr, Disp: i * 24}, ptr)
inPtrs[i] = ptr
}
// Destination
dst := make([]reg.VecVirtual, outputs)
dstPtr := make([]reg.GPVirtual, outputs)
outBase := Param("out").Base().MustAddr()
outSlicePtr := GP64()
MOVQ(outBase, outSlicePtr)
for i := range dst {
dst[i] = YMM()
if !regDst {
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
dstPtr[i] = ptr
}
offset := GP64()
MOVQ(Param("start").MustAddr(), offset)
if regDst {
Comment("Add start offset to output")
for _, ptr := range dstPtr {
ADDQ(offset, ptr)
}
}
Comment("Add start offset to input")
for _, ptr := range inPtrs {
ADDQ(offset, ptr)
}
// Offset no longer needed unless not regdst
tmpMask := GP64()
MOVQ(U32(15), tmpMask)
@ -184,8 +214,10 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
MOVQ(tmpMask, lowMask.AsX())
VPBROADCASTB(lowMask.AsX(), lowMask)
offset := GP64()
MOVQ(Param("start").MustAddr(), offset)
if reloadLength {
length = Load(Param("n"), GP64())
SHRQ(U8(perLoopBits), length)
}
Label(name + "_loop")
if xor {
Commentf("Load %d outputs", outputs)
@ -195,12 +227,18 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
for i := range dst {
if xor {
if regDst {
VMOVDQU(Mem{Base: dstPtr[i], Index: offset, Scale: 1}, dst[i])
VMOVDQU(Mem{Base: dstPtr[i]}, dst[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
}
continue
}
ptr := GP64()
MOVQ(outBase, ptr)
VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i])
if prefetchDst > 0 {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
} else {
VPXOR(dst[i], dst[i], dst[i])
}
@ -210,7 +248,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
inLow, inHigh := YMM(), YMM()
for i := range inPtrs {
Commentf("Load and process 32 bytes from input %d to %d outputs", i, outputs)
VMOVDQU(Mem{Base: inPtrs[i], Index: offset, Scale: 1}, inLow)
VMOVDQU(Mem{Base: inPtrs[i]}, inLow)
if prefetchSrc > 0 {
PREFETCHT0(Mem{Base: inPtrs[i], Disp: prefetchSrc})
}
ADDQ(U8(perLoop), inPtrs[i])
VPSRLQ(U8(4), inLow, inHigh)
VPAND(lowMask, inLow, inLow)
VPAND(lowMask, inHigh, inHigh)
@ -231,15 +273,24 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) {
Commentf("Store %d outputs", outputs)
for i := range dst {
if regDst {
VMOVDQU(dst[i], Mem{Base: dstPtr[i], Index: offset, Scale: 1})
VMOVDQU(dst[i], Mem{Base: dstPtr[i]})
if prefetchDst > 0 && !xor {
PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst})
}
ADDQ(U8(perLoop), dstPtr[i])
continue
}
ptr := GP64()
MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr)
VMOVDQU(dst[i], Mem{Base: ptr, Index: offset, Scale: 1})
if prefetchDst > 0 && !xor {
PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1})
}
}
Comment("Prepare for next loop")
ADDQ(U8(perLoop), offset)
if !regDst {
ADDQ(U8(perLoop), offset)
}
DECQ(length)
JNZ(LabelRef(name + "_loop"))
VZEROUPPER()

View File

@ -520,7 +520,7 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
if end > len(inputs[0]) {
end = len(inputs[0])
}
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs)+len(outputs) >= 4 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount)
r.mPool.Put(m)
@ -550,18 +550,23 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu
// several goroutines.
func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) {
var wg sync.WaitGroup
do := byteCount / r.o.maxGoroutines
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
gor := r.o.maxGoroutines
var avx2Matrix []byte
if avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
useAvx2 := avx2CodeGen && r.o.useAVX2 && byteCount >= 32 && len(inputs)+len(outputs) >= 4 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs
if useAvx2 {
avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte))
defer r.mPool.Put(avx2Matrix)
}
do := byteCount / gor
if do < r.o.minSplitSize {
do = r.o.minSplitSize
}
// Make sizes divisible by 64
do = (do + 63) & (^63)
start := 0
for start < byteCount {
if start+do > byteCount {
do = byteCount - start
@ -569,7 +574,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp
wg.Add(1)
go func(start, stop int) {
if avx2CodeGen && r.o.useAVX2 && stop-start >= 32 && len(inputs) > 1 && len(outputs) > 1 && len(inputs) <= maxAvx2Inputs && len(outputs) <= maxAvx2Outputs {
if useAvx2 && stop-start >= 32 {
start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop)
}

View File

@ -646,7 +646,8 @@ func testVerify(t *testing.T, o ...Option) {
t.Fatal(err)
}
if !ok {
t.Fatal("Verification failed")
t.Error("Verification failed")
return
}
// Put in random data. Verification should fail