avx2: Add 64 bytes per loop processing (#128)

* avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 -3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 
-5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% 
BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x 
BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 
266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 16:36:01 +02:00 · 2020-05-05 16:36:01 +02:00 · 0e9e10435f
parent abb309aca7
commit 0e9e10435f
3 changed files with 190 additions and 21 deletions
--- a/galois_amd64.go
+++ b/galois_amd64.go
@ -21,6 +21,15 @@ func galMulAVX2(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice(in, out []byte)

+//go:noescape
+func galMulAVX2Xor_64(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2_64(low, high, in, out []byte)
+
+//go:noescape
+func sSE2XorSlice_64(in, out []byte)
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@ -40,17 +49,29 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
 }
 */

+// bigSwitchover is the minimum input length, in bytes, at which the 64-bytes-per-loop assembly routines are used.
+const bigSwitchover = 128
+
 func galMulSlice(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover {
+			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) > 32 {
+			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@ -59,16 +80,25 @@ func galMulSlice(c byte, in, out []byte, o *options) {
 }

 func galMulSliceXor(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover {
+			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 32 {
+			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@ -78,13 +108,20 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {

 // slice galois add
 func sliceXor(in, out []byte, sse2 bool) {
-	var done int
 	if sse2 {
-		sSE2XorSlice(in, out)
-		done = (len(in) >> 4) << 4
+		if len(in) >= bigSwitchover {
+			sSE2XorSlice_64(in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 16 {
+			sSE2XorSlice(in, out)
+			done := (len(in) >> 4) << 4
+			in = in[done:]
+			out = out[done:]
+		}
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	for i := range in {
 		out[i] ^= in[i]
--- a/galois_amd64.s
+++ b/galois_amd64.s
@ -234,3 +234,135 @@ loopback_xor_sse2:

 done_xor_sse2:
 	RET
+
+// func galMulAVX2Xor_64(low, high, in, out []byte)
+//
+// For each full 64-byte block of in, XORs the nibble-table product into out:
+//   out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
+// Two 32-byte YMM halves are handled per loop iteration.
+//
+// Only len(in) is consulted: the loop runs len(in)/64 times, so the trailing
+// len(in)%64 bytes are left untouched and len(out) is never checked.
+// NOTE(review): out is assumed to be at least as long as the processed part of
+// in, and low/high at least 16 bytes each - confirm against callers.
+TEXT ·galMulAVX2Xor_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5            // X5: 0x0f in its lowest byte
+	MOVOU (SI), X6          // X6: low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table copied into both 128-bit lanes
+	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table copied into both 128-bit lanes
+	VPBROADCASTB X5, Y8         // Y8: lomask, 0x0f in every byte
+
+	SHRQ  $6, R9           // R9: len(in) / 64 = number of 64-byte blocks
+	MOVQ  out+72(FP), DX   // DX: &out
+	MOVQ  in+48(FP), SI    // SI: &in
+	TESTQ R9, R9
+	JZ    done_xor_avx2_64 // fewer than 64 bytes: nothing to do
+
+loopback_xor_avx2_64:
+	VMOVDQU (SI), Y0      // Y0: in[0:32]
+	VMOVDQU 32(SI), Y10   // Y10: in[32:64]
+	VMOVDQU (DX), Y4      // Y4: out[0:32]
+	VMOVDQU 32(DX), Y14   // Y14: out[32:64]
+	VPSRLQ  $4, Y0, Y1    // Y1: input >> 4 (64-bit shift; stray bits masked below)
+	VPSRLQ  $4, Y10, Y11  // Y11: input 2 >> 4 (64-bit shift; stray bits masked below)
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y3    // Y3: mul result (low part ^ high part)
+	VPXOR   Y13, Y12, Y13 // Y13: mul result 2
+	VPXOR   Y4, Y3, Y4    // Y4: Result (out ^ mul result)
+	VPXOR   Y14, Y13, Y14 // Y14: Result 2 (out ^ mul result 2)
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI              // in+=64
+	ADDQ $64, DX              // out+=64
+	SUBQ $1, R9               // one block done
+	JNZ  loopback_xor_avx2_64
+
+done_xor_avx2_64:
+	VZEROUPPER                // zero the upper YMM halves before returning
+	RET
+
+// func galMulAVX2_64(low, high, in, out []byte)
+//
+// For each full 64-byte block of in, overwrites out with the nibble-table
+// product:
+//   out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
+// Two 32-byte YMM halves are handled per loop iteration; the previous contents
+// of out are never read.
+//
+// Only len(in) is consulted: the loop runs len(in)/64 times, so the trailing
+// len(in)%64 bytes are left untouched and len(out) is never checked.
+// NOTE(review): out is assumed to be at least as long as the processed part of
+// in, and low/high at least 16 bytes each - confirm against callers.
+TEXT ·galMulAVX2_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5            // X5: 0x0f in its lowest byte
+	MOVOU (SI), X6          // X6: low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // Y6: low table copied into both 128-bit lanes
+	VINSERTI128  $1, X7, Y7, Y7 // Y7: high table copied into both 128-bit lanes
+	VPBROADCASTB X5, Y8         // Y8: lomask, 0x0f in every byte
+
+	SHRQ  $6, R9         // R9: len(in) / 64 = number of 64-byte blocks
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_avx2_64   // fewer than 64 bytes: nothing to do
+
+loopback_avx2_64:
+	VMOVDQU (SI), Y0      // Y0: in[0:32]
+	VMOVDQU 32(SI), Y10   // Y10: in[32:64]
+	VPSRLQ  $4, Y0, Y1    // Y1: input >> 4 (64-bit shift; stray bits masked below)
+	VPSRLQ  $4, Y10, Y11  // Y11: input 2 >> 4 (64-bit shift; stray bits masked below)
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y4    // Y4: Result
+	VPXOR   Y13, Y12, Y14 // Y14: Result 2
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI          // in+=64
+	ADDQ $64, DX          // out+=64
+	SUBQ $1, R9           // one block done
+	JNZ  loopback_avx2_64
+
+done_avx2_64:
+	VZEROUPPER            // zero the upper YMM halves before returning
+	RET
+
+// func sSE2XorSlice_64(in, out []byte)
+//
+// For each full 64-byte block of in, computes out[i] ^= in[i] using four
+// 16-byte XMM registers per loop iteration.
+//
+// Only len(in) is consulted: the loop runs len(in)/64 times, so the trailing
+// len(in)%64 bytes are left untouched and len(out) is never checked.
+// NOTE(review): out is assumed to be at least as long as the processed part of
+// in - confirm against callers.
+TEXT ·sSE2XorSlice_64(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $6, R9           // R9: len(in) / 64 = number of 64-byte blocks
+	CMPQ R9, $0
+	JEQ  done_xor_sse2_64 // fewer than 64 bytes: nothing to do
+
+loopback_xor_sse2_64:
+	MOVOU (SI), X0             // X0: in[0:16]
+	MOVOU 16(SI), X2           // X2: in[16:32]
+	MOVOU 32(SI), X4           // X4: in[32:48]
+	MOVOU 48(SI), X6           // X6: in[48:64]
+	MOVOU (DX), X1             // X1: out[0:16]
+	MOVOU 16(DX), X3           // X3: out[16:32]
+	MOVOU 32(DX), X5           // X5: out[32:48]
+	MOVOU 48(DX), X7           // X7: out[48:64]
+	PXOR  X0, X1               // X1: out[0:16] ^ in[0:16]
+	PXOR  X2, X3               // X3: out[16:32] ^ in[16:32]
+	PXOR  X4, X5               // X5: out[32:48] ^ in[32:48]
+	PXOR  X6, X7               // X7: out[48:64] ^ in[48:64]
+	MOVOU X1, (DX)
+	MOVOU X3, 16(DX)
+	MOVOU X5, 32(DX)
+	MOVOU X7, 48(DX)
+	ADDQ  $64, SI              // in+=64
+	ADDQ  $64, DX              // out+=64
+	SUBQ  $1, R9               // one block done
+	JNZ   loopback_xor_sse2_64
+
+done_xor_sse2_64:
+	RET
--- a/reedsolomon.go
+++ b/reedsolomon.go
@ -503,8 +503,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
@ -576,8 +576,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {