From 0e9e10435f3bc870657652624d04e1237557bffe Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 5 May 2020 16:36:01 +0200 Subject: [PATCH] avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% 
BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 -3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 
+0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 
22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 
1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ``` --- galois_amd64.go | 71 +++++++++++++++++++------- galois_amd64.s | 132 ++++++++++++++++++++++++++++++++++++++++++++++++ reedsolomon.go | 8 +-- 3 files changed, 190 insertions(+), 21 deletions(-) diff --git a/galois_amd64.go b/galois_amd64.go index 2b63983..bf7faca 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -21,6 +21,15 @@ func galMulAVX2(low, high, in, out []byte) //go:noescape func sSE2XorSlice(in, out []byte) +//go:noescape +func galMulAVX2Xor_64(low, high, in, out []byte) + +//go:noescape +func galMulAVX2_64(low, high, in, out []byte) + +//go:noescape +func sSE2XorSlice_64(in, out []byte) + // This is what the assembler routines do in blocks of 16 bytes: /* func galMulSSSE3(low, high, in, out []byte) { @@ -40,17 +49,29 @@ func galMulSSSE3Xor(low, high, in, out []byte) { } */ +// bigSwitchover is the size where 64 bytes are processed per loop. 
+const bigSwitchover = 128 + func galMulSlice(c byte, in, out []byte, o *options) { - var done int if o.useAVX2 { - galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 5) << 5 + if len(in) >= bigSwitchover { + galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + if len(in) > 32 { + galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 5) << 5 + in = in[done:] + out = out[done:] + } } else if o.useSSSE3 { galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 4) << 4 + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] } - in = in[done:] - out = out[done:] out = out[:len(in)] mt := mulTable[c][:256] for i := range in { @@ -59,16 +80,25 @@ func galMulSlice(c byte, in, out []byte, o *options) { } func galMulSliceXor(c byte, in, out []byte, o *options) { - var done int if o.useAVX2 { - galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 5) << 5 + if len(in) >= bigSwitchover { + galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = out[done:] + } + if len(in) >= 32 { + galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) + done := (len(in) >> 5) << 5 + in = in[done:] + out = out[done:] + } } else if o.useSSSE3 { galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out) - done = (len(in) >> 4) << 4 + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] } - in = in[done:] - out = out[done:] out = out[:len(in)] mt := mulTable[c][:256] for i := range in { @@ -78,13 +108,20 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { // slice galois add func sliceXor(in, out []byte, sse2 bool) { - var done int if sse2 { - sSE2XorSlice(in, out) - done = (len(in) >> 4) << 4 + if len(in) >= bigSwitchover { + sSE2XorSlice_64(in, out) + done := (len(in) >> 6) << 6 + in = in[done:] + out = 
out[done:] + } + if len(in) >= 16 { + sSE2XorSlice(in, out) + done := (len(in) >> 4) << 4 + in = in[done:] + out = out[done:] + } } - in = in[done:] - out = out[done:] out = out[:len(in)] for i := range in { out[i] ^= in[i] diff --git a/galois_amd64.s b/galois_amd64.s index b768028..3501110 100644 --- a/galois_amd64.s +++ b/galois_amd64.s @@ -234,3 +234,135 @@ loopback_xor_sse2: done_xor_sse2: RET + +// func galMulAVX2Xor_64(low, high, in, out []byte) +TEXT ·galMulAVX2Xor_64(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + MOVOU (SI), X6 // X6: low + MOVOU (DX), X7 // X7: high + MOVQ in_len+56(FP), R9 // R9: len(in) + + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $6, R9 // len(in) / 64 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_xor_avx2_64 + +loopback_xor_avx2_64: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y10 + VMOVDQU (DX), Y4 + VMOVDQU 32(DX), Y14 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPSRLQ $4, Y10, Y11 // Y11: high input 2 + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y10, Y10 // Y10: low input 2 + VPAND Y8, Y1, Y1 // Y1: high input + VPAND Y8, Y11, Y11 // Y11: high input 2 + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2 + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2 + VPXOR Y3, Y2, Y3 // Y3: Result + VPXOR Y13, Y12, Y13 // Y13: Result 2 + VPXOR Y4, Y3, Y4 // Y4: Result + VPXOR Y14, Y13, Y14 // Y14: Result 2 + VMOVDQU Y4, (DX) + VMOVDQU Y14, 32(DX) + + ADDQ $64, SI // in+=64 + ADDQ $64, DX // out+=64 + SUBQ $1, R9 + JNZ loopback_xor_avx2_64 + +done_xor_avx2_64: + VZEROUPPER + RET + +// func galMulAVX2_64(low, high, in, out []byte) +TEXT ·galMulAVX2_64(SB), 7, $0 + MOVQ low+0(FP), SI // SI: &low + MOVQ high+24(FP), DX // DX: &high + MOVQ $15, BX // BX: low mask + MOVQ BX, X5 + 
MOVOU (SI), X6 // X6: low + MOVOU (DX), X7 // X7: high + MOVQ in_len+56(FP), R9 // R9: len(in) + + VINSERTI128 $1, X6, Y6, Y6 // low + VINSERTI128 $1, X7, Y7, Y7 // high + VPBROADCASTB X5, Y8 // Y8: lomask (unpacked) + + SHRQ $6, R9 // len(in) / 64 + MOVQ out+72(FP), DX // DX: &out + MOVQ in+48(FP), SI // SI: &in + TESTQ R9, R9 + JZ done_avx2_64 + +loopback_avx2_64: + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y10 + VPSRLQ $4, Y0, Y1 // Y1: high input + VPSRLQ $4, Y10, Y11 // Y11: high input 2 + VPAND Y8, Y0, Y0 // Y0: low input + VPAND Y8, Y10, Y10 // Y10: low input 2 + VPAND Y8, Y1, Y1 // Y1: high input + VPAND Y8, Y11, Y11 // Y11: high input 2 + VPSHUFB Y0, Y6, Y2 // Y2: mul low part + VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2 + VPSHUFB Y1, Y7, Y3 // Y3: mul high part + VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2 + VPXOR Y3, Y2, Y4 // Y4: Result + VPXOR Y13, Y12, Y14 // Y14: Result 2 + VMOVDQU Y4, (DX) + VMOVDQU Y14, 32(DX) + + ADDQ $64, SI // in+=64 + ADDQ $64, DX // out+=64 + SUBQ $1, R9 + JNZ loopback_avx2_64 + +done_avx2_64: + VZEROUPPER + RET + +// func sSE2XorSlice_64(in, out []byte) +TEXT ·sSE2XorSlice_64(SB), 7, $0 + MOVQ in+0(FP), SI // SI: &in + MOVQ in_len+8(FP), R9 // R9: len(in) + MOVQ out+24(FP), DX // DX: &out + SHRQ $6, R9 // len(in) / 64 + CMPQ R9, $0 + JEQ done_xor_sse2_64 + +loopback_xor_sse2_64: + MOVOU (SI), X0 // in[x] + MOVOU 16(SI), X2 // in[x] + MOVOU 32(SI), X4 // in[x] + MOVOU 48(SI), X6 // in[x] + MOVOU (DX), X1 // out[x] + MOVOU 16(DX), X3 // out[x] + MOVOU 32(DX), X5 // out[x] + MOVOU 48(DX), X7 // out[x] + PXOR X0, X1 + PXOR X2, X3 + PXOR X4, X5 + PXOR X6, X7 + MOVOU X1, (DX) + MOVOU X3, 16(DX) + MOVOU X5, 32(DX) + MOVOU X7, 48(DX) + ADDQ $64, SI // in+=64 + ADDQ $64, DX // out+=64 + SUBQ $1, R9 + JNZ loopback_xor_sse2_64 + +done_xor_sse2_64: + RET diff --git a/reedsolomon.go b/reedsolomon.go index 999fcc3..9bfbb26 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -503,8 +503,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, 
outputs [][]byte, outpu if do < r.o.minSplitSize { do = r.o.minSplitSize } - // Make sizes divisible by 32 - do = (do + 31) & (^31) + // Make sizes divisible by 64 + do = (do + 63) & (^63) start := 0 for start < byteCount { if start+do > byteCount { @@ -576,8 +576,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp if do < r.o.minSplitSize { do = r.o.minSplitSize } - // Make sizes divisible by 32 - do = (do + 31) & (^31) + // Make sizes divisible by 64 + do = (do + 63) & (^63) start := 0 for start < byteCount { if start+do > byteCount {