reedsolomon-go/galois_amd64.go

141 lines
2.8 KiB
Go
Raw Normal View History

//go:build !noasm && !appengine && !gccgo
// +build !noasm,!appengine,!gccgo
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
// Copyright 2015, Klaus Post, see LICENSE for details.
package reedsolomon
Add AVX2 assembler functions. Benchmarks on a VM (therefore a bit more noisy) benchmark old ns/op new ns/op delta BenchmarkEncode10x2x10000-8 58372 47421 -18.76% BenchmarkEncode100x20x10000-8 2635444 1550511 -41.17% BenchmarkEncode17x3x1M-8 3885495 2231034 -42.58% BenchmarkEncode10x4x16M-8 24180221 21467661 -11.22% BenchmarkEncode5x2x1M-8 2395287 2261452 -5.59% BenchmarkEncode10x2x1M-8 2571278 2566560 -0.18% BenchmarkEncode10x4x1M-8 3396774 3431916 +1.03% BenchmarkEncode50x20x1M-8 27004601 20325731 -24.73% BenchmarkEncode17x3x16M-8 29671393 23668596 -20.23% BenchmarkVerify10x2x10000-8 109730 101519 -7.48% BenchmarkVerify50x5x50000-8 3904166 3101568 -20.56% BenchmarkVerify10x2x1M-8 4398490 4721719 +7.35% BenchmarkVerify5x2x1M-8 3174574 3296440 +3.84% BenchmarkVerify10x4x1M-8 5247394 5346667 +1.89% BenchmarkVerify50x20x1M-8 35742777 26154681 -26.83% BenchmarkVerify10x4x16M-8 52873512 54931253 +3.89% benchmark old MB/s new MB/s speedup BenchmarkEncode10x2x10000-8 1713.14 2108.73 1.23x BenchmarkEncode100x20x10000-8 379.44 644.95 1.70x BenchmarkEncode17x3x1M-8 4587.78 7989.92 1.74x BenchmarkEncode10x4x16M-8 6938.40 7815.11 1.13x BenchmarkEncode5x2x1M-8 2188.83 2318.37 1.06x BenchmarkEncode10x2x1M-8 4078.03 4085.53 1.00x BenchmarkEncode10x4x1M-8 3086.98 3055.37 0.99x BenchmarkEncode50x20x1M-8 1941.48 2579.43 1.33x BenchmarkEncode17x3x16M-8 9612.38 12050.26 1.25x BenchmarkVerify10x2x10000-8 911.32 985.03 1.08x BenchmarkVerify50x5x50000-8 1280.68 1612.09 1.26x BenchmarkVerify10x2x1M-8 2383.94 2220.75 0.93x BenchmarkVerify5x2x1M-8 1651.52 1590.47 0.96x BenchmarkVerify10x4x1M-8 1998.28 1961.18 0.98x BenchmarkVerify50x20x1M-8 1466.84 2004.57 1.37x BenchmarkVerify10x4x16M-8 3173.09 3054.22 0.96x
2015-12-07 15:40:57 +03:00
// galMulSSSE3 is declared without a body; per the reference comment in this
// file it is one of the assembler routines and is equivalent to, for every
// index n of in:
//
//	out[n] = low[in[n]&0xf] ^ high[in[n]>>4]
//
// low and high are lookup tables indexed by the low and high nibble of each
// input byte (index values 0-15). The file notes that the assembler works in
// blocks of 16 bytes.
// NOTE(review): behavior for lengths that are not a multiple of 16, or when
// len(out) < len(in), is not visible here — confirm against the assembly.
//
//go:noescape
func galMulSSSE3(low, high, in, out []byte)
Add AVX2 assembler functions. Benchmarks on a VM (therefore a bit more noisy) benchmark old ns/op new ns/op delta BenchmarkEncode10x2x10000-8 58372 47421 -18.76% BenchmarkEncode100x20x10000-8 2635444 1550511 -41.17% BenchmarkEncode17x3x1M-8 3885495 2231034 -42.58% BenchmarkEncode10x4x16M-8 24180221 21467661 -11.22% BenchmarkEncode5x2x1M-8 2395287 2261452 -5.59% BenchmarkEncode10x2x1M-8 2571278 2566560 -0.18% BenchmarkEncode10x4x1M-8 3396774 3431916 +1.03% BenchmarkEncode50x20x1M-8 27004601 20325731 -24.73% BenchmarkEncode17x3x16M-8 29671393 23668596 -20.23% BenchmarkVerify10x2x10000-8 109730 101519 -7.48% BenchmarkVerify50x5x50000-8 3904166 3101568 -20.56% BenchmarkVerify10x2x1M-8 4398490 4721719 +7.35% BenchmarkVerify5x2x1M-8 3174574 3296440 +3.84% BenchmarkVerify10x4x1M-8 5247394 5346667 +1.89% BenchmarkVerify50x20x1M-8 35742777 26154681 -26.83% BenchmarkVerify10x4x16M-8 52873512 54931253 +3.89% benchmark old MB/s new MB/s speedup BenchmarkEncode10x2x10000-8 1713.14 2108.73 1.23x BenchmarkEncode100x20x10000-8 379.44 644.95 1.70x BenchmarkEncode17x3x1M-8 4587.78 7989.92 1.74x BenchmarkEncode10x4x16M-8 6938.40 7815.11 1.13x BenchmarkEncode5x2x1M-8 2188.83 2318.37 1.06x BenchmarkEncode10x2x1M-8 4078.03 4085.53 1.00x BenchmarkEncode10x4x1M-8 3086.98 3055.37 0.99x BenchmarkEncode50x20x1M-8 1941.48 2579.43 1.33x BenchmarkEncode17x3x16M-8 9612.38 12050.26 1.25x BenchmarkVerify10x2x10000-8 911.32 985.03 1.08x BenchmarkVerify50x5x50000-8 1280.68 1612.09 1.26x BenchmarkVerify10x2x1M-8 2383.94 2220.75 0.93x BenchmarkVerify5x2x1M-8 1651.52 1590.47 0.96x BenchmarkVerify10x4x1M-8 1998.28 1961.18 0.98x BenchmarkVerify50x20x1M-8 1466.84 2004.57 1.37x BenchmarkVerify10x4x16M-8 3173.09 3054.22 0.96x
2015-12-07 15:40:57 +03:00
// galMulSSSE3Xor is declared without a body; per the reference comment in
// this file it is one of the assembler routines and is equivalent to, for
// every index n of in:
//
//	out[n] ^= low[in[n]&0xf] ^ high[in[n]>>4]
//
// That is, the table-lookup result is XORed into the existing contents of out
// rather than overwriting them. low and high are lookup tables indexed by the
// low and high nibble of each input byte (index values 0-15). The file notes
// that the assembler works in blocks of 16 bytes.
// NOTE(review): behavior for lengths that are not a multiple of 16, or when
// len(out) < len(in), is not visible here — confirm against the assembly.
//
//go:noescape
func galMulSSSE3Xor(low, high, in, out []byte)
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
Add AVX2 assembler functions. Benchmarks on a VM (therefore a bit more noisy) benchmark old ns/op new ns/op delta BenchmarkEncode10x2x10000-8 58372 47421 -18.76% BenchmarkEncode100x20x10000-8 2635444 1550511 -41.17% BenchmarkEncode17x3x1M-8 3885495 2231034 -42.58% BenchmarkEncode10x4x16M-8 24180221 21467661 -11.22% BenchmarkEncode5x2x1M-8 2395287 2261452 -5.59% BenchmarkEncode10x2x1M-8 2571278 2566560 -0.18% BenchmarkEncode10x4x1M-8 3396774 3431916 +1.03% BenchmarkEncode50x20x1M-8 27004601 20325731 -24.73% BenchmarkEncode17x3x16M-8 29671393 23668596 -20.23% BenchmarkVerify10x2x10000-8 109730 101519 -7.48% BenchmarkVerify50x5x50000-8 3904166 3101568 -20.56% BenchmarkVerify10x2x1M-8 4398490 4721719 +7.35% BenchmarkVerify5x2x1M-8 3174574 3296440 +3.84% BenchmarkVerify10x4x1M-8 5247394 5346667 +1.89% BenchmarkVerify50x20x1M-8 35742777 26154681 -26.83% BenchmarkVerify10x4x16M-8 52873512 54931253 +3.89% benchmark old MB/s new MB/s speedup BenchmarkEncode10x2x10000-8 1713.14 2108.73 1.23x BenchmarkEncode100x20x10000-8 379.44 644.95 1.70x BenchmarkEncode17x3x1M-8 4587.78 7989.92 1.74x BenchmarkEncode10x4x16M-8 6938.40 7815.11 1.13x BenchmarkEncode5x2x1M-8 2188.83 2318.37 1.06x BenchmarkEncode10x2x1M-8 4078.03 4085.53 1.00x BenchmarkEncode10x4x1M-8 3086.98 3055.37 0.99x BenchmarkEncode50x20x1M-8 1941.48 2579.43 1.33x BenchmarkEncode17x3x16M-8 9612.38 12050.26 1.25x BenchmarkVerify10x2x10000-8 911.32 985.03 1.08x BenchmarkVerify50x5x50000-8 1280.68 1612.09 1.26x BenchmarkVerify10x2x1M-8 2383.94 2220.75 0.93x BenchmarkVerify5x2x1M-8 1651.52 1590.47 0.96x BenchmarkVerify10x4x1M-8 1998.28 1961.18 0.98x BenchmarkVerify50x20x1M-8 1466.84 2004.57 1.37x BenchmarkVerify10x4x16M-8 3173.09 3054.22 0.96x
2015-12-07 15:40:57 +03:00
// galMulAVX2Xor is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): presumably the AVX2 counterpart of galMulSSSE3Xor, i.e.
// out[n] ^= low[in[n]&0xf] ^ high[in[n]>>4] — confirm against the assembly,
// including the block size it processes and any length requirements.
//
//go:noescape
func galMulAVX2Xor(low, high, in, out []byte)
// galMulAVX2 is declared without a body, so its implementation lives outside
// this Go file.
// NOTE(review): presumably the AVX2 counterpart of galMulSSSE3, i.e.
// out[n] = low[in[n]&0xf] ^ high[in[n]>>4] — confirm against the assembly,
// including the block size it processes and any length requirements.
//
//go:noescape
func galMulAVX2(low, high, in, out []byte)
// sSE2XorSlice is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): judging by the name it presumably XORs in into out using
// SSE2 instructions — confirm against the assembly, including any length or
// alignment requirements.
//
//go:noescape
func sSE2XorSlice(in, out []byte)
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
// galMulAVX2Xor_64 is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): presumably the variant of galMulAVX2Xor that handles 64 bytes
// per loop iteration — confirm against the assembly, including what happens
// when the length is not a multiple of 64.
//
//go:noescape
func galMulAVX2Xor_64(low, high, in, out []byte)
// galMulAVX2_64 is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): presumably the variant of galMulAVX2 that handles 64 bytes
// per loop iteration — confirm against the assembly, including what happens
// when the length is not a multiple of 64.
//
//go:noescape
func galMulAVX2_64(low, high, in, out []byte)
// sSE2XorSlice_64 is declared without a body, so its implementation lives
// outside this Go file.
// NOTE(review): presumably the variant of sSE2XorSlice that handles 64 bytes
// per loop iteration — confirm against the assembly, including what happens
// when the length is not a multiple of 64.
//
//go:noescape
func sSE2XorSlice_64(in, out []byte)
// This is what the assembler routines do in blocks of 16 bytes:
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
/*
func galMulSSSE3(low, high, in, out []byte) {
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] = low[l] ^ high[h]
}
}
func galMulSSSE3Xor(low, high, in, out []byte) {
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
for n, input := range in {
l := input & 0xf
h := input >> 4
out[n] ^= low[l] ^ high[h]
}
}
*/
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
// bigSwitchover is the size where 64 bytes are processed per loop.
// NOTE(review): presumably a slice length in bytes at or above which the
// *_64 assembly variants are selected — the comparison that uses this
// constant is not visible here; confirm against the callers.
const bigSwitchover = 128
func galMulSlice(c byte, in, out []byte, o *options) {
if c == 1 {
copy(out, in)
return
}
if o.useAVX2 {
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
if len(in) >= bigSwitchover {
galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
}
if len(in) > 32 {
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done := (len(in) >> 5) << 5
in = in[done:]
out = out[done:]
}
} else if o.useSSSE3 {
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
done := (len(in) >> 4) << 4
in = in[done:]
out = out[done:]
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
}
Remove a bounds check in pure Go (#123) 40% faster on the pure operation. ``` benchmark old ns/op new ns/op delta BenchmarkParallel_8x8x05M-8 2990849 2763554 -7.60% BenchmarkParallel_8x8x1M-8 4941575 5061619 +2.43% BenchmarkParallel_8x8x8M-8 34257722 33192541 -3.11% BenchmarkParallel_8x8x32M-8 143157262 131654688 -8.03% BenchmarkGalois128K-8 64201 38374 -40.23% BenchmarkGalois1M-8 507053 307236 -39.41% BenchmarkGaloisXor128K-8 63815 63157 -1.03% BenchmarkGaloisXor1M-8 506369 505641 -0.14% BenchmarkEncode10x2x10000-8 96414 92781 -3.77% BenchmarkEncode100x20x10000-8 3188549 3238299 +1.56% BenchmarkEncode17x3x1M-8 3741349 3633535 -2.88% BenchmarkEncode10x4x16M-8 41628596 40306100 -3.18% BenchmarkEncode5x2x1M-8 724162 699137 -3.46% BenchmarkEncode10x2x1M-8 1451401 1423224 -1.94% BenchmarkEncode10x4x1M-8 2839382 2740249 -3.49% BenchmarkEncode50x20x1M-8 68415407 67015156 -2.05% BenchmarkEncode17x3x16M-8 53734221 51784418 -3.63% BenchmarkEncode_8x4x8M-8 16826004 16013691 -4.83% BenchmarkEncode_12x4x12M-8 37544203 36392439 -3.07% BenchmarkEncode_16x4x16M-8 66070450 69062838 +4.53% BenchmarkEncode_16x4x32M-8 133905200 130529500 -2.52% BenchmarkEncode_16x4x64M-8 281313400 265809900 -5.51% BenchmarkEncode_8x5x8M-8 20789000 19866553 -4.44% BenchmarkEncode_8x6x8M-8 25027385 25087290 +0.24% BenchmarkEncode_8x7x8M-8 29156578 28231372 -3.17% BenchmarkEncode_8x9x8M-8 37286413 37383431 +0.26% BenchmarkEncode_8x10x8M-8 41722722 39786752 -4.64% BenchmarkEncode_8x11x8M-8 45692118 43409812 -4.99% BenchmarkEncode_8x8x05M-8 2358946 2298631 -2.56% BenchmarkEncode_8x8x1M-8 4551026 4357599 -4.25% BenchmarkEncode_8x8x8M-8 33596074 31951653 -4.89% BenchmarkEncode_8x8x32M-8 135030488 127382850 -5.66% BenchmarkEncode_24x8x24M-8 297317050 301777575 +1.50% BenchmarkEncode_24x8x48M-8 611638100 596134400 -2.53% BenchmarkVerify10x2x10000-8 103723 103523 -0.19% BenchmarkVerify50x5x50000-8 2170780 2148170 -1.04% BenchmarkVerify10x2x1M-8 1693351 1676973 -0.97% BenchmarkVerify5x2x1M-8 997721 995888 
-0.18% BenchmarkVerify10x4x1M-8 3354687 3296939 -1.72% BenchmarkVerify50x20x1M-8 67491300 66890056 -0.89% BenchmarkVerify10x4x16M-8 44195152 44356146 +0.36% BenchmarkReconstruct10x2x10000-8 24720 23373 -5.45% BenchmarkReconstruct50x5x50000-8 880988 858684 -2.53% BenchmarkReconstruct10x2x1M-8 387655 368900 -4.84% BenchmarkReconstruct5x2x1M-8 191067 175841 -7.97% BenchmarkReconstruct10x4x1M-8 1040639 1004731 -3.45% BenchmarkReconstruct50x20x1M-8 28507103 28467956 -0.14% BenchmarkReconstruct10x4x16M-8 15829872 15225654 -3.82% BenchmarkReconstructData10x2x10000-8 24369 23374 -4.08% BenchmarkReconstructData50x5x50000-8 865039 852456 -1.45% BenchmarkReconstructData10x2x1M-8 383240 366751 -4.30% BenchmarkReconstructData5x2x1M-8 183644 170444 -7.19% BenchmarkReconstructData10x4x1M-8 1010537 969151 -4.10% BenchmarkReconstructData50x20x1M-8 28288428 28051051 -0.84% BenchmarkReconstructData10x4x16M-8 15048840 14443250 -4.02% BenchmarkReconstructP10x2x10000-8 3219 3122 -3.01% BenchmarkReconstructP10x5x20000-8 23574 22704 -3.69% BenchmarkSplit10x4x160M-8 2822150 2735071 -3.09% BenchmarkSplit5x2x5M-8 409699 311346 -24.01% BenchmarkSplit10x2x1M-8 43767 40247 -8.04% BenchmarkSplit10x4x10M-8 741097 566888 -23.51% BenchmarkSplit50x20x50M-8 1913475 1682060 -12.09% BenchmarkSplit17x3x272M-8 2059505 2095628 +1.75% BenchmarkStreamEncode10x2x10000-8 8517255 5226284 -38.64% BenchmarkStreamEncode100x20x10000-8 41903836 40969212 -2.23% BenchmarkStreamEncode17x3x1M-8 12038007 14129765 +17.38% BenchmarkStreamEncode10x4x16M-8 56512840 54821895 -2.99% BenchmarkStreamEncode5x2x1M-8 5326508 3966411 -25.53% BenchmarkStreamEncode10x2x1M-8 6924358 6589396 -4.84% BenchmarkStreamEncode10x4x1M-8 9016080 8459049 -6.18% BenchmarkStreamEncode50x20x1M-8 93583042 94021200 +0.47% BenchmarkStreamEncode17x3x16M-8 76643714 74750193 -2.47% BenchmarkStreamVerify10x2x10000-8 8311646 5162179 -37.89% BenchmarkStreamVerify50x5x50000-8 19015944 18352626 -3.49% BenchmarkStreamVerify10x2x1M-8 5738380 5441592 -5.17% 
BenchmarkStreamVerify5x2x1M-8 3462751 3328057 -3.89% BenchmarkStreamVerify10x4x1M-8 6735717 6381116 -5.26% BenchmarkStreamVerify50x20x1M-8 29844543 29416921 -1.43% BenchmarkStreamVerify10x4x16M-8 8512699 8375778 -1.61% benchmark old MB/s new MB/s speedup BenchmarkParallel_8x8x05M-8 1402.38 1517.72 1.08x BenchmarkParallel_8x8x1M-8 1697.56 1657.30 0.98x BenchmarkParallel_8x8x8M-8 1958.94 2021.81 1.03x BenchmarkParallel_8x8x32M-8 1875.11 2038.94 1.09x BenchmarkGalois128K-8 2041.59 3415.64 1.67x BenchmarkGalois1M-8 2067.98 3412.93 1.65x BenchmarkGaloisXor128K-8 2053.92 2075.33 1.01x BenchmarkGaloisXor1M-8 2070.77 2073.76 1.00x BenchmarkEncode10x2x10000-8 1037.19 1077.81 1.04x BenchmarkEncode100x20x10000-8 313.62 308.80 0.98x BenchmarkEncode17x3x1M-8 4764.54 4905.91 1.03x BenchmarkEncode10x4x16M-8 4030.21 4162.45 1.03x BenchmarkEncode5x2x1M-8 7239.93 7499.07 1.04x BenchmarkEncode10x2x1M-8 7224.58 7367.61 1.02x BenchmarkEncode10x4x1M-8 3692.97 3826.57 1.04x BenchmarkEncode50x20x1M-8 766.33 782.34 1.02x BenchmarkEncode17x3x16M-8 5307.84 5507.69 1.04x BenchmarkEncode_8x4x8M-8 3988.40 4190.72 1.05x BenchmarkEncode_12x4x12M-8 4021.79 4149.07 1.03x BenchmarkEncode_16x4x16M-8 4062.87 3886.83 0.96x BenchmarkEncode_16x4x32M-8 4009.34 4113.02 1.03x BenchmarkEncode_16x4x64M-8 3816.89 4039.51 1.06x BenchmarkEncode_8x5x8M-8 3228.09 3377.98 1.05x BenchmarkEncode_8x6x8M-8 2681.42 2675.01 1.00x BenchmarkEncode_8x7x8M-8 2301.67 2377.10 1.03x BenchmarkEncode_8x9x8M-8 1799.82 1795.15 1.00x BenchmarkEncode_8x10x8M-8 1608.45 1686.71 1.05x BenchmarkEncode_8x11x8M-8 1468.72 1545.94 1.05x BenchmarkEncode_8x8x05M-8 1778.04 1824.70 1.03x BenchmarkEncode_8x8x1M-8 1843.23 1925.05 1.04x BenchmarkEncode_8x8x8M-8 1997.52 2100.33 1.05x BenchmarkEncode_8x8x32M-8 1987.96 2107.31 1.06x BenchmarkEncode_24x8x24M-8 2031.43 2001.41 0.99x BenchmarkEncode_24x8x48M-8 1974.96 2026.32 1.03x BenchmarkVerify10x2x10000-8 964.10 965.97 1.00x BenchmarkVerify50x5x50000-8 2303.32 2327.56 1.01x BenchmarkVerify10x2x1M-8 
6192.31 6252.79 1.01x BenchmarkVerify5x2x1M-8 5254.86 5264.53 1.00x BenchmarkVerify10x4x1M-8 3125.70 3180.45 1.02x BenchmarkVerify50x20x1M-8 776.82 783.81 1.01x BenchmarkVerify10x4x16M-8 3796.17 3782.39 1.00x BenchmarkReconstruct10x2x10000-8 4045.30 4278.40 1.06x BenchmarkReconstruct50x5x50000-8 5675.45 5822.87 1.03x BenchmarkReconstruct10x2x1M-8 27049.21 28424.40 1.05x BenchmarkReconstruct5x2x1M-8 27440.02 29815.96 1.09x BenchmarkReconstruct10x4x1M-8 10076.27 10436.39 1.04x BenchmarkReconstruct50x20x1M-8 1839.15 1841.68 1.00x BenchmarkReconstruct10x4x16M-8 10598.45 11019.04 1.04x BenchmarkReconstructData10x2x10000-8 4103.60 4278.25 1.04x BenchmarkReconstructData50x5x50000-8 5780.09 5865.40 1.01x BenchmarkReconstructData10x2x1M-8 27360.79 28590.95 1.04x BenchmarkReconstructData5x2x1M-8 28549.19 30760.16 1.08x BenchmarkReconstructData10x4x1M-8 10376.42 10819.53 1.04x BenchmarkReconstructData50x20x1M-8 1853.37 1869.05 1.01x BenchmarkReconstructData10x4x16M-8 11148.51 11615.96 1.04x BenchmarkReconstructP10x2x10000-8 31068.70 32026.22 1.03x BenchmarkReconstructP10x5x20000-8 8484.08 8808.93 1.04x BenchmarkStreamEncode10x2x10000-8 11.74 19.13 1.63x BenchmarkStreamEncode100x20x10000-8 23.86 24.41 1.02x BenchmarkStreamEncode17x3x1M-8 1480.79 1261.58 0.85x BenchmarkStreamEncode10x4x16M-8 2968.74 3060.31 1.03x BenchmarkStreamEncode5x2x1M-8 984.30 1321.82 1.34x BenchmarkStreamEncode10x2x1M-8 1514.33 1591.31 1.05x BenchmarkStreamEncode10x4x1M-8 1163.01 1239.59 1.07x BenchmarkStreamEncode50x20x1M-8 560.24 557.63 1.00x BenchmarkStreamEncode17x3x16M-8 3721.28 3815.54 1.03x BenchmarkStreamVerify10x2x10000-8 12.03 19.37 1.61x BenchmarkStreamVerify50x5x50000-8 262.94 272.44 1.04x BenchmarkStreamVerify10x2x1M-8 1827.30 1926.97 1.05x BenchmarkStreamVerify5x2x1M-8 1514.08 1575.36 1.04x BenchmarkStreamVerify10x4x1M-8 1556.74 1643.25 1.06x BenchmarkStreamVerify50x20x1M-8 1756.73 1782.27 1.01x BenchmarkStreamVerify10x4x16M-8 19708.46 20030.64 1.02x ```
2020-05-03 20:38:55 +03:00
out = out[:len(in)]
mt := mulTable[c][:256]
for i := range in {
out[i] = mt[in[i]]
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
}
}
func galMulSliceXor(c byte, in, out []byte, o *options) {
if c == 1 {
sliceXor(in, out, o)
return
}
if o.useAVX2 {
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
if len(in) >= bigSwitchover {
galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
}
if len(in) >= 32 {
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
done := (len(in) >> 5) << 5
in = in[done:]
out = out[done:]
}
} else if o.useSSSE3 {
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
done := (len(in) >> 4) << 4
in = in[done:]
out = out[done:]
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
}
avx2: Improve speed when > 10 input or output shards. (#174) Speeds are including a limiting the number of goroutines with all AVX2 paths, Before/after ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2240 2240 +0.00% BenchmarkGalois1M-32 19578 18891 -3.51% BenchmarkGaloisXor128K-32 2798 2852 +1.93% BenchmarkGaloisXor1M-32 23334 23345 +0.05% BenchmarkEncode2x1x1M-32 34357 34370 +0.04% BenchmarkEncode10x2x10000-32 3210 3093 -3.64% BenchmarkEncode100x20x10000-32 362925 148214 -59.16% BenchmarkEncode17x3x1M-32 323767 224157 -30.77% BenchmarkEncode10x4x16M-32 8376895 8376737 -0.00% BenchmarkEncode5x2x1M-32 68365 66861 -2.20% BenchmarkEncode10x2x1M-32 101407 93023 -8.27% BenchmarkEncode10x4x1M-32 171880 155477 -9.54% BenchmarkEncode50x20x1M-32 3704691 3015047 -18.62% BenchmarkEncode17x3x16M-32 10279233 10106658 -1.68% BenchmarkEncode_8x4x8M-32 3438245 3326479 -3.25% BenchmarkEncode_12x4x12M-32 6632257 6581637 -0.76% BenchmarkEncode_16x4x16M-32 10815755 10788377 -0.25% BenchmarkEncode_16x4x32M-32 21029061 21507995 +2.28% BenchmarkEncode_16x4x64M-32 42145450 43876850 +4.11% BenchmarkEncode_8x5x8M-32 4543208 3846378 -15.34% BenchmarkEncode_8x6x8M-32 5065494 4397218 -13.19% BenchmarkEncode_8x7x8M-32 5818995 4962884 -14.71% BenchmarkEncode_8x9x8M-32 6215449 6114898 -1.62% BenchmarkEncode_8x10x8M-32 6923415 6610501 -4.52% BenchmarkEncode_8x11x8M-32 7365988 7010473 -4.83% BenchmarkEncode_8x8x05M-32 150857 136820 -9.30% BenchmarkEncode_8x8x1M-32 256722 254854 -0.73% BenchmarkEncode_8x8x8M-32 5547790 5422048 -2.27% BenchmarkEncode_8x8x32M-32 23038643 22705859 -1.44% BenchmarkEncode_24x8x24M-32 27729259 30332216 +9.39% BenchmarkEncode_24x8x48M-32 53865705 61187658 +13.59% BenchmarkVerify10x2x10000-32 8769 8154 -7.01% BenchmarkVerify10x2x1M-32 516149 476180 -7.74% BenchmarkVerify5x2x1M-32 443888 419541 -5.48% BenchmarkVerify10x4x1M-32 1030299 948021 -7.99% BenchmarkVerify50x20x1M-32 7209689 6186891 -14.19% BenchmarkVerify10x4x16M-32 17774456 17681879 -0.52% 
BenchmarkReconstruct10x2x10000-32 3352 3256 -2.86% BenchmarkReconstruct50x5x50000-32 166417 140900 -15.33% BenchmarkReconstruct10x2x1M-32 189711 174615 -7.96% BenchmarkReconstruct5x2x1M-32 128080 126520 -1.22% BenchmarkReconstruct10x4x1M-32 273312 254017 -7.06% BenchmarkReconstruct50x20x1M-32 3628812 3192474 -12.02% BenchmarkReconstruct10x4x16M-32 8562186 8781479 +2.56% BenchmarkReconstructData10x2x10000-32 3241 3116 -3.86% BenchmarkReconstructData50x5x50000-32 162520 134794 -17.06% BenchmarkReconstructData10x2x1M-32 171253 161955 -5.43% BenchmarkReconstructData5x2x1M-32 102215 106942 +4.62% BenchmarkReconstructData10x4x1M-32 225593 219969 -2.49% BenchmarkReconstructData50x20x1M-32 2515311 2129721 -15.33% BenchmarkReconstructData10x4x16M-32 6980308 6698111 -4.04% BenchmarkReconstructP10x2x10000-32 924 937 +1.35% BenchmarkReconstructP10x5x20000-32 1639 1703 +3.90% BenchmarkSplit10x4x160M-32 4984993 4898045 -1.74% BenchmarkSplit5x2x5M-32 380415 221446 -41.79% BenchmarkSplit10x2x1M-32 58761 53335 -9.23% BenchmarkSplit10x4x10M-32 643188 410959 -36.11% BenchmarkSplit50x20x50M-32 1843879 1647205 -10.67% BenchmarkSplit17x3x272M-32 3684920 3613951 -1.93% BenchmarkParallel_8x8x64K-32 7022 6630 -5.58% BenchmarkParallel_8x8x05M-32 348308 348369 +0.02% BenchmarkParallel_20x10x05M-32 575672 581028 +0.93% BenchmarkParallel_8x8x1M-32 716033 697167 -2.63% BenchmarkParallel_8x8x8M-32 5716048 5616437 -1.74% BenchmarkParallel_8x8x32M-32 22650878 22098667 -2.44% BenchmarkParallel_8x3x1M-32 406839 399125 -1.90% BenchmarkParallel_8x4x1M-32 459107 463890 +1.04% BenchmarkParallel_8x5x1M-32 527488 520334 -1.36% BenchmarkStreamEncode10x2x10000-32 6013 5878 -2.25% BenchmarkStreamEncode100x20x10000-32 503124 267894 -46.75% BenchmarkStreamEncode17x3x1M-32 1561838 1376618 -11.86% BenchmarkStreamEncode10x4x16M-32 19124427 17762582 -7.12% BenchmarkStreamEncode5x2x1M-32 429701 384666 -10.48% BenchmarkStreamEncode10x2x1M-32 801257 763637 -4.70% BenchmarkStreamEncode10x4x1M-32 876065 820744 -6.31% 
BenchmarkStreamEncode50x20x1M-32 7205112 6081398 -15.60% BenchmarkStreamEncode17x3x16M-32 27182786 26117143 -3.92% BenchmarkStreamVerify10x2x10000-32 13767 14026 +1.88% BenchmarkStreamVerify50x5x50000-32 826983 690453 -16.51% BenchmarkStreamVerify10x2x1M-32 1238566 1182591 -4.52% BenchmarkStreamVerify5x2x1M-32 892661 806301 -9.67% BenchmarkStreamVerify10x4x1M-32 1676394 1631495 -2.68% BenchmarkStreamVerify50x20x1M-32 10877875 10037678 -7.72% BenchmarkStreamVerify10x4x16M-32 27599576 30435400 +10.27% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 58518.53 58510.17 1.00x BenchmarkGalois1M-32 53558.10 55507.44 1.04x BenchmarkGaloisXor128K-32 46839.74 45961.09 0.98x BenchmarkGaloisXor1M-32 44936.98 44917.46 1.00x BenchmarkEncode2x1x1M-32 91561.27 91524.11 1.00x BenchmarkEncode10x2x10000-32 37385.54 38792.54 1.04x BenchmarkEncode100x20x10000-32 3306.47 8096.40 2.45x BenchmarkEncode17x3x1M-32 64773.49 93557.14 1.44x BenchmarkEncode10x4x16M-32 28039.15 28039.68 1.00x BenchmarkEncode5x2x1M-32 107365.88 109781.16 1.02x BenchmarkEncode10x2x1M-32 124083.62 135266.27 1.09x BenchmarkEncode10x4x1M-32 85408.99 94419.71 1.11x BenchmarkEncode50x20x1M-32 19812.81 24344.67 1.23x BenchmarkEncode17x3x16M-32 32642.93 33200.32 1.02x BenchmarkEncode_8x4x8M-32 29277.52 30261.21 1.03x BenchmarkEncode_12x4x12M-32 30355.67 30589.14 1.01x BenchmarkEncode_16x4x16M-32 31023.66 31102.39 1.00x BenchmarkEncode_16x4x32M-32 31912.44 31201.82 0.98x BenchmarkEncode_16x4x64M-32 31846.32 30589.65 0.96x BenchmarkEncode_8x5x8M-32 24003.28 28351.84 1.18x BenchmarkEncode_8x6x8M-32 23184.41 26707.91 1.15x BenchmarkEncode_8x7x8M-32 21623.86 25354.03 1.17x BenchmarkEncode_8x9x8M-32 22943.85 23321.13 1.02x BenchmarkEncode_8x10x8M-32 21809.31 22841.68 1.05x BenchmarkEncode_8x11x8M-32 21637.77 22735.06 1.05x BenchmarkEncode_8x8x05M-32 55606.22 61311.47 1.10x BenchmarkEncode_8x8x1M-32 65351.80 65830.73 1.01x BenchmarkEncode_8x8x8M-32 24193.01 24754.07 1.02x BenchmarkEncode_8x8x32M-32 23303.06 23644.60 
1.01x BenchmarkEncode_24x8x24M-32 29041.76 26549.54 0.91x BenchmarkEncode_24x8x48M-32 29900.52 26322.51 0.88x BenchmarkVerify10x2x10000-32 13685.12 14717.10 1.08x BenchmarkVerify10x2x1M-32 24378.43 26424.72 1.08x BenchmarkVerify5x2x1M-32 16535.79 17495.41 1.06x BenchmarkVerify10x4x1M-32 14248.35 15484.96 1.09x BenchmarkVerify50x20x1M-32 10180.79 11863.85 1.17x BenchmarkVerify10x4x16M-32 13214.53 13283.71 1.01x BenchmarkReconstruct10x2x10000-32 35799.16 36854.89 1.03x BenchmarkReconstruct50x5x50000-32 33049.47 39034.89 1.18x BenchmarkReconstruct10x2x1M-32 66326.88 72061.06 1.09x BenchmarkReconstruct5x2x1M-32 57308.21 58014.92 1.01x BenchmarkReconstruct10x4x1M-32 53711.74 57791.66 1.08x BenchmarkReconstruct50x20x1M-32 20227.09 22991.67 1.14x BenchmarkReconstruct10x4x16M-32 27432.37 26747.32 0.98x BenchmarkReconstructData10x2x10000-32 37030.86 38511.87 1.04x BenchmarkReconstructData50x5x50000-32 33842.07 40802.85 1.21x BenchmarkReconstructData10x2x1M-32 73475.57 77693.87 1.06x BenchmarkReconstructData5x2x1M-32 71809.58 68635.57 0.96x BenchmarkReconstructData10x4x1M-32 65073.27 66736.88 1.03x BenchmarkReconstructData50x20x1M-32 29181.41 34464.76 1.18x BenchmarkReconstructData10x4x16M-32 33649.09 35066.75 1.04x BenchmarkReconstructP10x2x10000-32 129819.98 128086.76 0.99x BenchmarkReconstructP10x5x20000-32 183073.89 176202.21 0.96x BenchmarkParallel_8x8x64K-32 149327.33 158153.67 1.06x BenchmarkParallel_8x8x05M-32 24083.89 24079.69 1.00x BenchmarkParallel_20x10x05M-32 27322.20 27070.35 0.99x BenchmarkParallel_8x8x1M-32 23430.78 24064.83 1.03x BenchmarkParallel_8x8x8M-32 23480.86 23897.31 1.02x BenchmarkParallel_8x8x32M-32 23701.99 24294.27 1.02x BenchmarkParallel_8x3x1M-32 28351.11 28899.03 1.02x BenchmarkParallel_8x4x1M-32 27407.34 27124.76 0.99x BenchmarkParallel_8x5x1M-32 25842.27 26197.58 1.01x BenchmarkStreamEncode10x2x10000-32 16629.76 17012.26 1.02x BenchmarkStreamEncode100x20x10000-32 1987.58 3732.83 1.88x BenchmarkStreamEncode17x3x1M-32 11413.34 12948.97 1.13x 
BenchmarkStreamEncode10x4x16M-32 8772.66 9445.26 1.08x BenchmarkStreamEncode5x2x1M-32 12201.21 13629.70 1.12x BenchmarkStreamEncode10x2x1M-32 13086.64 13731.34 1.05x BenchmarkStreamEncode10x4x1M-32 11969.16 12775.92 1.07x BenchmarkStreamEncode50x20x1M-32 7276.61 8621.18 1.18x BenchmarkStreamEncode17x3x16M-32 10492.40 10920.52 1.04x BenchmarkStreamVerify10x2x10000-32 7264.00 7129.49 0.98x BenchmarkStreamVerify50x5x50000-32 6046.07 7241.62 1.20x BenchmarkStreamVerify10x2x1M-32 8466.05 8866.77 1.05x BenchmarkStreamVerify5x2x1M-32 5873.31 6502.39 1.11x BenchmarkStreamVerify10x4x1M-32 6254.95 6427.09 1.03x BenchmarkStreamVerify50x20x1M-32 4819.76 5223.20 1.08x BenchmarkStreamVerify10x4x16M-32 6078.79 5512.40 0.91x ```
2021-12-09 14:28:44 +03:00
if len(in) == 0 {
return
}
Remove a bounds check in pure Go (#123) 40% faster on the pure operation. ``` benchmark old ns/op new ns/op delta BenchmarkParallel_8x8x05M-8 2990849 2763554 -7.60% BenchmarkParallel_8x8x1M-8 4941575 5061619 +2.43% BenchmarkParallel_8x8x8M-8 34257722 33192541 -3.11% BenchmarkParallel_8x8x32M-8 143157262 131654688 -8.03% BenchmarkGalois128K-8 64201 38374 -40.23% BenchmarkGalois1M-8 507053 307236 -39.41% BenchmarkGaloisXor128K-8 63815 63157 -1.03% BenchmarkGaloisXor1M-8 506369 505641 -0.14% BenchmarkEncode10x2x10000-8 96414 92781 -3.77% BenchmarkEncode100x20x10000-8 3188549 3238299 +1.56% BenchmarkEncode17x3x1M-8 3741349 3633535 -2.88% BenchmarkEncode10x4x16M-8 41628596 40306100 -3.18% BenchmarkEncode5x2x1M-8 724162 699137 -3.46% BenchmarkEncode10x2x1M-8 1451401 1423224 -1.94% BenchmarkEncode10x4x1M-8 2839382 2740249 -3.49% BenchmarkEncode50x20x1M-8 68415407 67015156 -2.05% BenchmarkEncode17x3x16M-8 53734221 51784418 -3.63% BenchmarkEncode_8x4x8M-8 16826004 16013691 -4.83% BenchmarkEncode_12x4x12M-8 37544203 36392439 -3.07% BenchmarkEncode_16x4x16M-8 66070450 69062838 +4.53% BenchmarkEncode_16x4x32M-8 133905200 130529500 -2.52% BenchmarkEncode_16x4x64M-8 281313400 265809900 -5.51% BenchmarkEncode_8x5x8M-8 20789000 19866553 -4.44% BenchmarkEncode_8x6x8M-8 25027385 25087290 +0.24% BenchmarkEncode_8x7x8M-8 29156578 28231372 -3.17% BenchmarkEncode_8x9x8M-8 37286413 37383431 +0.26% BenchmarkEncode_8x10x8M-8 41722722 39786752 -4.64% BenchmarkEncode_8x11x8M-8 45692118 43409812 -4.99% BenchmarkEncode_8x8x05M-8 2358946 2298631 -2.56% BenchmarkEncode_8x8x1M-8 4551026 4357599 -4.25% BenchmarkEncode_8x8x8M-8 33596074 31951653 -4.89% BenchmarkEncode_8x8x32M-8 135030488 127382850 -5.66% BenchmarkEncode_24x8x24M-8 297317050 301777575 +1.50% BenchmarkEncode_24x8x48M-8 611638100 596134400 -2.53% BenchmarkVerify10x2x10000-8 103723 103523 -0.19% BenchmarkVerify50x5x50000-8 2170780 2148170 -1.04% BenchmarkVerify10x2x1M-8 1693351 1676973 -0.97% BenchmarkVerify5x2x1M-8 997721 995888 
-0.18% BenchmarkVerify10x4x1M-8 3354687 3296939 -1.72% BenchmarkVerify50x20x1M-8 67491300 66890056 -0.89% BenchmarkVerify10x4x16M-8 44195152 44356146 +0.36% BenchmarkReconstruct10x2x10000-8 24720 23373 -5.45% BenchmarkReconstruct50x5x50000-8 880988 858684 -2.53% BenchmarkReconstruct10x2x1M-8 387655 368900 -4.84% BenchmarkReconstruct5x2x1M-8 191067 175841 -7.97% BenchmarkReconstruct10x4x1M-8 1040639 1004731 -3.45% BenchmarkReconstruct50x20x1M-8 28507103 28467956 -0.14% BenchmarkReconstruct10x4x16M-8 15829872 15225654 -3.82% BenchmarkReconstructData10x2x10000-8 24369 23374 -4.08% BenchmarkReconstructData50x5x50000-8 865039 852456 -1.45% BenchmarkReconstructData10x2x1M-8 383240 366751 -4.30% BenchmarkReconstructData5x2x1M-8 183644 170444 -7.19% BenchmarkReconstructData10x4x1M-8 1010537 969151 -4.10% BenchmarkReconstructData50x20x1M-8 28288428 28051051 -0.84% BenchmarkReconstructData10x4x16M-8 15048840 14443250 -4.02% BenchmarkReconstructP10x2x10000-8 3219 3122 -3.01% BenchmarkReconstructP10x5x20000-8 23574 22704 -3.69% BenchmarkSplit10x4x160M-8 2822150 2735071 -3.09% BenchmarkSplit5x2x5M-8 409699 311346 -24.01% BenchmarkSplit10x2x1M-8 43767 40247 -8.04% BenchmarkSplit10x4x10M-8 741097 566888 -23.51% BenchmarkSplit50x20x50M-8 1913475 1682060 -12.09% BenchmarkSplit17x3x272M-8 2059505 2095628 +1.75% BenchmarkStreamEncode10x2x10000-8 8517255 5226284 -38.64% BenchmarkStreamEncode100x20x10000-8 41903836 40969212 -2.23% BenchmarkStreamEncode17x3x1M-8 12038007 14129765 +17.38% BenchmarkStreamEncode10x4x16M-8 56512840 54821895 -2.99% BenchmarkStreamEncode5x2x1M-8 5326508 3966411 -25.53% BenchmarkStreamEncode10x2x1M-8 6924358 6589396 -4.84% BenchmarkStreamEncode10x4x1M-8 9016080 8459049 -6.18% BenchmarkStreamEncode50x20x1M-8 93583042 94021200 +0.47% BenchmarkStreamEncode17x3x16M-8 76643714 74750193 -2.47% BenchmarkStreamVerify10x2x10000-8 8311646 5162179 -37.89% BenchmarkStreamVerify50x5x50000-8 19015944 18352626 -3.49% BenchmarkStreamVerify10x2x1M-8 5738380 5441592 -5.17% 
BenchmarkStreamVerify5x2x1M-8 3462751 3328057 -3.89% BenchmarkStreamVerify10x4x1M-8 6735717 6381116 -5.26% BenchmarkStreamVerify50x20x1M-8 29844543 29416921 -1.43% BenchmarkStreamVerify10x4x16M-8 8512699 8375778 -1.61% benchmark old MB/s new MB/s speedup BenchmarkParallel_8x8x05M-8 1402.38 1517.72 1.08x BenchmarkParallel_8x8x1M-8 1697.56 1657.30 0.98x BenchmarkParallel_8x8x8M-8 1958.94 2021.81 1.03x BenchmarkParallel_8x8x32M-8 1875.11 2038.94 1.09x BenchmarkGalois128K-8 2041.59 3415.64 1.67x BenchmarkGalois1M-8 2067.98 3412.93 1.65x BenchmarkGaloisXor128K-8 2053.92 2075.33 1.01x BenchmarkGaloisXor1M-8 2070.77 2073.76 1.00x BenchmarkEncode10x2x10000-8 1037.19 1077.81 1.04x BenchmarkEncode100x20x10000-8 313.62 308.80 0.98x BenchmarkEncode17x3x1M-8 4764.54 4905.91 1.03x BenchmarkEncode10x4x16M-8 4030.21 4162.45 1.03x BenchmarkEncode5x2x1M-8 7239.93 7499.07 1.04x BenchmarkEncode10x2x1M-8 7224.58 7367.61 1.02x BenchmarkEncode10x4x1M-8 3692.97 3826.57 1.04x BenchmarkEncode50x20x1M-8 766.33 782.34 1.02x BenchmarkEncode17x3x16M-8 5307.84 5507.69 1.04x BenchmarkEncode_8x4x8M-8 3988.40 4190.72 1.05x BenchmarkEncode_12x4x12M-8 4021.79 4149.07 1.03x BenchmarkEncode_16x4x16M-8 4062.87 3886.83 0.96x BenchmarkEncode_16x4x32M-8 4009.34 4113.02 1.03x BenchmarkEncode_16x4x64M-8 3816.89 4039.51 1.06x BenchmarkEncode_8x5x8M-8 3228.09 3377.98 1.05x BenchmarkEncode_8x6x8M-8 2681.42 2675.01 1.00x BenchmarkEncode_8x7x8M-8 2301.67 2377.10 1.03x BenchmarkEncode_8x9x8M-8 1799.82 1795.15 1.00x BenchmarkEncode_8x10x8M-8 1608.45 1686.71 1.05x BenchmarkEncode_8x11x8M-8 1468.72 1545.94 1.05x BenchmarkEncode_8x8x05M-8 1778.04 1824.70 1.03x BenchmarkEncode_8x8x1M-8 1843.23 1925.05 1.04x BenchmarkEncode_8x8x8M-8 1997.52 2100.33 1.05x BenchmarkEncode_8x8x32M-8 1987.96 2107.31 1.06x BenchmarkEncode_24x8x24M-8 2031.43 2001.41 0.99x BenchmarkEncode_24x8x48M-8 1974.96 2026.32 1.03x BenchmarkVerify10x2x10000-8 964.10 965.97 1.00x BenchmarkVerify50x5x50000-8 2303.32 2327.56 1.01x BenchmarkVerify10x2x1M-8 
6192.31 6252.79 1.01x BenchmarkVerify5x2x1M-8 5254.86 5264.53 1.00x BenchmarkVerify10x4x1M-8 3125.70 3180.45 1.02x BenchmarkVerify50x20x1M-8 776.82 783.81 1.01x BenchmarkVerify10x4x16M-8 3796.17 3782.39 1.00x BenchmarkReconstruct10x2x10000-8 4045.30 4278.40 1.06x BenchmarkReconstruct50x5x50000-8 5675.45 5822.87 1.03x BenchmarkReconstruct10x2x1M-8 27049.21 28424.40 1.05x BenchmarkReconstruct5x2x1M-8 27440.02 29815.96 1.09x BenchmarkReconstruct10x4x1M-8 10076.27 10436.39 1.04x BenchmarkReconstruct50x20x1M-8 1839.15 1841.68 1.00x BenchmarkReconstruct10x4x16M-8 10598.45 11019.04 1.04x BenchmarkReconstructData10x2x10000-8 4103.60 4278.25 1.04x BenchmarkReconstructData50x5x50000-8 5780.09 5865.40 1.01x BenchmarkReconstructData10x2x1M-8 27360.79 28590.95 1.04x BenchmarkReconstructData5x2x1M-8 28549.19 30760.16 1.08x BenchmarkReconstructData10x4x1M-8 10376.42 10819.53 1.04x BenchmarkReconstructData50x20x1M-8 1853.37 1869.05 1.01x BenchmarkReconstructData10x4x16M-8 11148.51 11615.96 1.04x BenchmarkReconstructP10x2x10000-8 31068.70 32026.22 1.03x BenchmarkReconstructP10x5x20000-8 8484.08 8808.93 1.04x BenchmarkStreamEncode10x2x10000-8 11.74 19.13 1.63x BenchmarkStreamEncode100x20x10000-8 23.86 24.41 1.02x BenchmarkStreamEncode17x3x1M-8 1480.79 1261.58 0.85x BenchmarkStreamEncode10x4x16M-8 2968.74 3060.31 1.03x BenchmarkStreamEncode5x2x1M-8 984.30 1321.82 1.34x BenchmarkStreamEncode10x2x1M-8 1514.33 1591.31 1.05x BenchmarkStreamEncode10x4x1M-8 1163.01 1239.59 1.07x BenchmarkStreamEncode50x20x1M-8 560.24 557.63 1.00x BenchmarkStreamEncode17x3x16M-8 3721.28 3815.54 1.03x BenchmarkStreamVerify10x2x10000-8 12.03 19.37 1.61x BenchmarkStreamVerify50x5x50000-8 262.94 272.44 1.04x BenchmarkStreamVerify10x2x1M-8 1827.30 1926.97 1.05x BenchmarkStreamVerify5x2x1M-8 1514.08 1575.36 1.04x BenchmarkStreamVerify10x4x1M-8 1556.74 1643.25 1.06x BenchmarkStreamVerify50x20x1M-8 1756.73 1782.27 1.01x BenchmarkStreamVerify10x4x16M-8 19708.46 20030.64 1.02x ```
2020-05-03 20:38:55 +03:00
out = out[:len(in)]
mt := mulTable[c][:256]
for i := range in {
out[i] ^= mt[in[i]]
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
}
}
// simple slice xor
func sliceXor(in, out []byte, o *options) {
if o.useSSE2 {
avx2: Add 64 bytes per loop processing (#128) * avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 
-3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 -5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% 
BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x 
BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 
14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```
2020-05-05 17:36:01 +03:00
if len(in) >= bigSwitchover {
sSE2XorSlice_64(in, out)
done := (len(in) >> 6) << 6
in = in[done:]
out = out[done:]
}
if len(in) >= 16 {
sSE2XorSlice(in, out)
done := (len(in) >> 4) << 4
in = in[done:]
out = out[done:]
}
}
Remove a bounds check in pure Go (#123) 40% faster on the pure operation. ``` benchmark old ns/op new ns/op delta BenchmarkParallel_8x8x05M-8 2990849 2763554 -7.60% BenchmarkParallel_8x8x1M-8 4941575 5061619 +2.43% BenchmarkParallel_8x8x8M-8 34257722 33192541 -3.11% BenchmarkParallel_8x8x32M-8 143157262 131654688 -8.03% BenchmarkGalois128K-8 64201 38374 -40.23% BenchmarkGalois1M-8 507053 307236 -39.41% BenchmarkGaloisXor128K-8 63815 63157 -1.03% BenchmarkGaloisXor1M-8 506369 505641 -0.14% BenchmarkEncode10x2x10000-8 96414 92781 -3.77% BenchmarkEncode100x20x10000-8 3188549 3238299 +1.56% BenchmarkEncode17x3x1M-8 3741349 3633535 -2.88% BenchmarkEncode10x4x16M-8 41628596 40306100 -3.18% BenchmarkEncode5x2x1M-8 724162 699137 -3.46% BenchmarkEncode10x2x1M-8 1451401 1423224 -1.94% BenchmarkEncode10x4x1M-8 2839382 2740249 -3.49% BenchmarkEncode50x20x1M-8 68415407 67015156 -2.05% BenchmarkEncode17x3x16M-8 53734221 51784418 -3.63% BenchmarkEncode_8x4x8M-8 16826004 16013691 -4.83% BenchmarkEncode_12x4x12M-8 37544203 36392439 -3.07% BenchmarkEncode_16x4x16M-8 66070450 69062838 +4.53% BenchmarkEncode_16x4x32M-8 133905200 130529500 -2.52% BenchmarkEncode_16x4x64M-8 281313400 265809900 -5.51% BenchmarkEncode_8x5x8M-8 20789000 19866553 -4.44% BenchmarkEncode_8x6x8M-8 25027385 25087290 +0.24% BenchmarkEncode_8x7x8M-8 29156578 28231372 -3.17% BenchmarkEncode_8x9x8M-8 37286413 37383431 +0.26% BenchmarkEncode_8x10x8M-8 41722722 39786752 -4.64% BenchmarkEncode_8x11x8M-8 45692118 43409812 -4.99% BenchmarkEncode_8x8x05M-8 2358946 2298631 -2.56% BenchmarkEncode_8x8x1M-8 4551026 4357599 -4.25% BenchmarkEncode_8x8x8M-8 33596074 31951653 -4.89% BenchmarkEncode_8x8x32M-8 135030488 127382850 -5.66% BenchmarkEncode_24x8x24M-8 297317050 301777575 +1.50% BenchmarkEncode_24x8x48M-8 611638100 596134400 -2.53% BenchmarkVerify10x2x10000-8 103723 103523 -0.19% BenchmarkVerify50x5x50000-8 2170780 2148170 -1.04% BenchmarkVerify10x2x1M-8 1693351 1676973 -0.97% BenchmarkVerify5x2x1M-8 997721 995888 
-0.18% BenchmarkVerify10x4x1M-8 3354687 3296939 -1.72% BenchmarkVerify50x20x1M-8 67491300 66890056 -0.89% BenchmarkVerify10x4x16M-8 44195152 44356146 +0.36% BenchmarkReconstruct10x2x10000-8 24720 23373 -5.45% BenchmarkReconstruct50x5x50000-8 880988 858684 -2.53% BenchmarkReconstruct10x2x1M-8 387655 368900 -4.84% BenchmarkReconstruct5x2x1M-8 191067 175841 -7.97% BenchmarkReconstruct10x4x1M-8 1040639 1004731 -3.45% BenchmarkReconstruct50x20x1M-8 28507103 28467956 -0.14% BenchmarkReconstruct10x4x16M-8 15829872 15225654 -3.82% BenchmarkReconstructData10x2x10000-8 24369 23374 -4.08% BenchmarkReconstructData50x5x50000-8 865039 852456 -1.45% BenchmarkReconstructData10x2x1M-8 383240 366751 -4.30% BenchmarkReconstructData5x2x1M-8 183644 170444 -7.19% BenchmarkReconstructData10x4x1M-8 1010537 969151 -4.10% BenchmarkReconstructData50x20x1M-8 28288428 28051051 -0.84% BenchmarkReconstructData10x4x16M-8 15048840 14443250 -4.02% BenchmarkReconstructP10x2x10000-8 3219 3122 -3.01% BenchmarkReconstructP10x5x20000-8 23574 22704 -3.69% BenchmarkSplit10x4x160M-8 2822150 2735071 -3.09% BenchmarkSplit5x2x5M-8 409699 311346 -24.01% BenchmarkSplit10x2x1M-8 43767 40247 -8.04% BenchmarkSplit10x4x10M-8 741097 566888 -23.51% BenchmarkSplit50x20x50M-8 1913475 1682060 -12.09% BenchmarkSplit17x3x272M-8 2059505 2095628 +1.75% BenchmarkStreamEncode10x2x10000-8 8517255 5226284 -38.64% BenchmarkStreamEncode100x20x10000-8 41903836 40969212 -2.23% BenchmarkStreamEncode17x3x1M-8 12038007 14129765 +17.38% BenchmarkStreamEncode10x4x16M-8 56512840 54821895 -2.99% BenchmarkStreamEncode5x2x1M-8 5326508 3966411 -25.53% BenchmarkStreamEncode10x2x1M-8 6924358 6589396 -4.84% BenchmarkStreamEncode10x4x1M-8 9016080 8459049 -6.18% BenchmarkStreamEncode50x20x1M-8 93583042 94021200 +0.47% BenchmarkStreamEncode17x3x16M-8 76643714 74750193 -2.47% BenchmarkStreamVerify10x2x10000-8 8311646 5162179 -37.89% BenchmarkStreamVerify50x5x50000-8 19015944 18352626 -3.49% BenchmarkStreamVerify10x2x1M-8 5738380 5441592 -5.17% 
BenchmarkStreamVerify5x2x1M-8 3462751 3328057 -3.89% BenchmarkStreamVerify10x4x1M-8 6735717 6381116 -5.26% BenchmarkStreamVerify50x20x1M-8 29844543 29416921 -1.43% BenchmarkStreamVerify10x4x16M-8 8512699 8375778 -1.61% benchmark old MB/s new MB/s speedup BenchmarkParallel_8x8x05M-8 1402.38 1517.72 1.08x BenchmarkParallel_8x8x1M-8 1697.56 1657.30 0.98x BenchmarkParallel_8x8x8M-8 1958.94 2021.81 1.03x BenchmarkParallel_8x8x32M-8 1875.11 2038.94 1.09x BenchmarkGalois128K-8 2041.59 3415.64 1.67x BenchmarkGalois1M-8 2067.98 3412.93 1.65x BenchmarkGaloisXor128K-8 2053.92 2075.33 1.01x BenchmarkGaloisXor1M-8 2070.77 2073.76 1.00x BenchmarkEncode10x2x10000-8 1037.19 1077.81 1.04x BenchmarkEncode100x20x10000-8 313.62 308.80 0.98x BenchmarkEncode17x3x1M-8 4764.54 4905.91 1.03x BenchmarkEncode10x4x16M-8 4030.21 4162.45 1.03x BenchmarkEncode5x2x1M-8 7239.93 7499.07 1.04x BenchmarkEncode10x2x1M-8 7224.58 7367.61 1.02x BenchmarkEncode10x4x1M-8 3692.97 3826.57 1.04x BenchmarkEncode50x20x1M-8 766.33 782.34 1.02x BenchmarkEncode17x3x16M-8 5307.84 5507.69 1.04x BenchmarkEncode_8x4x8M-8 3988.40 4190.72 1.05x BenchmarkEncode_12x4x12M-8 4021.79 4149.07 1.03x BenchmarkEncode_16x4x16M-8 4062.87 3886.83 0.96x BenchmarkEncode_16x4x32M-8 4009.34 4113.02 1.03x BenchmarkEncode_16x4x64M-8 3816.89 4039.51 1.06x BenchmarkEncode_8x5x8M-8 3228.09 3377.98 1.05x BenchmarkEncode_8x6x8M-8 2681.42 2675.01 1.00x BenchmarkEncode_8x7x8M-8 2301.67 2377.10 1.03x BenchmarkEncode_8x9x8M-8 1799.82 1795.15 1.00x BenchmarkEncode_8x10x8M-8 1608.45 1686.71 1.05x BenchmarkEncode_8x11x8M-8 1468.72 1545.94 1.05x BenchmarkEncode_8x8x05M-8 1778.04 1824.70 1.03x BenchmarkEncode_8x8x1M-8 1843.23 1925.05 1.04x BenchmarkEncode_8x8x8M-8 1997.52 2100.33 1.05x BenchmarkEncode_8x8x32M-8 1987.96 2107.31 1.06x BenchmarkEncode_24x8x24M-8 2031.43 2001.41 0.99x BenchmarkEncode_24x8x48M-8 1974.96 2026.32 1.03x BenchmarkVerify10x2x10000-8 964.10 965.97 1.00x BenchmarkVerify50x5x50000-8 2303.32 2327.56 1.01x BenchmarkVerify10x2x1M-8 
6192.31 6252.79 1.01x BenchmarkVerify5x2x1M-8 5254.86 5264.53 1.00x BenchmarkVerify10x4x1M-8 3125.70 3180.45 1.02x BenchmarkVerify50x20x1M-8 776.82 783.81 1.01x BenchmarkVerify10x4x16M-8 3796.17 3782.39 1.00x BenchmarkReconstruct10x2x10000-8 4045.30 4278.40 1.06x BenchmarkReconstruct50x5x50000-8 5675.45 5822.87 1.03x BenchmarkReconstruct10x2x1M-8 27049.21 28424.40 1.05x BenchmarkReconstruct5x2x1M-8 27440.02 29815.96 1.09x BenchmarkReconstruct10x4x1M-8 10076.27 10436.39 1.04x BenchmarkReconstruct50x20x1M-8 1839.15 1841.68 1.00x BenchmarkReconstruct10x4x16M-8 10598.45 11019.04 1.04x BenchmarkReconstructData10x2x10000-8 4103.60 4278.25 1.04x BenchmarkReconstructData50x5x50000-8 5780.09 5865.40 1.01x BenchmarkReconstructData10x2x1M-8 27360.79 28590.95 1.04x BenchmarkReconstructData5x2x1M-8 28549.19 30760.16 1.08x BenchmarkReconstructData10x4x1M-8 10376.42 10819.53 1.04x BenchmarkReconstructData50x20x1M-8 1853.37 1869.05 1.01x BenchmarkReconstructData10x4x16M-8 11148.51 11615.96 1.04x BenchmarkReconstructP10x2x10000-8 31068.70 32026.22 1.03x BenchmarkReconstructP10x5x20000-8 8484.08 8808.93 1.04x BenchmarkStreamEncode10x2x10000-8 11.74 19.13 1.63x BenchmarkStreamEncode100x20x10000-8 23.86 24.41 1.02x BenchmarkStreamEncode17x3x1M-8 1480.79 1261.58 0.85x BenchmarkStreamEncode10x4x16M-8 2968.74 3060.31 1.03x BenchmarkStreamEncode5x2x1M-8 984.30 1321.82 1.34x BenchmarkStreamEncode10x2x1M-8 1514.33 1591.31 1.05x BenchmarkStreamEncode10x4x1M-8 1163.01 1239.59 1.07x BenchmarkStreamEncode50x20x1M-8 560.24 557.63 1.00x BenchmarkStreamEncode17x3x16M-8 3721.28 3815.54 1.03x BenchmarkStreamVerify10x2x10000-8 12.03 19.37 1.61x BenchmarkStreamVerify50x5x50000-8 262.94 272.44 1.04x BenchmarkStreamVerify10x2x1M-8 1827.30 1926.97 1.05x BenchmarkStreamVerify5x2x1M-8 1514.08 1575.36 1.04x BenchmarkStreamVerify10x4x1M-8 1556.74 1643.25 1.06x BenchmarkStreamVerify50x20x1M-8 1756.73 1782.27 1.01x BenchmarkStreamVerify10x4x16M-8 19708.46 20030.64 1.02x ```
2020-05-03 20:38:55 +03:00
out = out[:len(in)]
for i := range in {
out[i] ^= in[i]
}
}