reedsolomon-go/galois_amd64.s

237 lines
6.4 KiB
ArmAsm
Raw Normal View History

//+build !noasm !appengine !gccgo
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
// Copyright 2015, Klaus Post, see LICENSE for details.
// Based on http://www.snia.org/sites/default/files2/SDC2013/presentations/NewThinking/EthanMiller_Screaming_Fast_Galois_Field%20Arithmetic_SIMD%20Instructions.pdf
// and http://jerasure.org/jerasure/gf-complete/tree/master
// func galMulSSSE3Xor(low, high, in, out []byte)
//
// For every whole 16-byte block of in, looks up each input byte b as
// low[b & 0x0f] ^ high[b >> 4] (two PSHUFB table lookups) and XORs the
// result into the corresponding 16 bytes of out.
//
// Contract visible from the code:
//   - 16 bytes are read from the start of low and of high.
//   - len(in)/16 blocks are processed; the trailing len(in)%16 bytes of in
//     are not read and the matching bytes of out are not written.
//   - len(out) is never inspected; the caller must make sure out covers
//     every processed block.
//
// Register roles:
//   SI: cursor into in      DX: cursor into out     R9: blocks remaining
//   X6: low table           X7: high table          X8: 0x0f in every byte
//   X0-X4: per-block scratch, X5: zero, AX/BX: setup scratch
//
// The TEXT flag 7 is the numeric form of NOPROF|DUPOK|NOSPLIT (textflag.h).
TEXT ·galMulSSSE3Xor(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low[0]
	MOVQ   high+24(FP), DX   // DX: &high[0]
	MOVOU  (SI), X6          // X6: low table
	MOVOU  (DX), X7          // X7: high table
	MOVQ   $15, BX           // BX: nibble mask 0x0f
	MOVQ   BX, X8            // X8 byte 0 = 0x0f
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in[0]
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out[0]
	PSHUFB X5, X8            // X8: byte 0 broadcast -> 0x0f in every byte
	SHRQ   $4, R9            // R9: len(in) / 16 = number of blocks
	TESTQ  R9, R9
	JZ     done_xor          // nothing to do for fewer than 16 bytes

	// Take the aligned loop only when both in and out are 16-byte aligned:
	// (in | out) & 15 is zero exactly when neither pointer has low bits set.
	MOVQ   SI, AX
	ORQ    DX, AX
	TESTQ  $15, AX
	JNZ    loopback_xor

loopback_xor_aligned:
	MOVOA  (SI), X0          // X0: in[x]
	MOVOA  (DX), X4          // X4: out[x]
	MOVOA  X0, X1            // X1: in[x] (copy for the high nibbles)
	MOVOA  X6, X2            // X2: low table copy (PSHUFB overwrites it)
	MOVOA  X7, X3            // X3: high table copy (PSHUFB overwrites it)
	PSRLQ  $4, X1            // X1: high nibbles moved down (plus neighbour bits)
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[low nibble]
	PSHUFB X1, X3            // X3: high[high nibble]
	PXOR   X2, X3            // X3: table result
	PXOR   X4, X3            // X3: table result xor existing out
	MOVOA  X3, (DX)          // store to out[x]
	ADDQ   $16, SI           // in += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9
	JNZ    loopback_xor_aligned
	JMP    done_xor

loopback_xor:
	MOVOU  (SI), X0          // X0: in[x]
	MOVOU  (DX), X4          // X4: out[x]
	MOVOU  X0, X1            // X1: in[x] (copy for the high nibbles)
	MOVOU  X6, X2            // X2: low table copy
	MOVOU  X7, X3            // X3: high table copy
	PSRLQ  $4, X1            // X1: high nibbles moved down (plus neighbour bits)
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[low nibble]
	PSHUFB X1, X3            // X3: high[high nibble]
	PXOR   X2, X3            // X3: table result
	PXOR   X4, X3            // X3: table result xor existing out
	MOVOU  X3, (DX)          // store to out[x]
	ADDQ   $16, SI           // in += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9
	JNZ    loopback_xor

done_xor:
	RET
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
// func galMulSSSE3(low, high, in, out []byte)
//
// For every whole 16-byte block of in, looks up each input byte b as
// low[b & 0x0f] ^ high[b >> 4] (two PSHUFB table lookups) and stores the
// result to the corresponding 16 bytes of out, overwriting what was there.
//
// Contract visible from the code:
//   - 16 bytes are read from the start of low and of high.
//   - len(in)/16 blocks are processed; the trailing len(in)%16 bytes of in
//     are not read and the matching bytes of out are not written.
//   - len(out) is never inspected; the caller must make sure out covers
//     every processed block.
//
// Register roles:
//   SI: cursor into in      DX: cursor into out     R9: blocks remaining
//   X6: low table           X7: high table          X8: 0x0f in every byte
//   X0-X3: per-block scratch, X5: zero, AX/BX: setup scratch
//
// The TEXT flag 7 is the numeric form of NOPROF|DUPOK|NOSPLIT (textflag.h).
TEXT ·galMulSSSE3(SB), 7, $0
	MOVQ   low+0(FP), SI     // SI: &low[0]
	MOVQ   high+24(FP), DX   // DX: &high[0]
	MOVOU  (SI), X6          // X6: low table
	MOVOU  (DX), X7          // X7: high table
	MOVQ   $15, BX           // BX: nibble mask 0x0f
	MOVQ   BX, X8            // X8 byte 0 = 0x0f
	PXOR   X5, X5            // X5: all-zero shuffle control
	MOVQ   in+48(FP), SI     // SI: &in[0]
	MOVQ   in_len+56(FP), R9 // R9: len(in)
	MOVQ   out+72(FP), DX    // DX: &out[0]
	PSHUFB X5, X8            // X8: byte 0 broadcast -> 0x0f in every byte
	SHRQ   $4, R9            // R9: len(in) / 16 = number of blocks
	TESTQ  R9, R9
	JZ     done              // nothing to do for fewer than 16 bytes

	// Take the aligned loop only when both in and out are 16-byte aligned:
	// (in | out) & 15 is zero exactly when neither pointer has low bits set.
	MOVQ   SI, AX
	ORQ    DX, AX
	TESTQ  $15, AX
	JNZ    loopback

loopback_aligned:
	MOVOA  (SI), X0          // X0: in[x]
	MOVOA  X0, X1            // X1: in[x] (copy for the high nibbles)
	MOVOA  X6, X2            // X2: low table copy (PSHUFB overwrites it)
	MOVOA  X7, X3            // X3: high table copy (PSHUFB overwrites it)
	PSRLQ  $4, X1            // X1: high nibbles moved down (plus neighbour bits)
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[low nibble]
	PSHUFB X1, X3            // X3: high[high nibble]
	PXOR   X2, X3            // X3: result
	MOVOA  X3, (DX)          // store to out[x]
	ADDQ   $16, SI           // in += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9
	JNZ    loopback_aligned
	JMP    done

loopback:
	MOVOU  (SI), X0          // X0: in[x]
	MOVOU  X0, X1            // X1: in[x] (copy for the high nibbles)
	MOVOU  X6, X2            // X2: low table copy
	MOVOU  X7, X3            // X3: high table copy
	PSRLQ  $4, X1            // X1: high nibbles moved down (plus neighbour bits)
	PAND   X8, X0            // X0: low nibble of every input byte
	PAND   X8, X1            // X1: high nibble of every input byte
	PSHUFB X0, X2            // X2: low[low nibble]
	PSHUFB X1, X3            // X3: high[high nibble]
	PXOR   X2, X3            // X3: result
	MOVOU  X3, (DX)          // store to out[x]
	ADDQ   $16, SI           // in += 16
	ADDQ   $16, DX           // out += 16
	SUBQ   $1, R9
	JNZ    loopback

done:
	RET
Add AMD64 SSE3 Galois multiplication. Approximately 5-10x faster. BenchmarkEncode10x2x10000 333.31 5827.17 17.48x BenchmarkEncode10x2x10000-2 431.20 2802.53 6.50x BenchmarkEncode10x2x10000-4 553.98 2432.95 4.39x BenchmarkEncode10x2x10000-8 585.79 3469.61 5.92x BenchmarkEncode100x20x10000 32.59 583.40 17.90x BenchmarkEncode100x20x10000-2 59.52 726.70 12.21x BenchmarkEncode100x20x10000-4 108.04 1363.25 12.62x BenchmarkEncode100x20x10000-8 113.76 1274.62 11.20x BenchmarkEncode17x3x1M 215.28 3141.85 14.59x BenchmarkEncode17x3x1M-2 398.76 3650.12 9.15x BenchmarkEncode17x3x1M-4 655.32 6071.11 9.26x BenchmarkEncode17x3x1M-8 832.16 6616.47 7.95x BenchmarkEncode10x4x16M 154.48 1357.30 8.79x BenchmarkEncode10x4x16M-2 295.62 2377.92 8.04x BenchmarkEncode10x4x16M-4 529.89 3519.49 6.64x BenchmarkEncode10x4x16M-8 632.11 4521.90 7.15x BenchmarkEncode5x2x1M 327.87 4879.09 14.88x BenchmarkEncode5x2x1M-2 576.11 2599.20 4.51x BenchmarkEncode5x2x1M-4 1043.65 3559.12 3.41x BenchmarkEncode5x2x1M-8 1227.77 4255.34 3.47x BenchmarkEncode10x2x1M 321.24 4574.68 14.24x BenchmarkEncode10x2x1M-2 587.73 3100.28 5.28x BenchmarkEncode10x2x1M-4 1101.96 4770.32 4.33x BenchmarkEncode10x2x1M-8 1217.08 5812.17 4.78x BenchmarkEncode10x4x1M 155.34 2037.27 13.11x BenchmarkEncode10x4x1M-2 298.38 2470.97 8.28x BenchmarkEncode10x4x1M-4 548.67 3603.15 6.57x BenchmarkEncode10x4x1M-8 625.23 4827.42 7.72x BenchmarkEncode50x20x1M 31.37 347.65 11.08x BenchmarkEncode50x20x1M-2 59.81 713.28 11.93x BenchmarkEncode50x20x1M-4 105.34 1175.47 11.16x BenchmarkEncode50x20x1M-8 123.84 1491.91 12.05x BenchmarkEncode17x3x16M 209.55 1861.59 8.88x BenchmarkEncode17x3x16M-2 394.19 3331.73 8.45x BenchmarkEncode17x3x16M-4 643.30 4942.74 7.68x BenchmarkEncode17x3x16M-8 839.64 6213.43 7.40x
2015-06-21 22:23:22 +03:00
Add AVX2 assembler functions. Benchmarks on a VM (therefore a bit more noisy) benchmark old ns/op new ns/op delta BenchmarkEncode10x2x10000-8 58372 47421 -18.76% BenchmarkEncode100x20x10000-8 2635444 1550511 -41.17% BenchmarkEncode17x3x1M-8 3885495 2231034 -42.58% BenchmarkEncode10x4x16M-8 24180221 21467661 -11.22% BenchmarkEncode5x2x1M-8 2395287 2261452 -5.59% BenchmarkEncode10x2x1M-8 2571278 2566560 -0.18% BenchmarkEncode10x4x1M-8 3396774 3431916 +1.03% BenchmarkEncode50x20x1M-8 27004601 20325731 -24.73% BenchmarkEncode17x3x16M-8 29671393 23668596 -20.23% BenchmarkVerify10x2x10000-8 109730 101519 -7.48% BenchmarkVerify50x5x50000-8 3904166 3101568 -20.56% BenchmarkVerify10x2x1M-8 4398490 4721719 +7.35% BenchmarkVerify5x2x1M-8 3174574 3296440 +3.84% BenchmarkVerify10x4x1M-8 5247394 5346667 +1.89% BenchmarkVerify50x20x1M-8 35742777 26154681 -26.83% BenchmarkVerify10x4x16M-8 52873512 54931253 +3.89% benchmark old MB/s new MB/s speedup BenchmarkEncode10x2x10000-8 1713.14 2108.73 1.23x BenchmarkEncode100x20x10000-8 379.44 644.95 1.70x BenchmarkEncode17x3x1M-8 4587.78 7989.92 1.74x BenchmarkEncode10x4x16M-8 6938.40 7815.11 1.13x BenchmarkEncode5x2x1M-8 2188.83 2318.37 1.06x BenchmarkEncode10x2x1M-8 4078.03 4085.53 1.00x BenchmarkEncode10x4x1M-8 3086.98 3055.37 0.99x BenchmarkEncode50x20x1M-8 1941.48 2579.43 1.33x BenchmarkEncode17x3x16M-8 9612.38 12050.26 1.25x BenchmarkVerify10x2x10000-8 911.32 985.03 1.08x BenchmarkVerify50x5x50000-8 1280.68 1612.09 1.26x BenchmarkVerify10x2x1M-8 2383.94 2220.75 0.93x BenchmarkVerify5x2x1M-8 1651.52 1590.47 0.96x BenchmarkVerify10x4x1M-8 1998.28 1961.18 0.98x BenchmarkVerify50x20x1M-8 1466.84 2004.57 1.37x BenchmarkVerify10x4x16M-8 3173.09 3054.22 0.96x
2015-12-07 15:40:57 +03:00
// func galMulAVX2Xor(low, high, in, out []byte)
//
// For every whole 32-byte block of in, looks up each input byte b as
// low[b & 0x0f] ^ high[b >> 4] (two VPSHUFB table lookups) and XORs the
// result into the corresponding 32 bytes of out.
//
// Contract visible from the code:
//   - 16 bytes are read from the start of low and of high.
//   - len(in)/32 blocks are processed; the trailing len(in)%32 bytes of in
//     are not read and the matching bytes of out are not written.
//   - len(out) is never inspected; the caller must make sure out covers
//     every processed block.
//   - All loads and stores are unaligned (VMOVDQU); no alignment is required.
//
// Register roles:
//   SI: cursor into in      DX: cursor into out     R9: blocks remaining
//   Y6: low table in both 128-bit lanes
//   Y7: high table in both 128-bit lanes
//   Y8: 0x0f in every byte
//   Y0-Y4: per-block scratch, BX/X5: setup scratch
//
// The TEXT flag 7 is the numeric form of NOPROF|DUPOK|NOSPLIT (textflag.h).
TEXT ·galMulAVX2Xor(SB), 7, $0
	MOVQ         low+0(FP), SI     // SI: &low[0]
	MOVQ         high+24(FP), DX   // DX: &high[0]
	MOVQ         $15, BX           // BX: nibble mask 0x0f
	MOVQ         BX, X5            // X5 byte 0 = 0x0f
	MOVOU        (SI), X6          // X6: low table
	MOVOU        (DX), X7          // X7: high table
	MOVQ         in_len+56(FP), R9 // R9: len(in)
	VINSERTI128  $1, X6, Y6, Y6    // Y6: low table copied into the upper lane too
	VINSERTI128  $1, X7, Y7, Y7    // Y7: high table copied into the upper lane too
	VPBROADCASTB X5, Y8            // Y8: 0x0f in every byte
	SHRQ         $5, R9            // R9: len(in) / 32 = number of blocks
	MOVQ         out+72(FP), DX    // DX: &out[0]
	MOVQ         in+48(FP), SI     // SI: &in[0]
	TESTQ        R9, R9
	JZ           done_xor_avx2     // nothing to do for fewer than 32 bytes

loopback_xor_avx2:
	VMOVDQU      (SI), Y0          // Y0: in[x]
	VMOVDQU      (DX), Y4          // Y4: out[x]
	VPSRLQ       $4, Y0, Y1        // Y1: high nibbles moved down (plus neighbour bits)
	VPAND        Y8, Y0, Y0        // Y0: low nibble of every input byte
	VPAND        Y8, Y1, Y1        // Y1: high nibble of every input byte
	VPSHUFB      Y0, Y6, Y2        // Y2: low[low nibble]
	VPSHUFB      Y1, Y7, Y3        // Y3: high[high nibble]
	VPXOR        Y3, Y2, Y3        // Y3: table result
	VPXOR        Y4, Y3, Y4        // Y4: table result xor existing out
	VMOVDQU      Y4, (DX)          // store to out[x]
	ADDQ         $32, SI           // in += 32
	ADDQ         $32, DX           // out += 32
	SUBQ         $1, R9
	JNZ          loopback_xor_avx2

done_xor_avx2:
	VZEROUPPER                     // clear upper YMM halves before returning to non-AVX code
	RET
Add AVX2 assembler functions. Benchmarks on a VM (therefore a bit more noisy) benchmark old ns/op new ns/op delta BenchmarkEncode10x2x10000-8 58372 47421 -18.76% BenchmarkEncode100x20x10000-8 2635444 1550511 -41.17% BenchmarkEncode17x3x1M-8 3885495 2231034 -42.58% BenchmarkEncode10x4x16M-8 24180221 21467661 -11.22% BenchmarkEncode5x2x1M-8 2395287 2261452 -5.59% BenchmarkEncode10x2x1M-8 2571278 2566560 -0.18% BenchmarkEncode10x4x1M-8 3396774 3431916 +1.03% BenchmarkEncode50x20x1M-8 27004601 20325731 -24.73% BenchmarkEncode17x3x16M-8 29671393 23668596 -20.23% BenchmarkVerify10x2x10000-8 109730 101519 -7.48% BenchmarkVerify50x5x50000-8 3904166 3101568 -20.56% BenchmarkVerify10x2x1M-8 4398490 4721719 +7.35% BenchmarkVerify5x2x1M-8 3174574 3296440 +3.84% BenchmarkVerify10x4x1M-8 5247394 5346667 +1.89% BenchmarkVerify50x20x1M-8 35742777 26154681 -26.83% BenchmarkVerify10x4x16M-8 52873512 54931253 +3.89% benchmark old MB/s new MB/s speedup BenchmarkEncode10x2x10000-8 1713.14 2108.73 1.23x BenchmarkEncode100x20x10000-8 379.44 644.95 1.70x BenchmarkEncode17x3x1M-8 4587.78 7989.92 1.74x BenchmarkEncode10x4x16M-8 6938.40 7815.11 1.13x BenchmarkEncode5x2x1M-8 2188.83 2318.37 1.06x BenchmarkEncode10x2x1M-8 4078.03 4085.53 1.00x BenchmarkEncode10x4x1M-8 3086.98 3055.37 0.99x BenchmarkEncode50x20x1M-8 1941.48 2579.43 1.33x BenchmarkEncode17x3x16M-8 9612.38 12050.26 1.25x BenchmarkVerify10x2x10000-8 911.32 985.03 1.08x BenchmarkVerify50x5x50000-8 1280.68 1612.09 1.26x BenchmarkVerify10x2x1M-8 2383.94 2220.75 0.93x BenchmarkVerify5x2x1M-8 1651.52 1590.47 0.96x BenchmarkVerify10x4x1M-8 1998.28 1961.18 0.98x BenchmarkVerify50x20x1M-8 1466.84 2004.57 1.37x BenchmarkVerify10x4x16M-8 3173.09 3054.22 0.96x
2015-12-07 15:40:57 +03:00
// func galMulAVX2(low, high, in, out []byte)
//
// For every complete 32-byte block of in, computes
//
//	out[i] = low[in[i] & 0x0f] ^ high[in[i] >> 4]
//
// and stores the result to out, overwriting what was there.
//
// low and high are read as 16-byte lookup tables (exactly 16 bytes are loaded
// from each data pointer; their lengths are not inspected).
// Only len(in)/32 blocks are processed: the trailing len(in)%32 bytes of in are
// not read and the matching bytes of out are not written.
// len(out) is never read, so no bounds checking is done against out here.
//
// Text flag 7 is NOPROF|DUPOK|NOSPLIT; the function uses no local stack frame.
// Registers written: BX, DX, SI, R9, X5, Y0-Y4, Y6-Y8.
TEXT ·galMulAVX2(SB), 7, $0
	MOVQ  low+0(FP), SI     // SI: &low
	MOVQ  high+24(FP), DX   // DX: &high
	MOVQ  $15, BX           // BX: low mask
	MOVQ  BX, X5
	MOVOU (SI), X6          // X6: low
	MOVOU (DX), X7          // X7: high
	MOVQ  in_len+56(FP), R9 // R9: len(in)

	// VPSHUFB looks up within each 128-bit lane separately, so the 16-byte
	// tables are copied into the upper lane as well.
	VINSERTI128  $1, X6, Y6, Y6 // low
	VINSERTI128  $1, X7, Y7, Y7 // high
	VPBROADCASTB X5, Y8         // Y8: lomask (unpacked)

	SHRQ  $5, R9         // len(in) / 32
	MOVQ  out+72(FP), DX // DX: &out
	MOVQ  in+48(FP), SI  // SI: &in
	TESTQ R9, R9
	JZ    done_avx2      // fewer than 32 input bytes: nothing to do

loopback_avx2:
	VMOVDQU (SI), Y0
	VPSRLQ  $4, Y0, Y1 // Y1: high input (64-bit shift; stray bits masked off below)
	VPAND   Y8, Y0, Y0 // Y0: low input
	VPAND   Y8, Y1, Y1 // Y1: high input
	VPSHUFB Y0, Y6, Y2 // Y2: mul low part
	VPSHUFB Y1, Y7, Y3 // Y3: mul high part
	VPXOR   Y3, Y2, Y4 // Y4: Result
	VMOVDQU Y4, (DX)

	ADDQ $32, SI // in+=32
	ADDQ $32, DX // out+=32
	SUBQ $1, R9
	JNZ  loopback_avx2

done_avx2:
	VZEROUPPER // clear upper YMM halves before returning to non-AVX code
	RET

// func sSE2XorSlice(in, out []byte)
//
// XORs in into out in place, 16 bytes at a time:
//
//	out[i] ^= in[i]
//
// Only len(in)/16 complete blocks are processed: the trailing len(in)%16 bytes
// are neither read from in nor modified in out.
// len(out) is never read, so no bounds checking is done against out here.
// Unaligned loads and stores (MOVOU) are used, so no alignment is required.
//
// Text flag 7 is NOPROF|DUPOK|NOSPLIT; the function uses no local stack frame.
// Registers written: DX, SI, R9, X0, X1.
TEXT ·sSE2XorSlice(SB), 7, $0
	MOVQ  in+0(FP), SI     // SI: &in
	MOVQ  in_len+8(FP), R9 // R9: len(in)
	MOVQ  out+24(FP), DX   // DX: &out
	SHRQ  $4, R9           // len(in) / 16
	TESTQ R9, R9
	JZ    done_xor_sse2    // fewer than 16 input bytes: nothing to do

loopback_xor_sse2:
	MOVOU (SI), X0 // in[x]
	MOVOU (DX), X1 // out[x]
	PXOR  X0, X1
	MOVOU X1, (DX)
	ADDQ  $16, SI  // in+=16
	ADDQ  $16, DX  // out+=16
	SUBQ  $1, R9
	JNZ   loopback_xor_sse2

done_xor_sse2:
	RET