avx2: Add 64 bytes per loop processing (#128)
* avx2: Add 64 bytes per loop processing Not super clean benchmark run, but `BenchmarkGalois` is consistently faster. ``` benchmark old ns/op new ns/op delta BenchmarkGalois128K-32 2551 2261 -11.37% BenchmarkGalois1M-32 22492 21107 -6.16% BenchmarkGaloisXor128K-32 2972 2808 -5.52% BenchmarkGaloisXor1M-32 25181 23951 -4.88% BenchmarkEncode10x2x10000-32 5081 4722 -7.07% BenchmarkEncode100x20x10000-32 383800 346655 -9.68% BenchmarkEncode17x3x1M-32 264806 263191 -0.61% BenchmarkEncode10x4x16M-32 8337857 8376910 +0.47% BenchmarkEncode5x2x1M-32 77119 73598 -4.57% BenchmarkEncode10x2x1M-32 108424 102423 -5.53% BenchmarkEncode10x4x1M-32 194427 184301 -5.21% BenchmarkEncode50x20x1M-32 3870301 3747639 -3.17% BenchmarkEncode17x3x16M-32 10617586 10602449 -0.14% BenchmarkEncode_8x4x8M-32 3227254 3229451 +0.07% BenchmarkEncode_12x4x12M-32 6841898 6847261 +0.08% BenchmarkEncode_16x4x16M-32 11153469 11048738 -0.94% BenchmarkEncode_16x4x32M-32 21947506 21826647 -0.55% BenchmarkEncode_16x4x64M-32 43163608 42971338 -0.45% BenchmarkEncode_8x5x8M-32 3856675 3780730 -1.97% BenchmarkEncode_8x6x8M-32 4322023 4437109 +2.66% BenchmarkEncode_8x7x8M-32 5011434 4959623 -1.03% BenchmarkEncode_8x9x8M-32 6243694 6098824 -2.32% BenchmarkEncode_8x10x8M-32 6724456 6657099 -1.00% BenchmarkEncode_8x11x8M-32 7207693 7340332 +1.84% BenchmarkEncode_8x8x05M-32 176877 172183 -2.65% BenchmarkEncode_8x8x1M-32 309716 301743 -2.57% BenchmarkEncode_8x8x8M-32 5498952 5489078 -0.18% BenchmarkEncode_8x8x32M-32 22630195 22557074 -0.32% BenchmarkEncode_24x8x24M-32 28488886 28220702 -0.94% BenchmarkEncode_24x8x48M-32 56124735 54862495 -2.25% BenchmarkVerify10x2x10000-32 9874 9356 -5.25% BenchmarkVerify50x5x50000-32 175610 159735 -9.04% BenchmarkVerify10x2x1M-32 331276 311726 -5.90% BenchmarkVerify5x2x1M-32 265466 248075 -6.55% BenchmarkVerify10x4x1M-32 701627 606420 -13.57% BenchmarkVerify50x20x1M-32 4338171 4245635 -2.13% BenchmarkVerify10x4x16M-32 12312830 11932698 -3.09% BenchmarkReconstruct10x2x10000-32 1594 1504 
-5.65% BenchmarkReconstruct50x5x50000-32 95101 79558 -16.34% BenchmarkReconstruct10x2x1M-32 38479 37225 -3.26% BenchmarkReconstruct5x2x1M-32 30968 30013 -3.08% BenchmarkReconstruct10x4x1M-32 81630 75350 -7.69% BenchmarkReconstruct50x20x1M-32 1136952 1040156 -8.51% BenchmarkReconstruct10x4x16M-32 685408 656484 -4.22% BenchmarkReconstructData10x2x10000-32 1609 1486 -7.64% BenchmarkReconstructData50x5x50000-32 87090 71512 -17.89% BenchmarkReconstructData10x2x1M-32 31497 30347 -3.65% BenchmarkReconstructData5x2x1M-32 23379 22611 -3.28% BenchmarkReconstructData10x4x1M-32 63853 61035 -4.41% BenchmarkReconstructData50x20x1M-32 1048807 966201 -7.88% BenchmarkReconstructData10x4x16M-32 866658 892252 +2.95% BenchmarkReconstructP10x2x10000-32 544 540 -0.74% BenchmarkReconstructP10x5x20000-32 1242 1206 -2.90% BenchmarkSplit10x4x160M-32 2735508 2743214 +0.28% BenchmarkSplit5x2x5M-32 276232 288523 +4.45% BenchmarkSplit10x2x1M-32 44389 45517 +2.54% BenchmarkSplit10x4x10M-32 477282 460888 -3.43% BenchmarkSplit50x20x50M-32 1608821 1602105 -0.42% BenchmarkSplit17x3x272M-32 2035932 2034705 -0.06% BenchmarkParallel_8x8x05M-32 346733 351837 +1.47% BenchmarkParallel_20x10x05M-32 577127 586232 +1.58% BenchmarkParallel_8x8x1M-32 722453 729294 +0.95% BenchmarkParallel_8x8x8M-32 5717650 5817130 +1.74% BenchmarkParallel_8x8x32M-32 22914260 24132696 +5.32% BenchmarkStreamEncode10x2x10000-32 6703131 7141021 +6.53% BenchmarkStreamEncode100x20x10000-32 38175873 39767386 +4.17% BenchmarkStreamEncode17x3x1M-32 8920549 9218973 +3.35% BenchmarkStreamEncode10x4x16M-32 21841702 21784898 -0.26% BenchmarkStreamEncode5x2x1M-32 4088001 3247404 -20.56% BenchmarkStreamEncode10x2x1M-32 5860652 5932381 +1.22% BenchmarkStreamEncode10x4x1M-32 7555172 7589960 +0.46% BenchmarkStreamEncode50x20x1M-32 30006814 30250054 +0.81% BenchmarkStreamEncode17x3x16M-32 32757489 32818254 +0.19% BenchmarkStreamVerify10x2x10000-32 6714996 6831093 +1.73% BenchmarkStreamVerify50x5x50000-32 18525904 18761767 +1.27% 
BenchmarkStreamVerify10x2x1M-32 5232278 5444148 +4.05% BenchmarkStreamVerify5x2x1M-32 3673843 3755283 +2.22% BenchmarkStreamVerify10x4x1M-32 7184419 7185293 +0.01% BenchmarkStreamVerify50x20x1M-32 28441187 28574766 +0.47% BenchmarkStreamVerify10x4x16M-32 8538440 8668614 +1.52% benchmark old MB/s new MB/s speedup BenchmarkGalois128K-32 51374.59 57976.36 1.13x BenchmarkGalois1M-32 46620.03 49679.10 1.07x BenchmarkGaloisXor128K-32 44106.22 46671.56 1.06x BenchmarkGaloisXor1M-32 41641.82 43779.89 1.05x BenchmarkEncode10x2x10000-32 19682.61 21176.81 1.08x BenchmarkEncode100x20x10000-32 2605.52 2884.71 1.11x BenchmarkEncode17x3x1M-32 67316.54 67729.50 1.01x BenchmarkEncode10x4x16M-32 20121.74 20027.93 1.00x BenchmarkEncode5x2x1M-32 67984.17 71236.47 1.05x BenchmarkEncode10x2x1M-32 96710.29 102377.00 1.06x BenchmarkEncode10x4x1M-32 53931.74 56894.82 1.05x BenchmarkEncode50x20x1M-32 13546.44 13989.82 1.03x BenchmarkEncode17x3x16M-32 26862.29 26900.64 1.00x BenchmarkEncode_8x4x8M-32 20794.42 20780.27 1.00x BenchmarkEncode_12x4x12M-32 22069.16 22051.88 1.00x BenchmarkEncode_16x4x16M-32 24067.44 24295.58 1.01x BenchmarkEncode_16x4x32M-32 24461.59 24597.04 1.01x BenchmarkEncode_16x4x64M-32 24876.09 24987.40 1.00x BenchmarkEncode_8x5x8M-32 17400.71 17750.24 1.02x BenchmarkEncode_8x6x8M-32 15527.19 15124.46 0.97x BenchmarkEncode_8x7x8M-32 13391.15 13531.04 1.01x BenchmarkEncode_8x9x8M-32 10748.26 11003.58 1.02x BenchmarkEncode_8x10x8M-32 9979.82 10080.80 1.01x BenchmarkEncode_8x11x8M-32 9310.73 9142.48 0.98x BenchmarkEncode_8x8x05M-32 23713.12 24359.50 1.03x BenchmarkEncode_8x8x1M-32 27084.87 27800.50 1.03x BenchmarkEncode_8x8x8M-32 12203.94 12225.89 1.00x BenchmarkEncode_8x8x32M-32 11861.83 11900.28 1.00x BenchmarkEncode_24x8x24M-32 21200.54 21402.01 1.01x BenchmarkEncode_24x8x48M-32 21522.77 22017.95 1.02x BenchmarkVerify10x2x10000-32 10127.24 10688.01 1.06x BenchmarkVerify50x5x50000-32 28472.25 31301.75 1.10x BenchmarkVerify10x2x1M-32 31652.63 33637.74 1.06x 
BenchmarkVerify5x2x1M-32 19749.74 21134.27 1.07x BenchmarkVerify10x4x1M-32 14944.92 17291.25 1.16x BenchmarkVerify50x20x1M-32 12085.46 12348.87 1.02x BenchmarkVerify10x4x16M-32 13625.80 14059.87 1.03x BenchmarkReconstruct10x2x10000-32 62723.68 66470.81 1.06x BenchmarkReconstruct50x5x50000-32 52575.87 62847.32 1.20x BenchmarkReconstruct10x2x1M-32 272507.04 281685.84 1.03x BenchmarkReconstruct5x2x1M-32 169299.03 174685.39 1.03x BenchmarkReconstruct10x4x1M-32 128455.17 139161.42 1.08x BenchmarkReconstruct50x20x1M-32 46113.48 50404.73 1.09x BenchmarkReconstruct10x4x16M-32 244777.11 255561.72 1.04x BenchmarkReconstructData10x2x10000-32 62160.46 67305.98 1.08x BenchmarkReconstructData50x5x50000-32 57411.81 69917.97 1.22x BenchmarkReconstructData10x2x1M-32 332909.82 345526.29 1.04x BenchmarkReconstructData5x2x1M-32 224254.60 231868.74 1.03x BenchmarkReconstructData10x4x1M-32 164216.61 171799.68 1.05x BenchmarkReconstructData50x20x1M-32 49988.98 54262.82 1.09x BenchmarkReconstructData10x4x16M-32 193585.15 188032.29 0.97x BenchmarkReconstructP10x2x10000-32 183806.57 185284.57 1.01x BenchmarkReconstructP10x5x20000-32 160985.46 165852.51 1.03x BenchmarkParallel_8x8x05M-32 12096.63 11921.17 0.99x BenchmarkParallel_20x10x05M-32 18168.91 17886.72 0.98x BenchmarkParallel_8x8x1M-32 11611.28 11502.36 0.99x BenchmarkParallel_8x8x8M-32 11737.14 11536.42 0.98x BenchmarkParallel_8x8x32M-32 11714.78 11123.31 0.95x BenchmarkStreamEncode10x2x10000-32 14.92 14.00 0.94x BenchmarkStreamEncode100x20x10000-32 26.19 25.15 0.96x BenchmarkStreamEncode17x3x1M-32 1998.28 1933.60 0.97x BenchmarkStreamEncode10x4x16M-32 7681.28 7701.31 1.00x BenchmarkStreamEncode5x2x1M-32 1282.50 1614.48 1.26x BenchmarkStreamEncode10x2x1M-32 1789.18 1767.55 0.99x BenchmarkStreamEncode10x4x1M-32 1387.89 1381.53 1.00x BenchmarkStreamEncode50x20x1M-32 1747.23 1733.18 0.99x BenchmarkStreamEncode17x3x16M-32 8706.79 8690.67 1.00x BenchmarkStreamVerify10x2x10000-32 14.89 14.64 0.98x BenchmarkStreamVerify50x5x50000-32 269.89 
266.50 0.99x BenchmarkStreamVerify10x2x1M-32 2004.05 1926.06 0.96x BenchmarkStreamVerify5x2x1M-32 1427.08 1396.13 0.98x BenchmarkStreamVerify10x4x1M-32 1459.51 1459.34 1.00x BenchmarkStreamVerify50x20x1M-32 1843.41 1834.79 1.00x BenchmarkStreamVerify10x4x16M-32 19649.04 19353.98 0.98x ```master
parent
abb309aca7
commit
0e9e10435f
|
@ -21,6 +21,15 @@ func galMulAVX2(low, high, in, out []byte)
|
|||
//go:noescape
|
||||
func sSE2XorSlice(in, out []byte)
|
||||
|
||||
// galMulAVX2Xor_64 is implemented in assembly (galois_amd64.s).
// It handles len(in)/64 whole 64-byte blocks and XORs the table-lookup
// result into out; the remaining len(in)%64 bytes are left for the caller.
//go:noescape
|
||||
func galMulAVX2Xor_64(low, high, in, out []byte)
|
||||
|
||||
// galMulAVX2_64 is implemented in assembly (galois_amd64.s).
// It handles len(in)/64 whole 64-byte blocks and overwrites out with the
// table-lookup result; the remaining len(in)%64 bytes are left for the caller.
//go:noescape
|
||||
func galMulAVX2_64(low, high, in, out []byte)
|
||||
|
||||
// sSE2XorSlice_64 is implemented in assembly (galois_amd64.s).
// It XORs in into out for len(in)/64 whole 64-byte blocks; the remaining
// len(in)%64 bytes are left for the caller.
//go:noescape
|
||||
func sSE2XorSlice_64(in, out []byte)
|
||||
|
||||
// This is what the assembler routines do in blocks of 16 bytes:
|
||||
/*
|
||||
func galMulSSSE3(low, high, in, out []byte) {
|
||||
|
@ -40,17 +49,29 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
|
|||
}
|
||||
*/
|
||||
|
||||
// bigSwitchover is the size where 64 bytes are processed per loop.
|
||||
const bigSwitchover = 128
|
||||
|
||||
func galMulSlice(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
if o.useAVX2 {
|
||||
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
if len(in) >= bigSwitchover {
|
||||
galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done := (len(in) >> 6) << 6
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
if len(in) > 32 {
|
||||
galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done := (len(in) >> 5) << 5
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
} else if o.useSSSE3 {
|
||||
galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
done := (len(in) >> 4) << 4
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
out = out[:len(in)]
|
||||
mt := mulTable[c][:256]
|
||||
for i := range in {
|
||||
|
@ -59,16 +80,25 @@ func galMulSlice(c byte, in, out []byte, o *options) {
|
|||
}
|
||||
|
||||
func galMulSliceXor(c byte, in, out []byte, o *options) {
|
||||
var done int
|
||||
if o.useAVX2 {
|
||||
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 5) << 5
|
||||
if len(in) >= bigSwitchover {
|
||||
galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done := (len(in) >> 6) << 6
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
if len(in) >= 32 {
|
||||
galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done := (len(in) >> 5) << 5
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
} else if o.useSSSE3 {
|
||||
galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
done := (len(in) >> 4) << 4
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
out = out[:len(in)]
|
||||
mt := mulTable[c][:256]
|
||||
for i := range in {
|
||||
|
@ -78,13 +108,20 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
|
|||
|
||||
// slice galois add
|
||||
func sliceXor(in, out []byte, sse2 bool) {
|
||||
var done int
|
||||
if sse2 {
|
||||
sSE2XorSlice(in, out)
|
||||
done = (len(in) >> 4) << 4
|
||||
if len(in) >= bigSwitchover {
|
||||
sSE2XorSlice_64(in, out)
|
||||
done := (len(in) >> 6) << 6
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
if len(in) >= 16 {
|
||||
sSE2XorSlice(in, out)
|
||||
done := (len(in) >> 4) << 4
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
}
|
||||
}
|
||||
in = in[done:]
|
||||
out = out[done:]
|
||||
out = out[:len(in)]
|
||||
for i := range in {
|
||||
out[i] ^= in[i]
|
||||
|
|
132
galois_amd64.s
132
galois_amd64.s
|
@ -234,3 +234,135 @@ loopback_xor_sse2:
|
|||
|
||||
done_xor_sse2:
|
||||
RET
|
||||
|
||||
// func galMulAVX2Xor_64(low, high, in, out []byte)
// For each of the len(in)/64 whole 64-byte blocks:
//   out[i] ^= low[in[i] & 15] ^ high[in[i] >> 4]
// with VPSHUFB acting as a 16-entry byte lookup. Two 32-byte halves are
// processed per iteration. The trailing len(in)%64 bytes are not processed.
// Only len(in) is read; len(out) is never checked (presumably the caller
// guarantees len(out) >= len(in) - TODO confirm against callers).
|
||||
TEXT ·galMulAVX2Xor_64(SB), 7, $0
|
||||
MOVQ low+0(FP), SI // SI: &low
|
||||
MOVQ high+24(FP), DX // DX: &high
|
||||
MOVQ $15, BX // BX: low nibble mask (0x0f)
|
||||
MOVQ BX, X5 // X5: mask in the lowest byte
|
||||
MOVOU (SI), X6 // X6: 16 bytes loaded from low
|
||||
MOVOU (DX), X7 // X7: 16 bytes loaded from high
|
||||
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||
|
||||
VINSERTI128 $1, X6, Y6, Y6 // Y6: low table copied into the upper 128-bit lane too
|
||||
VINSERTI128 $1, X7, Y7, Y7 // Y7: high table copied into the upper 128-bit lane too
|
||||
VPBROADCASTB X5, Y8 // Y8: lomask (0x0f in every byte)
|
||||
|
||||
SHRQ $6, R9 // R9: number of 64-byte blocks (len(in) / 64)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
MOVQ in+48(FP), SI // SI: &in
|
||||
TESTQ R9, R9
|
||||
JZ done_xor_avx2_64 // fewer than 64 bytes: nothing to do
|
||||
|
||||
loopback_xor_avx2_64:
|
||||
VMOVDQU (SI), Y0 // Y0: in[0:32]
|
||||
VMOVDQU 32(SI), Y10 // Y10: in[32:64]
|
||||
VMOVDQU (DX), Y4 // Y4: out[0:32]
|
||||
VMOVDQU 32(DX), Y14 // Y14: out[32:64]
|
||||
VPSRLQ $4, Y0, Y1 // Y1: high input (shifted per 64-bit lane, masked below)
|
||||
VPSRLQ $4, Y10, Y11 // Y11: high input 2 (shifted per 64-bit lane, masked below)
|
||||
VPAND Y8, Y0, Y0 // Y0: low input
|
||||
VPAND Y8, Y10, Y10 // Y10: low input 2
|
||||
VPAND Y8, Y1, Y1 // Y1: high input
|
||||
VPAND Y8, Y11, Y11 // Y11: high input 2
|
||||
VPSHUFB Y0, Y6, Y2 // Y2: mul low part
|
||||
VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2
|
||||
VPSHUFB Y1, Y7, Y3 // Y3: mul high part
|
||||
VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2
|
||||
VPXOR Y3, Y2, Y3 // Y3: Result (low part ^ high part)
|
||||
VPXOR Y13, Y12, Y13 // Y13: Result 2 (low part ^ high part)
|
||||
VPXOR Y4, Y3, Y4 // Y4: Result XORed with previous out[0:32]
|
||||
VPXOR Y14, Y13, Y14 // Y14: Result 2 XORed with previous out[32:64]
|
||||
VMOVDQU Y4, (DX)
|
||||
VMOVDQU Y14, 32(DX)
|
||||
|
||||
ADDQ $64, SI // in+=64
|
||||
ADDQ $64, DX // out+=64
|
||||
SUBQ $1, R9 // one 64-byte block done
|
||||
JNZ loopback_xor_avx2_64
|
||||
|
||||
done_xor_avx2_64:
|
||||
VZEROUPPER // clear upper YMM halves before returning
|
||||
RET
|
||||
|
||||
// func galMulAVX2_64(low, high, in, out []byte)
// For each of the len(in)/64 whole 64-byte blocks:
//   out[i] = low[in[i] & 15] ^ high[in[i] >> 4]
// with VPSHUFB acting as a 16-entry byte lookup. Two 32-byte halves are
// processed per iteration; out is overwritten, its previous contents are
// not read. The trailing len(in)%64 bytes are not processed.
// Only len(in) is read; len(out) is never checked (presumably the caller
// guarantees len(out) >= len(in) - TODO confirm against callers).
|
||||
TEXT ·galMulAVX2_64(SB), 7, $0
|
||||
MOVQ low+0(FP), SI // SI: &low
|
||||
MOVQ high+24(FP), DX // DX: &high
|
||||
MOVQ $15, BX // BX: low nibble mask (0x0f)
|
||||
MOVQ BX, X5 // X5: mask in the lowest byte
|
||||
MOVOU (SI), X6 // X6: 16 bytes loaded from low
|
||||
MOVOU (DX), X7 // X7: 16 bytes loaded from high
|
||||
MOVQ in_len+56(FP), R9 // R9: len(in)
|
||||
|
||||
VINSERTI128 $1, X6, Y6, Y6 // Y6: low table copied into the upper 128-bit lane too
|
||||
VINSERTI128 $1, X7, Y7, Y7 // Y7: high table copied into the upper 128-bit lane too
|
||||
VPBROADCASTB X5, Y8 // Y8: lomask (0x0f in every byte)
|
||||
|
||||
SHRQ $6, R9 // R9: number of 64-byte blocks (len(in) / 64)
|
||||
MOVQ out+72(FP), DX // DX: &out
|
||||
MOVQ in+48(FP), SI // SI: &in
|
||||
TESTQ R9, R9
|
||||
JZ done_avx2_64 // fewer than 64 bytes: nothing to do
|
||||
|
||||
loopback_avx2_64:
|
||||
VMOVDQU (SI), Y0 // Y0: in[0:32]
|
||||
VMOVDQU 32(SI), Y10 // Y10: in[32:64]
|
||||
VPSRLQ $4, Y0, Y1 // Y1: high input (shifted per 64-bit lane, masked below)
|
||||
VPSRLQ $4, Y10, Y11 // Y11: high input 2 (shifted per 64-bit lane, masked below)
|
||||
VPAND Y8, Y0, Y0 // Y0: low input
|
||||
VPAND Y8, Y10, Y10 // Y10: low input 2
|
||||
VPAND Y8, Y1, Y1 // Y1: high input
|
||||
VPAND Y8, Y11, Y11 // Y11: high input 2
|
||||
VPSHUFB Y0, Y6, Y2 // Y2: mul low part
|
||||
VPSHUFB Y10, Y6, Y12 // Y12: mul low part 2
|
||||
VPSHUFB Y1, Y7, Y3 // Y3: mul high part
|
||||
VPSHUFB Y11, Y7, Y13 // Y13: mul high part 2
|
||||
VPXOR Y3, Y2, Y4 // Y4: Result (low part ^ high part)
|
||||
VPXOR Y13, Y12, Y14 // Y14: Result 2 (low part ^ high part)
|
||||
VMOVDQU Y4, (DX)
|
||||
VMOVDQU Y14, 32(DX)
|
||||
|
||||
ADDQ $64, SI // in+=64
|
||||
ADDQ $64, DX // out+=64
|
||||
SUBQ $1, R9 // one 64-byte block done
|
||||
JNZ loopback_avx2_64
|
||||
|
||||
done_avx2_64:
|
||||
VZEROUPPER // clear upper YMM halves before returning
|
||||
RET
|
||||
|
||||
// func sSE2XorSlice_64(in, out []byte)
// For each of the len(in)/64 whole 64-byte blocks: out[i] ^= in[i], done as
// four 16-byte XMM XORs per iteration. The trailing len(in)%64 bytes are not
// processed. Only len(in) is read; len(out) is never checked (presumably the
// caller guarantees len(out) >= len(in) - TODO confirm against callers).
|
||||
TEXT ·sSE2XorSlice_64(SB), 7, $0
|
||||
MOVQ in+0(FP), SI // SI: &in
|
||||
MOVQ in_len+8(FP), R9 // R9: len(in)
|
||||
MOVQ out+24(FP), DX // DX: &out
|
||||
SHRQ $6, R9 // R9: number of 64-byte blocks (len(in) / 64)
|
||||
CMPQ R9, $0
|
||||
JEQ done_xor_sse2_64 // fewer than 64 bytes: nothing to do
|
||||
|
||||
loopback_xor_sse2_64:
|
||||
MOVOU (SI), X0 // X0: in[0:16]
|
||||
MOVOU 16(SI), X2 // X2: in[16:32]
|
||||
MOVOU 32(SI), X4 // X4: in[32:48]
|
||||
MOVOU 48(SI), X6 // X6: in[48:64]
|
||||
MOVOU (DX), X1 // X1: out[0:16]
|
||||
MOVOU 16(DX), X3 // X3: out[16:32]
|
||||
MOVOU 32(DX), X5 // X5: out[32:48]
|
||||
MOVOU 48(DX), X7 // X7: out[48:64]
|
||||
PXOR X0, X1 // X1: out[0:16] ^ in[0:16]
|
||||
PXOR X2, X3 // X3: out[16:32] ^ in[16:32]
|
||||
PXOR X4, X5 // X5: out[32:48] ^ in[32:48]
|
||||
PXOR X6, X7 // X7: out[48:64] ^ in[48:64]
|
||||
MOVOU X1, (DX)
|
||||
MOVOU X3, 16(DX)
|
||||
MOVOU X5, 32(DX)
|
||||
MOVOU X7, 48(DX)
|
||||
ADDQ $64, SI // in+=64
|
||||
ADDQ $64, DX // out+=64
|
||||
SUBQ $1, R9 // one 64-byte block done
|
||||
JNZ loopback_xor_sse2_64
|
||||
|
||||
done_xor_sse2_64:
|
||||
RET
|
||||
|
|
|
@ -503,8 +503,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
|
|||
if do < r.o.minSplitSize {
|
||||
do = r.o.minSplitSize
|
||||
}
|
||||
// Make sizes divisible by 32
|
||||
do = (do + 31) & (^31)
|
||||
// Make sizes divisible by 64
|
||||
do = (do + 63) & (^63)
|
||||
start := 0
|
||||
for start < byteCount {
|
||||
if start+do > byteCount {
|
||||
|
@ -576,8 +576,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
|
|||
if do < r.o.minSplitSize {
|
||||
do = r.o.minSplitSize
|
||||
}
|
||||
// Make sizes divisible by 32
|
||||
do = (do + 31) & (^31)
|
||||
// Make sizes divisible by 64
|
||||
do = (do + 63) & (^63)
|
||||
start := 0
|
||||
for start < byteCount {
|
||||
if start+do > byteCount {
|
||||
|
|
Loading…
Reference in New Issue