diff --git a/galois_amd64.go b/galois_amd64.go
index 2b63983..bf7faca 100644
--- a/galois_amd64.go
+++ b/galois_amd64.go
@@ -21,6 +21,15 @@ func galMulAVX2(low, high, in, out []byte)
 //go:noescape
 func sSE2XorSlice(in, out []byte)
 
+//go:noescape
+func galMulAVX2Xor_64(low, high, in, out []byte)
+
+//go:noescape
+func galMulAVX2_64(low, high, in, out []byte)
+
+//go:noescape
+func sSE2XorSlice_64(in, out []byte)
+
 // This is what the assembler routines do in blocks of 16 bytes:
 /*
 func galMulSSSE3(low, high, in, out []byte) {
@@ -40,17 +49,29 @@ func galMulSSSE3Xor(low, high, in, out []byte) {
 }
 */
 
+// bigSwitchover is the size where 64 bytes are processed per loop.
+const bigSwitchover = 128
+
 func galMulSlice(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover {
+			galMulAVX2_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 32 {
+			galMulAVX2(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@@ -59,16 +80,25 @@ func galMulSlice(c byte, in, out []byte, o *options) {
 }
 
 func galMulSliceXor(c byte, in, out []byte, o *options) {
-	var done int
 	if o.useAVX2 {
-		galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 5) << 5
+		if len(in) >= bigSwitchover {
+			galMulAVX2Xor_64(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 32 {
+			galMulAVX2Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
+			done := (len(in) >> 5) << 5
+			in = in[done:]
+			out = out[done:]
+		}
 	} else if o.useSSSE3 {
 		galMulSSSE3Xor(mulTableLow[c][:], mulTableHigh[c][:], in, out)
-		done = (len(in) >> 4) << 4
+		done := (len(in) >> 4) << 4
+		in = in[done:]
+		out = out[done:]
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	mt := mulTable[c][:256]
 	for i := range in {
@@ -78,13 +108,20 @@ func galMulSliceXor(c byte, in, out []byte, o *options) {
 
 // slice galois add
 func sliceXor(in, out []byte, sse2 bool) {
-	var done int
 	if sse2 {
-		sSE2XorSlice(in, out)
-		done = (len(in) >> 4) << 4
+		if len(in) >= bigSwitchover {
+			sSE2XorSlice_64(in, out)
+			done := (len(in) >> 6) << 6
+			in = in[done:]
+			out = out[done:]
+		}
+		if len(in) >= 16 {
+			sSE2XorSlice(in, out)
+			done := (len(in) >> 4) << 4
+			in = in[done:]
+			out = out[done:]
+		}
 	}
-	in = in[done:]
-	out = out[done:]
 	out = out[:len(in)]
 	for i := range in {
 		out[i] ^= in[i]
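As a reading aid for the dispatch changes above: both `_64` routines use the same split-nibble table lookup as the existing 16- and 32-byte kernels, and they only touch whole 64-byte blocks, which is why each caller strips the processed prefix and leaves the tail to the scalar loop. A byte-at-a-time sketch of the assumed semantics, in the spirit of the `galMulSSSE3` pseudocode already in galois_amd64.go (the `...Ref` names are illustrative, not part of this change):

// Sketch only: what galMulAVX2_64 computes for the (len(in)/64)*64
// bytes it processes. Each output byte is the XOR of a 16-entry table
// lookup on the low nibble and one on the high nibble.
func galMulAVX2_64Ref(low, high, in, out []byte) {
	done := (len(in) >> 6) << 6 // whole 64-byte blocks only
	for n := 0; n < done; n++ {
		l := in[n] & 0xf          // low nibble indexes the low table
		h := in[n] >> 4           // high nibble indexes the high table
		out[n] = low[l] ^ high[h] // GF(2^8) product via split tables
	}
}

// Sketch only: galMulAVX2Xor_64 is the same, but accumulates into out.
func galMulAVX2Xor_64Ref(low, high, in, out []byte) {
	done := (len(in) >> 6) << 6
	for n := 0; n < done; n++ {
		out[n] ^= low[in[n]&0xf] ^ high[in[n]>>4]
	}
}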
diff --git a/galois_amd64.s b/galois_amd64.s
index b768028..3501110 100644
--- a/galois_amd64.s
+++ b/galois_amd64.s
@@ -234,3 +234,135 @@ loopback_xor_sse2:
 
 done_xor_sse2:
 	RET
+
+// func galMulAVX2Xor_64(low, high, in, out []byte)
+TEXT ·galMulAVX2Xor_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5
+	MOVOU (SI), X6          // X6: low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // low
+	VINSERTI128  $1, X7, Y7, Y7 // high
+	VPBROADCASTB X5, Y8         // Y8: lomask (unpacked)
+
+	SHRQ  $6, R9         // len(in) / 64
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_xor_avx2_64
+
+loopback_xor_avx2_64:
+	VMOVDQU (SI), Y0
+	VMOVDQU 32(SI), Y10
+	VMOVDQU (DX), Y4
+	VMOVDQU 32(DX), Y14
+	VPSRLQ  $4, Y0, Y1    // Y1: high input
+	VPSRLQ  $4, Y10, Y11  // Y11: high input 2
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y3    // Y3: Result
+	VPXOR   Y13, Y12, Y13 // Y13: Result 2
+	VPXOR   Y4, Y3, Y4    // Y4: Result
+	VPXOR   Y14, Y13, Y14 // Y14: Result 2
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI // in+=64
+	ADDQ $64, DX // out+=64
+	SUBQ $1, R9
+	JNZ  loopback_xor_avx2_64
+
+done_xor_avx2_64:
+	VZEROUPPER
+	RET
+
+// func galMulAVX2_64(low, high, in, out []byte)
+TEXT ·galMulAVX2_64(SB), 7, $0
+	MOVQ  low+0(FP), SI     // SI: &low
+	MOVQ  high+24(FP), DX   // DX: &high
+	MOVQ  $15, BX           // BX: low mask
+	MOVQ  BX, X5
+	MOVOU (SI), X6          // X6: low
+	MOVOU (DX), X7          // X7: high
+	MOVQ  in_len+56(FP), R9 // R9: len(in)
+
+	VINSERTI128  $1, X6, Y6, Y6 // low
+	VINSERTI128  $1, X7, Y7, Y7 // high
+	VPBROADCASTB X5, Y8         // Y8: lomask (unpacked)
+
+	SHRQ  $6, R9         // len(in) / 64
+	MOVQ  out+72(FP), DX // DX: &out
+	MOVQ  in+48(FP), SI  // SI: &in
+	TESTQ R9, R9
+	JZ    done_avx2_64
+
+loopback_avx2_64:
+	VMOVDQU (SI), Y0
+	VMOVDQU 32(SI), Y10
+	VPSRLQ  $4, Y0, Y1    // Y1: high input
+	VPSRLQ  $4, Y10, Y11  // Y11: high input 2
+	VPAND   Y8, Y0, Y0    // Y0: low input
+	VPAND   Y8, Y10, Y10  // Y10: low input 2
+	VPAND   Y8, Y1, Y1    // Y1: high input
+	VPAND   Y8, Y11, Y11  // Y11: high input 2
+	VPSHUFB Y0, Y6, Y2    // Y2: mul low part
+	VPSHUFB Y10, Y6, Y12  // Y12: mul low part 2
+	VPSHUFB Y1, Y7, Y3    // Y3: mul high part
+	VPSHUFB Y11, Y7, Y13  // Y13: mul high part 2
+	VPXOR   Y3, Y2, Y4    // Y4: Result
+	VPXOR   Y13, Y12, Y14 // Y14: Result 2
+	VMOVDQU Y4, (DX)
+	VMOVDQU Y14, 32(DX)
+
+	ADDQ $64, SI // in+=64
+	ADDQ $64, DX // out+=64
+	SUBQ $1, R9
+	JNZ  loopback_avx2_64
+
+done_avx2_64:
+	VZEROUPPER
+	RET
+
+// func sSE2XorSlice_64(in, out []byte)
+TEXT ·sSE2XorSlice_64(SB), 7, $0
+	MOVQ in+0(FP), SI     // SI: &in
+	MOVQ in_len+8(FP), R9 // R9: len(in)
+	MOVQ out+24(FP), DX   // DX: &out
+	SHRQ $6, R9           // len(in) / 64
+	CMPQ R9, $0
+	JEQ  done_xor_sse2_64
+
+loopback_xor_sse2_64:
+	MOVOU (SI), X0   // in[x]
+	MOVOU 16(SI), X2 // in[x+16]
+	MOVOU 32(SI), X4 // in[x+32]
+	MOVOU 48(SI), X6 // in[x+48]
+	MOVOU (DX), X1   // out[x]
+	MOVOU 16(DX), X3 // out[x+16]
+	MOVOU 32(DX), X5 // out[x+32]
+	MOVOU 48(DX), X7 // out[x+48]
+	PXOR  X0, X1
+	PXOR  X2, X3
+	PXOR  X4, X5
+	PXOR  X6, X7
+	MOVOU X1, (DX)
+	MOVOU X3, 16(DX)
+	MOVOU X5, 32(DX)
+	MOVOU X7, 48(DX)
+	ADDQ  $64, SI // in+=64
+	ADDQ  $64, DX // out+=64
+	SUBQ  $1, R9
+	JNZ   loopback_xor_sse2_64
+
+done_xor_sse2_64:
+	RET
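The SSE2 kernel above is plain XOR unrolled over four 16-byte lanes per iteration. A minimal scalar sketch of what `sSE2XorSlice_64` is expected to do (the `Ref` name is hypothetical; the sub-64-byte tail is handled by the byte loop in `sliceXor`):

// Sketch only: scalar equivalent of sSE2XorSlice_64. Only whole
// 64-byte blocks are touched; the caller XORs any remaining tail
// byte by byte.
func sSE2XorSlice_64Ref(in, out []byte) {
	done := (len(in) >> 6) << 6
	for i := 0; i < done; i++ {
		out[i] ^= in[i]
	}
}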
diff --git a/reedsolomon.go b/reedsolomon.go
index 999fcc3..9bfbb26 100644
--- a/reedsolomon.go
+++ b/reedsolomon.go
@@ -503,8 +503,8 @@ func (r reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outpu
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
@@ -576,8 +576,8 @@ func (r reedSolomon) checkSomeShardsP(matrixRows, inputs, toCheck [][]byte, outp
 	if do < r.o.minSplitSize {
 		do = r.o.minSplitSize
 	}
-	// Make sizes divisible by 32
-	do = (do + 31) & (^31)
+	// Make sizes divisible by 64
+	do = (do + 63) & (^63)
 	start := 0
 	for start < byteCount {
 		if start+do > byteCount {
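The rounding change uses the standard power-of-two round-up: adding 63 and clearing the low six bits yields the next multiple of 64, so every per-goroutine chunk except possibly the last feeds the 64-byte assembler loops with no scalar tail. A small standalone check (values illustrative):

package main

import "fmt"

func main() {
	// (do + 63) & (^63) rounds do up to the next multiple of 64.
	for _, do := range []int{1, 63, 64, 10000} {
		fmt.Printf("%5d -> %5d\n", do, (do+63)&(^63))
	}
	// Output:
	//     1 ->    64
	//    63 ->    64
	//    64 ->    64
	// 10000 -> 10048
}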