From 3a82d28edb9c541355386d595ac4e14d158f9d2c Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Tue, 26 Jul 2022 03:37:28 -0700 Subject: [PATCH] Add GF16 AVX2, AVX512 and SSSE3 (#193) * Add GF16 AVX2 * Add SSSE3 fallback. * Fix reconstruction was skipped if first shard was empty. * Combine lookups in pure Go * Faster xor on pure Go. * Add 4way butterfly AVX2. * Add fftDIT4 avx2. Add avx512 version. Add noescape. * Remove +build space. Do size varied 800x200 bench. * Use VPTERNLOGD for avx512. * Remove refMulAdd inner loop bounds checks. ~10-20% faster --- _gen/gen.go | 3 +- _gen/gf16.go | 846 +++++++++++++++++++++++++++++ galois.go | 23 + galois_amd64.go | 112 ++++ galois_arm64.go | 30 ++ galois_gen_amd64.go | 30 ++ galois_gen_amd64.s | 1240 ++++++++++++++++++++++++++++++++++++++++++- galois_noasm.go | 49 +- galois_ppc64le.go | 28 + leopard.go | 157 +++--- reedsolomon_test.go | 49 +- 11 files changed, 2483 insertions(+), 84 deletions(-) create mode 100644 _gen/gf16.go diff --git a/_gen/gen.go b/_gen/gen.go index 50b8043..4c0a696 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -1,7 +1,7 @@ //go:build generate // +build generate -//go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon +//go:generate go run -tags=generate . -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon //go:generate go fmt ../galois_gen_switch_amd64.go //go:generate go fmt ../galois_gen_amd64.go //go:generate go run cleanup.go ../galois_gen_amd64.s @@ -129,6 +129,7 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } `) + genGF16() Generate() } diff --git a/_gen/gf16.go b/_gen/gf16.go new file mode 100644 index 0000000..d294898 --- /dev/null +++ b/_gen/gf16.go @@ -0,0 +1,846 @@ +//go:build generate +// +build generate + +package main + +import ( + "fmt" + + "github.com/mmcloughlin/avo/attr" + . "github.com/mmcloughlin/avo/build" + . 
"github.com/mmcloughlin/avo/operand" + "github.com/mmcloughlin/avo/reg" +) + +type table256 struct { + Lo, Hi Op + loadLo128, loadHi128 *Mem + loadLo256, loadHi256 *Mem + useZmmLo, useZmmHi *reg.VecPhysical +} + +func (t *table256) prepare() { + t.prepareLo() + t.prepareHi() +} + +func (t *table256) prepareHi() { + if t.loadHi128 != nil { + t.Hi = YMM() + // Load and expand tables + VBROADCASTI128(*t.loadHi128, t.Hi) + } + if t.loadHi256 != nil { + t.Hi = YMM() + // Load and expand tables + VMOVDQU(*t.loadHi256, t.Hi) + } + if t.useZmmHi != nil { + r := *t.useZmmHi + t.Hi = r.AsY() + } +} + +func (t *table256) prepareLo() { + if t.loadLo128 != nil { + t.Lo = YMM() + // Load and expand tables + VBROADCASTI128(*t.loadLo128, t.Lo) + } + if t.loadLo256 != nil { + t.Lo = YMM() + // Load and expand tables + VMOVDQU(*t.loadLo256, t.Lo) + } + if t.useZmmLo != nil { + r := *t.useZmmLo + t.Lo = r.AsY() + } +} + +// table128 contains memory pointers to tables +type table128 struct { + Lo, Hi Op +} + +type gf16ctx struct { + clrMask reg.VecVirtual + clrMask128 reg.VecVirtual + avx512 bool +} + +func genGF16() { + var ctx gf16ctx + // Ported from static void IFFT_DIT2 + // https://github.com/catid/leopard/blob/master/LeopardFF16.cpp#L629 + { + TEXT("ifftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table256{} + for i, t := range tables { + t.Lo, t.Hi = YMM(), YMM() + // Load and expand tables + VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + tables[i] = t + } + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM() + Label("loop") + VMOVDQU(Mem{Base: x, Disp: 0}, xLo) + VMOVDQU(Mem{Base: x, Disp: 32}, xHi) + VMOVDQU(Mem{Base: y, Disp: 0}, yLo) + VMOVDQU(Mem{Base: y, Disp: 32}, yHi) + VPXOR(yLo, xLo, yLo) + VPXOR(yHi, xHi, yHi) + VMOVDQU(yLo, Mem{Base: y, Disp: 0}) + VMOVDQU(yHi, Mem{Base: y, Disp: 32}) + leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables) + VMOVDQU(xLo, Mem{Base: x, Disp: 0}) + VMOVDQU(xHi, Mem{Base: x, Disp: 32}) + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + VZEROUPPER() + RET() + } + { + TEXT("fftDIT2_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table256{} + for i, t := range tables { + t.Lo, t.Hi = YMM(), YMM() + // Load and expand tables + VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + tables[i] = t + } + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + xLo, xHi, yLo, yHi := YMM(), YMM(), YMM(), YMM() + Label("loop") + VMOVDQU(Mem{Base: x, Disp: 0}, xLo) + VMOVDQU(Mem{Base: x, Disp: 32}, xHi) + VMOVDQU(Mem{Base: y, Disp: 0}, yLo) + VMOVDQU(Mem{Base: y, Disp: 32}, yHi) + + leoMulAdd256(ctx, xLo, xHi, yLo, yHi, tables) + VMOVDQU(xLo, Mem{Base: x, Disp: 0}) + VMOVDQU(xHi, 
Mem{Base: x, Disp: 32}) + + // Reload, or we go beyond 16 regs.. + if true { + yLo, yHi = YMM(), YMM() + VMOVDQU(Mem{Base: y, Disp: 0}, yLo) + VMOVDQU(Mem{Base: y, Disp: 32}, yHi) + } + + VPXOR(yLo, xLo, yLo) + VPXOR(yHi, xHi, yHi) + VMOVDQU(yLo, Mem{Base: y, Disp: 0}) + VMOVDQU(yHi, Mem{Base: y, Disp: 32}) + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + VZEROUPPER() + RET() + } + + { + TEXT("mulgf16_avx2", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table256{} + for i, t := range tables { + t.Lo, t.Hi = YMM(), YMM() + // Load and expand tables + VBROADCASTI128(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + VBROADCASTI128(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + tables[i] = t + } + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + dataLo, dataHi := YMM(), YMM() + Label("loop") + VMOVDQU(Mem{Base: y, Disp: 0}, dataLo) + VMOVDQU(Mem{Base: y, Disp: 32}, dataHi) + + prodLo, prodHi := leoMul256(ctx, dataLo, dataHi, tables) + VMOVDQU(prodLo, Mem{Base: x, Disp: 0}) + VMOVDQU(prodHi, Mem{Base: x, Disp: 32}) + + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + VZEROUPPER() + RET() + } + for _, avx512 := range []bool{true, false} { + // AVX-512 only uses more registers for tables. + var suffix = "avx2" + if avx512 { + suffix = "avx512" + } + ctx.avx512 = avx512 + extZMMs := []reg.VecPhysical{reg.Z16, reg.Z17, reg.Z18, reg.Z19, reg.Z20, reg.Z21, reg.Z22, reg.Z23, reg.Z24, reg.Z25, reg.Z26, reg.Z27, reg.Z28, reg.Z29, reg.Z30, reg.Z31} + { + TEXT("ifftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)")) + Pragma("noescape") + Comment("dist must be multiplied by 24 (size of slice header)") + Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3") + + // Unpack tables to stack. Slower. + const unpackTables = false + + table01Ptr := Load(Param("table01"), GP64()) + table23Ptr := Load(Param("table23"), GP64()) + table02Ptr := Load(Param("table02"), GP64()) + + // Prepare table pointers. 
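+			// Each GF(2^16) multiply below uses four Lo/Hi VPSHUFB lookup pairs,
+			// one per 4-bit nibble of the 16-bit input. With AVX-512 the expanded
+			// tables are parked in Z16-Z31 up front: table02 and table01 fill all
+			// 16 registers, so table23 is re-broadcast from memory on use. With
+			// AVX2 every table is re-broadcast via VBROADCASTI128 inside the loop.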
+ table01 := [4]table256{} + table23 := [4]table256{} + table02 := [4]table256{} + if avx512 { + usedZmm := 0 + fill := func(t *[4]table256, ptr reg.Register) { + for i := range table01 { + t := &t[i] + if len(extZMMs)-usedZmm >= 2 { + tmpLo, tmpHi := YMM(), YMM() + t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1] + usedZmm += 2 + // Load and expand tables + VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo) + VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi) + VMOVAPS(tmpLo.AsZ(), *t.useZmmLo) + VMOVAPS(tmpHi.AsZ(), *t.useZmmHi) + } else { + t.loadLo128 = &Mem{Base: ptr, Disp: i * 16} + t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4} + } + } + } + fill(&table02, table02Ptr) + fill(&table01, table01Ptr) + fill(&table23, table23Ptr) + } + for i := range table01 { + if avx512 { + continue + } + + if unpackTables { + toStack := func(m Mem) *Mem { + stack := AllocLocal(32) + y := YMM() + VBROADCASTI128(m, y) + VMOVDQU(y, stack) + return &stack + } + + table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16}) + table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16}) + table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16}) + + table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4}) + table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4}) + table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4}) + } else { + table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16} + table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16} + table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16} + + table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4} + table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4} + table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4} + } + } + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + dist := Load(Param("dist"), GP64()) + + // Pointers to each "work" + var work [4]reg.GPVirtual + workTable := Load(Param("work").Base(), GP64()) // &work[0] + bytes := GP64() + + // Load length of work[0] + MOVQ(Mem{Base: workTable, Disp: 8}, bytes) + + offset := GP64() + XORQ(offset, offset) + for i := range work { + work[i] = GP64() + // work[i] = &workTable[dist*i] + MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if i < len(work)-1 { + ADDQ(dist, offset) + } + } + var workRegLo [4]reg.VecVirtual + var workRegHi [4]reg.VecVirtual + + workRegLo[0], workRegHi[0] = YMM(), YMM() + workRegLo[1], workRegHi[1] = YMM(), YMM() + + mask := Load(Param("logMask"), GP64()) + Label("loop") + VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0]) + VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0]) + VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1]) + VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1]) + + // First layer: + VPXOR(workRegLo[0], workRegLo[1], workRegLo[1]) + VPXOR(workRegHi[0], workRegHi[1], workRegHi[1]) + + // Test bit 0 + BTQ(U8(0), mask) + JC(LabelRef("skip_m01")) + leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01) + + Label("skip_m01") + workRegLo[2], workRegHi[2] = YMM(), YMM() + workRegLo[3], workRegHi[3] = YMM(), YMM() + VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2]) + VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2]) + VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3]) + VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3]) + + VPXOR(workRegLo[2], workRegLo[3], 
workRegLo[3]) + VPXOR(workRegHi[2], workRegHi[3], workRegHi[3]) + + // Test bit 1 + BTQ(U8(1), mask) + JC(LabelRef("skip_m23")) + leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23) + Label("skip_m23") + + // Second layer: + VPXOR(workRegLo[0], workRegLo[2], workRegLo[2]) + VPXOR(workRegHi[0], workRegHi[2], workRegHi[2]) + VPXOR(workRegLo[1], workRegLo[3], workRegLo[3]) + VPXOR(workRegHi[1], workRegHi[3], workRegHi[3]) + + // Test bit 2 + BTQ(U8(2), mask) + JC(LabelRef("skip_m02")) + leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02) + leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02) + Label("skip_m02") + + // Store + Next loop: + for i := range work { + VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32}) + ADDQ(U8(64), work[i]) + } + + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + VZEROUPPER() + RET() + } + { + TEXT("fftDIT4_"+suffix, attr.NOSPLIT, fmt.Sprintf("func(work [][]byte, dist int, table01 *[8*16]uint8, table23 *[8*16]uint8, table02 *[8*16]uint8, logMask uint8)")) + Pragma("noescape") + Comment("dist must be multiplied by 24 (size of slice header)") + Comment("logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3") + + // Unpack tables to stack. Slower. + const unpackTables = false + + table01Ptr := Load(Param("table01"), GP64()) + table23Ptr := Load(Param("table23"), GP64()) + table02Ptr := Load(Param("table02"), GP64()) + + // Prepare table pointers. + table01 := [4]table256{} + table23 := [4]table256{} + table02 := [4]table256{} + if avx512 { + usedZmm := 0 + fill := func(t *[4]table256, ptr reg.Register) { + for i := range table01 { + t := &t[i] + if len(extZMMs)-usedZmm >= 2 { + tmpLo, tmpHi := YMM(), YMM() + t.useZmmLo, t.useZmmHi = &extZMMs[usedZmm], &extZMMs[usedZmm+1] + usedZmm += 2 + // Load and expand tables + VBROADCASTI128(Mem{Base: ptr, Disp: i * 16}, tmpLo) + VBROADCASTI128(Mem{Base: ptr, Disp: i*16 + 16*4}, tmpHi) + VMOVAPS(tmpLo.AsZ(), *t.useZmmLo) + VMOVAPS(tmpHi.AsZ(), *t.useZmmHi) + } else { + t.loadLo128 = &Mem{Base: ptr, Disp: i * 16} + t.loadHi128 = &Mem{Base: ptr, Disp: i*16 + 16*4} + } + } + } + fill(&table02, table02Ptr) + fill(&table01, table01Ptr) + fill(&table23, table23Ptr) + } + for i := range table01 { + if avx512 { + continue + } + if unpackTables { + toStack := func(m Mem) *Mem { + stack := AllocLocal(32) + y := YMM() + VBROADCASTI128(m, y) + VMOVDQU(y, stack) + return &stack + } + + table01[i].loadLo256 = toStack(Mem{Base: table01Ptr, Disp: i * 16}) + table23[i].loadLo256 = toStack(Mem{Base: table23Ptr, Disp: i * 16}) + table02[i].loadLo256 = toStack(Mem{Base: table02Ptr, Disp: i * 16}) + + table01[i].loadHi256 = toStack(Mem{Base: table01Ptr, Disp: i*16 + 16*4}) + table23[i].loadHi256 = toStack(Mem{Base: table23Ptr, Disp: i*16 + 16*4}) + table02[i].loadHi256 = toStack(Mem{Base: table02Ptr, Disp: i*16 + 16*4}) + } else { + table01[i].loadLo128 = &Mem{Base: table01Ptr, Disp: i * 16} + table23[i].loadLo128 = &Mem{Base: table23Ptr, Disp: i * 16} + table02[i].loadLo128 = &Mem{Base: table02Ptr, Disp: i * 16} + + table01[i].loadHi128 = &Mem{Base: table01Ptr, Disp: i*16 + 16*4} + table23[i].loadHi128 = &Mem{Base: table23Ptr, Disp: i*16 + 16*4} + table02[i].loadHi128 = &Mem{Base: table02Ptr, Disp: i*16 + 16*4} + } + } + // Generate mask + ctx.clrMask = YMM() + tmpMask := GP64() + MOVQ(U32(15), tmpMask) + MOVQ(tmpMask, ctx.clrMask.AsX()) + 
VPBROADCASTB(ctx.clrMask.AsX(), ctx.clrMask) + + dist := Load(Param("dist"), GP64()) + + // Pointers to each "work" + var work [4]reg.GPVirtual + workTable := Load(Param("work").Base(), GP64()) // &work[0] + bytes := GP64() + + // Load length of work[0] + MOVQ(Mem{Base: workTable, Disp: 8}, bytes) + + offset := GP64() + XORQ(offset, offset) + for i := range work { + work[i] = GP64() + // work[i] = &workTable[dist*i] + MOVQ(Mem{Base: workTable, Index: offset, Scale: 1}, work[i]) + if i < len(work)-1 { + ADDQ(dist, offset) + } + } + var workRegLo [4]reg.VecVirtual + var workRegHi [4]reg.VecVirtual + + workRegLo[0], workRegHi[0] = YMM(), YMM() + workRegLo[1], workRegHi[1] = YMM(), YMM() + workRegLo[2], workRegHi[2] = YMM(), YMM() + workRegLo[3], workRegHi[3] = YMM(), YMM() + + mask := Load(Param("logMask"), GP64()) + Label("loop") + VMOVDQU(Mem{Base: work[0], Disp: 0}, workRegLo[0]) + VMOVDQU(Mem{Base: work[0], Disp: 32}, workRegHi[0]) + VMOVDQU(Mem{Base: work[2], Disp: 0}, workRegLo[2]) + VMOVDQU(Mem{Base: work[2], Disp: 32}, workRegHi[2]) + + VMOVDQU(Mem{Base: work[1], Disp: 0}, workRegLo[1]) + VMOVDQU(Mem{Base: work[1], Disp: 32}, workRegHi[1]) + VMOVDQU(Mem{Base: work[3], Disp: 0}, workRegLo[3]) + VMOVDQU(Mem{Base: work[3], Disp: 32}, workRegHi[3]) + + // First layer: + + // Test bit 0 + BTQ(U8(0), mask) + JC(LabelRef("skip_m02")) + leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[2], workRegHi[2], table02) + leoMulAdd256(ctx, workRegLo[1], workRegHi[1], workRegLo[3], workRegHi[3], table02) + + Label("skip_m02") + + VPXOR(workRegLo[0], workRegLo[2], workRegLo[2]) + VPXOR(workRegHi[0], workRegHi[2], workRegHi[2]) + VPXOR(workRegLo[1], workRegLo[3], workRegLo[3]) + VPXOR(workRegHi[1], workRegHi[3], workRegHi[3]) + + // Second layer: + // Test bit 1 + BTQ(U8(1), mask) + JC(LabelRef("skip_m01")) + leoMulAdd256(ctx, workRegLo[0], workRegHi[0], workRegLo[1], workRegHi[1], table01) + Label("skip_m01") + VPXOR(workRegLo[0], workRegLo[1], workRegLo[1]) + VPXOR(workRegHi[0], workRegHi[1], workRegHi[1]) + + // Store... + for i := range work[:2] { + VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32}) + ADDQ(U8(64), work[i]) + } + + // Test bit 2 + BTQ(U8(2), mask) + JC(LabelRef("skip_m23")) + leoMulAdd256(ctx, workRegLo[2], workRegHi[2], workRegLo[3], workRegHi[3], table23) + Label("skip_m23") + VPXOR(workRegLo[2], workRegLo[3], workRegLo[3]) + VPXOR(workRegHi[2], workRegHi[3], workRegHi[3]) + + // Store + Next loop: + for i := range work[2:] { + i := i + 2 + VMOVDQU(workRegLo[i], Mem{Base: work[i], Disp: 0}) + VMOVDQU(workRegHi[i], Mem{Base: work[i], Disp: 32}) + ADDQ(U8(64), work[i]) + } + + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + VZEROUPPER() + RET() + } + } + // SSSE3: + ctx.avx512 = false + { + TEXT("ifftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table128{} + for i, t := range tables { + // We almost have enough space for all tables. + if i > 2 { + t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4} + } else { + t.Lo, t.Hi = XMM(), XMM() + MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + } + tables[i] = t + } + // Generate mask + zero := XMM() + XORPS(zero, zero) // Zero, so bytes will be copied. 
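+		// PSHUFB with an all-zero shuffle control copies byte 0 to every lane, so
+		// the 0x0f constant loaded below becomes a full nibble mask without a
+		// byte-broadcast instruction (which SSSE3 lacks).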
+ fifteen, mask := GP64(), XMM() + MOVQ(U32(0xf), fifteen) + MOVQ(fifteen, mask) + PSHUFB(zero, mask) + ctx.clrMask128 = mask + + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + + Label("loop") + for i := 0; i < 2; i++ { + xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM() + MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo) + MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi) + MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo) + MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi) + PXOR(xLo, yLo) + PXOR(xHi, yHi) + MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0}) + MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32}) + leoMulAdd128(ctx, xLo, xHi, yLo, yHi, tables) + MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0}) + MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32}) + } + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + RET() + } + { + TEXT("fftDIT2_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table128{} + for i, t := range tables { + // We almost have enough space for all tables. + if i > 2 { + t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4} + } else { + t.Lo, t.Hi = XMM(), XMM() + MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + } + tables[i] = t + } + // Generate mask + zero := XMM() + XORPS(zero, zero) // Zero, so bytes will be copied. + fifteen, mask := GP64(), XMM() + MOVQ(U32(0xf), fifteen) + MOVQ(fifteen, mask) + PSHUFB(zero, mask) + ctx.clrMask128 = mask + + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + + Label("loop") + for i := 0; i < 2; i++ { + xLo, xHi, yLo, yHi := XMM(), XMM(), XMM(), XMM() + MOVUPS(Mem{Base: y, Disp: i*16 + 0}, yLo) + MOVUPS(Mem{Base: y, Disp: i*16 + 32}, yHi) + + prodLo, prodHi := leoMul128(ctx, yLo, yHi, tables) + + MOVUPS(Mem{Base: x, Disp: i*16 + 0}, xLo) + MOVUPS(Mem{Base: x, Disp: i*16 + 32}, xHi) + PXOR(prodLo, xLo) + PXOR(prodHi, xHi) + MOVUPS(xLo, Mem{Base: x, Disp: i*16 + 0}) + MOVUPS(xHi, Mem{Base: x, Disp: i*16 + 32}) + + PXOR(xLo, yLo) + PXOR(xHi, yHi) + MOVUPS(yLo, Mem{Base: y, Disp: i*16 + 0}) + MOVUPS(yHi, Mem{Base: y, Disp: i*16 + 32}) + + } + + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + RET() + } + { + TEXT("mulgf16_ssse3", attr.NOSPLIT, fmt.Sprintf("func(x, y []byte, table *[8*16]uint8)")) + Pragma("noescape") + tablePtr := Load(Param("table"), GP64()) + tables := [4]table128{} + for i, t := range tables { + // We have enough space for all tables. + if i > 3 { + t.Lo, t.Hi = Mem{Base: tablePtr, Disp: i * 16}, Mem{Base: tablePtr, Disp: i*16 + 16*4} + } else { + t.Lo, t.Hi = XMM(), XMM() + MOVUPS(Mem{Base: tablePtr, Disp: i * 16}, t.Lo) + MOVUPS(Mem{Base: tablePtr, Disp: i*16 + 16*4}, t.Hi) + } + tables[i] = t + } + bytes := Load(Param("x").Len(), GP64()) + x := Load(Param("x").Base(), GP64()) + y := Load(Param("y").Base(), GP64()) + // Generate mask + zero := XMM() + XORPS(zero, zero) // Zero, so bytes will be copied. 
+ fifteen, mask := GP64(), XMM() + MOVQ(U32(0xf), fifteen) + MOVQ(fifteen, mask) + PSHUFB(zero, mask) + ctx.clrMask128 = mask + + Label("loop") + for i := 0; i < 2; i++ { + dataLo, dataHi := XMM(), XMM() + MOVUPS(Mem{Base: y, Disp: i*16 + 0}, dataLo) + MOVUPS(Mem{Base: y, Disp: i*16 + 32}, dataHi) + + prodLo, prodHi := leoMul128(ctx, dataLo, dataHi, tables) + MOVUPS(prodLo, Mem{Base: x, Disp: i*16 + 0}) + MOVUPS(prodHi, Mem{Base: x, Disp: i*16 + 32}) + } + + ADDQ(U8(64), x) + ADDQ(U8(64), y) + SUBQ(U8(64), bytes) + JNZ(LabelRef("loop")) + + RET() + } + +} + +// xLo, xHi updated, yLo, yHi preserved... +func leoMulAdd256(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table256) { + // inlined: + // prodLo, prodHi := leoMul256(ctx, yLo, yHi, table) + lo := yLo + hi := yHi + data0, data1 := YMM(), YMM() + VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4 + VPAND(ctx.clrMask, lo, data0) // data0 = lo&0xf + VPAND(ctx.clrMask, data1, data1) // data 1 = data1 &0xf + prodLo, prodHi := YMM(), YMM() + table[0].prepare() + VPSHUFB(data0, table[0].Lo, prodLo) + VPSHUFB(data0, table[0].Hi, prodHi) + tmpLo, tmpHi := YMM(), YMM() + table[1].prepare() + VPSHUFB(data1, table[1].Lo, tmpLo) + VPSHUFB(data1, table[1].Hi, tmpHi) + VPXOR(prodLo, tmpLo, prodLo) + VPXOR(prodHi, tmpHi, prodHi) + + // Now process high + data0, data1 = YMM(), YMM() // Realloc to break dep + VPAND(hi, ctx.clrMask, data0) + VPSRLQ(U8(4), hi, data1) + VPAND(ctx.clrMask, data1, data1) + + tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep + table[2].prepare() + VPSHUFB(data0, table[2].Lo, tmpLo) + VPSHUFB(data0, table[2].Hi, tmpHi) + VPXOR(prodLo, tmpLo, prodLo) + VPXOR(prodHi, tmpHi, prodHi) + table[3].prepare() + VPSHUFB(data1, table[3].Lo, tmpLo) + VPSHUFB(data1, table[3].Hi, tmpHi) + if ctx.avx512 { + VPTERNLOGD(U8(0x96), prodLo, tmpLo, xLo) + VPTERNLOGD(U8(0x96), prodHi, tmpHi, xHi) + } else { + VPXOR3way(prodLo, tmpLo, xLo) + VPXOR3way(prodHi, tmpHi, xHi) + } +} + +// leoMul256 lo, hi preserved... 
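+// It computes prodLo = T0.Lo[lo&15] ^ T1.Lo[lo>>4] ^ T2.Lo[hi&15] ^ T3.Lo[hi>>4],
+// and prodHi the same from the .Hi halves, where lo/hi hold the low/high bytes of
+// the 16-bit elements and each lookup is a single VPSHUFB.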
+func leoMul256(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table256) (prodLo, prodHi reg.VecVirtual) { + data0, data1 := YMM(), YMM() + VPSRLQ(U8(4), lo, data1) // data1 = lo >> 4 + VPAND(ctx.clrMask, lo, data0) // data0 = lo&0xf + VPAND(ctx.clrMask, data1, data1) // data 1 = data1 &0xf + prodLo, prodHi = YMM(), YMM() + table[0].prepare() + VPSHUFB(data0, table[0].Lo, prodLo) + VPSHUFB(data0, table[0].Hi, prodHi) + tmpLo, tmpHi := YMM(), YMM() + table[1].prepare() + VPSHUFB(data1, table[1].Lo, tmpLo) + VPSHUFB(data1, table[1].Hi, tmpHi) + VPXOR(prodLo, tmpLo, prodLo) + VPXOR(prodHi, tmpHi, prodHi) + + // Now process high + data0, data1 = YMM(), YMM() // Realloc to break dep + VPAND(hi, ctx.clrMask, data0) + VPSRLQ(U8(4), hi, data1) + VPAND(ctx.clrMask, data1, data1) + + tmpLo, tmpHi = YMM(), YMM() // Realloc to break dep + table[2].prepare() + VPSHUFB(data0, table[2].Lo, tmpLo) + VPSHUFB(data0, table[2].Hi, tmpHi) + VPXOR(prodLo, tmpLo, prodLo) + VPXOR(prodHi, tmpHi, prodHi) + table[3].prepare() + VPSHUFB(data1, table[3].Lo, tmpLo) + VPSHUFB(data1, table[3].Hi, tmpHi) + VPXOR(prodLo, tmpLo, prodLo) + VPXOR(prodHi, tmpHi, prodHi) + return +} + +func leoMulAdd128(ctx gf16ctx, xLo, xHi, yLo, yHi reg.VecVirtual, table [4]table128) { + prodLo, prodHi := leoMul128(ctx, yLo, yHi, table) + PXOR(prodLo, xLo) + PXOR(prodHi, xHi) +} + +// leoMul128 lo, hi preseved (but likely will take extra regs to reuse) +func leoMul128(ctx gf16ctx, lo, hi reg.VecVirtual, table [4]table128) (prodLo, prodHi reg.VecVirtual) { + data0, data1 := XMM(), XMM() + MOVAPS(lo, data1) + PSRLQ(U8(4), data1) // data1 = lo >> 4 + MOVAPS(lo, data0) + PAND(ctx.clrMask128, data0) // data0 = lo&0xf + PAND(ctx.clrMask128, data1) // data 1 = data1 &0xf + prodLo, prodHi = XMM(), XMM() + MOVUPS(table[0].Lo, prodLo) + MOVUPS(table[0].Hi, prodHi) + PSHUFB(data0, prodLo) + PSHUFB(data0, prodHi) + tmpLo, tmpHi := XMM(), XMM() + MOVUPS(table[1].Lo, tmpLo) + MOVUPS(table[1].Hi, tmpHi) + PSHUFB(data1, tmpLo) + PSHUFB(data1, tmpHi) + PXOR(tmpLo, prodLo) + PXOR(tmpHi, prodHi) + + // Now process high + data0, data1 = XMM(), XMM() // Realloc to break dep + MOVAPS(hi, data0) + MOVAPS(hi, data1) + PAND(ctx.clrMask128, data0) + PSRLQ(U8(4), data1) + PAND(ctx.clrMask128, data1) + + tmpLo, tmpHi = XMM(), XMM() // Realloc to break dep + MOVUPS(table[2].Lo, tmpLo) + MOVUPS(table[2].Hi, tmpHi) + PSHUFB(data0, tmpLo) + PSHUFB(data0, tmpHi) + PXOR(tmpLo, prodLo) + PXOR(tmpHi, prodHi) + MOVUPS(table[3].Lo, tmpLo) + MOVUPS(table[3].Hi, tmpHi) + PSHUFB(data1, tmpLo) + PSHUFB(data1, tmpHi) + PXOR(tmpLo, prodLo) + PXOR(tmpHi, prodHi) + return +} diff --git a/galois.go b/galois.go index 30e9e03..703f209 100644 --- a/galois.go +++ b/galois.go @@ -6,6 +6,8 @@ package reedsolomon +import "encoding/binary" + const ( // The number of elements in the field. fieldSize = 256 @@ -929,3 +931,24 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) } return dst } + +// xor slices writing to out. 
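+// The main loop XORs 32 bytes per iteration using four uint64 loads and stores;
+// any remaining tail is XORed byte by byte.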
+func sliceXorGo(in, out []byte, _ *options) { + for len(out) >= 32 { + inS := in[:32] + v0 := binary.LittleEndian.Uint64(out[:8]) ^ binary.LittleEndian.Uint64(inS[:8]) + v1 := binary.LittleEndian.Uint64(out[8:16]) ^ binary.LittleEndian.Uint64(inS[8:16]) + v2 := binary.LittleEndian.Uint64(out[16:24]) ^ binary.LittleEndian.Uint64(inS[16:24]) + v3 := binary.LittleEndian.Uint64(out[24:32]) ^ binary.LittleEndian.Uint64(inS[24:32]) + binary.LittleEndian.PutUint64(out[:8], v0) + binary.LittleEndian.PutUint64(out[8:16], v1) + binary.LittleEndian.PutUint64(out[16:24], v2) + binary.LittleEndian.PutUint64(out[24:32], v3) + out = out[32:] + in = in[32:] + } + out = out[:len(in)] + for n, input := range in { + out[n] ^= input + } +} diff --git a/galois_amd64.go b/galois_amd64.go index d722e31..1eb4465 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -132,9 +132,121 @@ func sliceXor(in, out []byte, o *options) { in = in[done:] out = out[done:] } + } else { + sliceXorGo(in, out, o) + return } out = out[:len(in)] for i := range in { out[i] ^= in[i] } } + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + if o.useAVX2 || o.useAVX512 { + if len(work[0]) > 0 { + var mask uint8 + if log_m01 == modulus { + mask |= 1 << 0 + } + if log_m23 == modulus { + mask |= 1 << 1 + } + if log_m02 == modulus { + mask |= 1 << 2 + } + t01 := &multiply256LUT[log_m01] + t23 := &multiply256LUT[log_m23] + t02 := &multiply256LUT[log_m02] + if o.useAVX512 { + ifftDIT4_avx512(work, dist*24, t01, t23, t02, mask) + } else { + ifftDIT4_avx2(work, dist*24, t01, t23, t02, mask) + } + } + return + } + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + if o.useAVX2 || o.useAVX512 { + if len(work[0]) > 0 { + var mask uint8 + if log_m02 == modulus { + mask |= 1 << 0 + } + if log_m01 == modulus { + mask |= 1 << 1 + } + if log_m23 == modulus { + mask |= 1 << 2 + } + t01 := &multiply256LUT[log_m01] + t23 := &multiply256LUT[log_m23] + t02 := &multiply256LUT[log_m02] + if o.useAVX512 { + fftDIT4_avx512(work, dist*24, t01, t23, t02, mask) + } else { + fftDIT4_avx2(work, dist*24, t01, t23, t02, mask) + } + } + return + } + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + if o.useAVX2 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + fftDIT2_avx2(x, y, tmp) + } + } else if o.useSSSE3 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + fftDIT2_ssse3(x, y, tmp) + } + } else { + // Reference version: + refMulAdd(x, y, log_m) + sliceXor(x, y, o) + } +} + +// 2-way butterfly +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + if o.useAVX2 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + ifftDIT2_avx2(x, y, tmp) + } + } else if o.useSSSE3 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + ifftDIT2_ssse3(x, y, tmp) + } + } else { + // Reference version: + sliceXor(x, y, o) + refMulAdd(x, y, log_m) + } +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + if o.useAVX2 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + mulgf16_avx2(x, y, tmp) + } + } else if o.useSSSE3 { + if len(x) > 0 { + tmp := &multiply256LUT[log_m] + mulgf16_ssse3(x, y, tmp) + } + } else { + refMul(x, y, log_m) + } +} diff --git a/galois_arm64.go b/galois_arm64.go index df79a98..92b67b8 100644 --- a/galois_arm64.go +++ b/galois_arm64.go @@ -64,3 +64,33 @@ func sliceXor(in, out []byte, o *options) { } } } + +// 4-way 
butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference version: + refMulAdd(x, y, log_m) + // 64 byte aligned, always full. + galXorNEON(x, y) +} + +// 2-way butterfly +func ifftDIT2(x, y []byte, log_m ffe, o *options) { + // 64 byte aligned, always full. + galXorNEON(x, y) + // Reference version: + refMulAdd(x, y, log_m) +} + +func mulgf16(x, y []byte, log_m ffe, o *options) { + refMul(x, y, log_m) +} diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 72c4ca4..899dade 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -1176,3 +1176,33 @@ func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. //go:noescape func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +//go:noescape +func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func ifftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) + +//go:noescape +func fftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) + +//go:noescape +func ifftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) + +//go:noescape +func fftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) + +//go:noescape +func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) + +//go:noescape +func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 97ead9c..f0818d7 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -1,7 +1,7 @@ // Code generated by command: go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon. DO NOT EDIT. 
-//go:build !appengine && !noasm && !nogen && gc -// +build !appengine,!noasm,!nogen,gc +// go:build !appengine && !noasm && !nogen && gc +//+build !appengine,!noasm,!nogen,gc #include "textflag.h" @@ -63420,3 +63420,1239 @@ mulAvxTwo_10x10Xor_loop: mulAvxTwo_10x10Xor_end: RET + +// func ifftDIT2_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT2_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPXOR Y11, Y9, Y11 + VPXOR Y12, Y10, Y12 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + VPSRLQ $0x04, Y11, Y13 + VPAND Y8, Y11, Y11 + VPAND Y8, Y13, Y13 + VPSHUFB Y11, Y0, Y14 + VPSHUFB Y11, Y1, Y11 + VPSHUFB Y13, Y2, Y15 + VPSHUFB Y13, Y3, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPAND Y12, Y8, Y13 + VPSRLQ $0x04, Y12, Y12 + VPAND Y8, Y12, Y12 + VPSHUFB Y13, Y4, Y15 + VPSHUFB Y13, Y5, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPSHUFB Y12, Y6, Y15 + VPSHUFB Y12, Y7, Y13 + XOR3WAY( $0x00, Y14, Y15, Y9) + XOR3WAY( $0x00, Y11, Y13, Y10) + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func fftDIT2_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT2_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPSRLQ $0x04, Y11, Y13 + VPAND Y8, Y11, Y11 + VPAND Y8, Y13, Y13 + VPSHUFB Y11, Y0, Y14 + VPSHUFB Y11, Y1, Y11 + VPSHUFB Y13, Y2, Y15 + VPSHUFB Y13, Y3, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPAND Y12, Y8, Y13 + VPSRLQ $0x04, Y12, Y12 + VPAND Y8, Y12, Y12 + VPSHUFB Y13, Y4, Y15 + VPSHUFB Y13, Y5, Y13 + VPXOR Y14, Y15, Y14 + VPXOR Y11, Y13, Y11 + VPSHUFB Y12, Y6, Y15 + VPSHUFB Y12, Y7, Y13 + XOR3WAY( $0x00, Y14, Y15, Y9) + XOR3WAY( $0x00, Y11, Y13, Y10) + VMOVDQU Y9, (CX) + VMOVDQU Y10, 32(CX) + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y12 + VPXOR Y11, Y9, Y11 + VPXOR Y12, Y10, Y12 + VMOVDQU Y11, (DX) + VMOVDQU Y12, 32(DX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func mulgf16_avx2(x []byte, y []byte, table *[128]uint8) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulgf16_avx2(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + VBROADCASTI128 (AX), Y0 + VBROADCASTI128 64(AX), Y1 + VBROADCASTI128 16(AX), Y2 + VBROADCASTI128 80(AX), Y3 + VBROADCASTI128 32(AX), Y4 + VBROADCASTI128 96(AX), Y5 + VBROADCASTI128 48(AX), Y6 + VBROADCASTI128 112(AX), Y7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X8 + VPBROADCASTB X8, Y8 + +loop: + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y10 + VPSRLQ $0x04, Y9, Y11 + VPAND Y8, Y9, Y9 + VPAND Y8, Y11, Y11 + 
VPSHUFB Y9, Y0, Y12 + VPSHUFB Y9, Y1, Y9 + VPSHUFB Y11, Y2, Y13 + VPSHUFB Y11, Y3, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VPAND Y10, Y8, Y11 + VPSRLQ $0x04, Y10, Y10 + VPAND Y8, Y10, Y10 + VPSHUFB Y11, Y4, Y13 + VPSHUFB Y11, Y5, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VPSHUFB Y10, Y6, Y13 + VPSHUFB Y10, Y7, Y11 + VPXOR Y12, Y13, Y12 + VPXOR Y9, Y11, Y9 + VMOVDQU Y12, (CX) + VMOVDQU Y9, 32(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx512(SB), NOSPLIT, $0-57 + // dist must be multiplied by 24 (size of slice header) + // logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3 + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (DX), Y1 + VBROADCASTI128 64(DX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(DX), Y1 + VBROADCASTI128 80(DX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(DX), Y1 + VBROADCASTI128 96(DX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(DX), Y1 + VBROADCASTI128 112(DX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ AX, SI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (DX)(SI*1), R9 + ADDQ AX, SI + MOVQ (DX)(SI*1), AX + MOVBQZX logMask+56(FP), DX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + BTQ $0x00, DX + JC skip_m01 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VPSHUFB Y5, Y24, Y7 + VPSHUFB Y5, Y25, Y5 + VPSHUFB Y6, Y26, Y8 + VPSHUFB Y6, Y27, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VPSHUFB Y6, Y28, Y9 + VPSHUFB Y6, Y29, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VPSHUFB Y8, Y30, Y9 + VPSHUFB Y8, Y31, Y6 + VPTERNLOGD $0x96, Y7, Y9, Y1 + VPTERNLOGD $0x96, Y5, Y6, Y2 + +skip_m01: + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + BTQ $0x01, DX + JC skip_m23 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y5 + VPTERNLOGD $0x96, Y9, Y10, Y6 + +skip_m23: + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + BTQ $0x02, DX + JC skip_m02 
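+	// log_m02 != kModulus: multiply work[2] and work[3] by table02 (resident in
+	// Z16-Z23) and fold the products into work[0] and work[1]; VPTERNLOGD $0x96
+	// is a single-instruction three-way XOR.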
+ VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +skip_m02: + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx512(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx512(SB), NOSPLIT, $0-57 + // dist must be multiplied by 24 (size of slice header) + // logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3 + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + VBROADCASTI128 (DX), Y1 + VBROADCASTI128 64(DX), Y0 + VMOVAPS Z1, Z16 + VMOVAPS Z0, Z17 + VBROADCASTI128 16(DX), Y1 + VBROADCASTI128 80(DX), Y0 + VMOVAPS Z1, Z18 + VMOVAPS Z0, Z19 + VBROADCASTI128 32(DX), Y1 + VBROADCASTI128 96(DX), Y0 + VMOVAPS Z1, Z20 + VMOVAPS Z0, Z21 + VBROADCASTI128 48(DX), Y1 + VBROADCASTI128 112(DX), Y0 + VMOVAPS Z1, Z22 + VMOVAPS Z0, Z23 + VBROADCASTI128 (AX), Y1 + VBROADCASTI128 64(AX), Y0 + VMOVAPS Z1, Z24 + VMOVAPS Z0, Z25 + VBROADCASTI128 16(AX), Y1 + VBROADCASTI128 80(AX), Y0 + VMOVAPS Z1, Z26 + VMOVAPS Z0, Z27 + VBROADCASTI128 32(AX), Y1 + VBROADCASTI128 96(AX), Y0 + VMOVAPS Z1, Z28 + VMOVAPS Z0, Z29 + VBROADCASTI128 48(AX), Y1 + VBROADCASTI128 112(AX), Y0 + VMOVAPS Z1, Z30 + VMOVAPS Z0, Z31 + MOVQ $0x0000000f, AX + MOVQ AX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), AX + MOVQ work_base+0(FP), DX + MOVQ 8(DX), BX + XORQ SI, SI + MOVQ (DX)(SI*1), DI + ADDQ AX, SI + MOVQ (DX)(SI*1), R8 + ADDQ AX, SI + MOVQ (DX)(SI*1), R9 + ADDQ AX, SI + MOVQ (DX)(SI*1), AX + MOVBQZX logMask+56(FP), DX + +loop: + VMOVDQU (DI), Y1 + VMOVDQU 32(DI), Y2 + VMOVDQU (R9), Y5 + VMOVDQU 32(R9), Y6 + VMOVDQU (R8), Y3 + VMOVDQU 32(R8), Y4 + VMOVDQU (AX), Y7 + VMOVDQU 32(AX), Y8 + BTQ $0x00, DX + JC skip_m02 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y16, Y11 + VPSHUFB Y9, Y17, Y9 + VPSHUFB Y10, Y18, Y12 + VPSHUFB Y10, Y19, Y10 + VPXOR Y11, Y12, Y11 
+ VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y20, Y13 + VPSHUFB Y10, Y21, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y22, Y13 + VPSHUFB Y12, Y23, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y3 + VPTERNLOGD $0x96, Y9, Y10, Y4 + +skip_m02: + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + BTQ $0x01, DX + JC skip_m01 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VPSHUFB Y9, Y24, Y11 + VPSHUFB Y9, Y25, Y9 + VPSHUFB Y10, Y26, Y12 + VPSHUFB Y10, Y27, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VPSHUFB Y10, Y28, Y13 + VPSHUFB Y10, Y29, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VPSHUFB Y12, Y30, Y13 + VPSHUFB Y12, Y31, Y10 + VPTERNLOGD $0x96, Y11, Y13, Y1 + VPTERNLOGD $0x96, Y9, Y10, Y2 + +skip_m01: + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (DI) + VMOVDQU Y2, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y3, (R8) + VMOVDQU Y4, 32(R8) + ADDQ $0x40, R8 + BTQ $0x02, DX + JC skip_m23 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 64(CX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(CX), Y4 + VBROADCASTI128 80(CX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(CX), Y9 + VBROADCASTI128 96(CX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(CX), Y9 + VBROADCASTI128 112(CX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + VPTERNLOGD $0x96, Y3, Y9, Y5 + VPTERNLOGD $0x96, Y1, Y2, Y6 + +skip_m23: + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R9) + VMOVDQU Y6, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y7, (AX) + VMOVDQU Y8, 32(AX) + ADDQ $0x40, AX + SUBQ $0x40, BX + JNZ loop + VZEROUPPER + RET + +// func ifftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·ifftDIT4_avx2(SB), NOSPLIT, $0-57 + // dist must be multiplied by 24 (size of slice header) + // logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3 + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), BX + MOVQ work_base+0(FP), SI + MOVQ 8(SI), DI + XORQ R8, R8 + MOVQ (SI)(R8*1), R9 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R10 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R11 + ADDQ BX, R8 + MOVQ (SI)(R8*1), BX + MOVBQZX logMask+56(FP), SI + +loop: + VMOVDQU (R9), Y1 + VMOVDQU 32(R9), Y2 + VMOVDQU (R10), Y3 + VMOVDQU 32(R10), Y4 + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + BTQ $0x00, SI + JC skip_m01 + VPSRLQ $0x04, Y3, Y6 + VPAND Y0, Y3, Y5 + VPAND Y0, Y6, Y6 + VBROADCASTI128 (AX), Y7 + VBROADCASTI128 64(AX), Y8 + VPSHUFB Y5, Y7, Y7 + VPSHUFB Y5, Y8, Y5 + VBROADCASTI128 16(AX), Y8 + VBROADCASTI128 80(AX), Y9 + VPSHUFB Y6, Y8, Y8 + VPSHUFB Y6, Y9, Y6 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y6, Y5 + VPAND Y4, Y0, Y6 + VPSRLQ $0x04, Y4, Y8 + VPAND Y0, Y8, Y8 + VBROADCASTI128 32(AX), Y9 + VBROADCASTI128 96(AX), Y10 + VPSHUFB Y6, Y9, Y9 + VPSHUFB Y6, Y10, Y6 + VPXOR Y7, Y9, Y7 + VPXOR Y5, Y6, Y5 + VBROADCASTI128 48(AX), Y9 + VBROADCASTI128 112(AX), Y6 + VPSHUFB Y8, Y9, Y9 + VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y7, Y9, Y1) + XOR3WAY( $0x00, Y5, Y6, Y2) + +skip_m01: + VMOVDQU (R11), Y5 + VMOVDQU 32(R11), Y6 + 
VMOVDQU (BX), Y7 + VMOVDQU 32(BX), Y8 + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + BTQ $0x01, SI + JC skip_m23 + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (CX), Y11 + VBROADCASTI128 64(CX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(CX), Y12 + VBROADCASTI128 80(CX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(CX), Y13 + VBROADCASTI128 96(CX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(CX), Y13 + VBROADCASTI128 112(CX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y5) + XOR3WAY( $0x00, Y9, Y10, Y6) + +skip_m23: + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + BTQ $0x02, SI + JC skip_m02 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + +skip_m02: + VMOVDQU Y1, (R9) + VMOVDQU Y2, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y3, (R10) + VMOVDQU Y4, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y5, (R11) + VMOVDQU Y6, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y7, (BX) + VMOVDQU Y8, 32(BX) + ADDQ $0x40, BX + SUBQ $0x40, DI + JNZ loop + VZEROUPPER + RET + +// func fftDIT4_avx2(work [][]byte, dist int, table01 *[128]uint8, table23 *[128]uint8, table02 *[128]uint8, logMask uint8) +// Requires: AVX, AVX2, AVX512F, AVX512VL, SSE2 +TEXT ·fftDIT4_avx2(SB), NOSPLIT, $0-57 + // dist must be multiplied by 24 (size of slice header) + // logmask must be log_m01==kModulus, log_m23==kModulus, log_m02==kModulus from lowest to bit 3 + MOVQ table01+32(FP), AX + MOVQ table23+40(FP), CX + MOVQ table02+48(FP), DX + MOVQ $0x0000000f, BX + MOVQ BX, X0 + VPBROADCASTB X0, Y0 + MOVQ dist+24(FP), BX + MOVQ work_base+0(FP), SI + MOVQ 8(SI), DI + XORQ R8, R8 + MOVQ (SI)(R8*1), R9 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R10 + ADDQ BX, R8 + MOVQ (SI)(R8*1), R11 + ADDQ BX, R8 + MOVQ (SI)(R8*1), BX + MOVBQZX logMask+56(FP), SI + +loop: + VMOVDQU (R9), Y1 + VMOVDQU 32(R9), Y2 + VMOVDQU (R11), Y5 + VMOVDQU 32(R11), Y6 + VMOVDQU (R10), Y3 + VMOVDQU 32(R10), Y4 + VMOVDQU (BX), Y7 + VMOVDQU 32(BX), Y8 + BTQ $0x00, SI + JC 
skip_m02 + VPSRLQ $0x04, Y5, Y10 + VPAND Y0, Y5, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y6, Y0, Y10 + VPSRLQ $0x04, Y6, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + VPSRLQ $0x04, Y7, Y10 + VPAND Y0, Y7, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (DX), Y11 + VBROADCASTI128 64(DX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(DX), Y12 + VBROADCASTI128 80(DX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y8, Y0, Y10 + VPSRLQ $0x04, Y8, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(DX), Y13 + VBROADCASTI128 96(DX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(DX), Y13 + VBROADCASTI128 112(DX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y3) + XOR3WAY( $0x00, Y9, Y10, Y4) + +skip_m02: + VPXOR Y1, Y5, Y5 + VPXOR Y2, Y6, Y6 + VPXOR Y3, Y7, Y7 + VPXOR Y4, Y8, Y8 + BTQ $0x01, SI + JC skip_m01 + VPSRLQ $0x04, Y3, Y10 + VPAND Y0, Y3, Y9 + VPAND Y0, Y10, Y10 + VBROADCASTI128 (AX), Y11 + VBROADCASTI128 64(AX), Y12 + VPSHUFB Y9, Y11, Y11 + VPSHUFB Y9, Y12, Y9 + VBROADCASTI128 16(AX), Y12 + VBROADCASTI128 80(AX), Y13 + VPSHUFB Y10, Y12, Y12 + VPSHUFB Y10, Y13, Y10 + VPXOR Y11, Y12, Y11 + VPXOR Y9, Y10, Y9 + VPAND Y4, Y0, Y10 + VPSRLQ $0x04, Y4, Y12 + VPAND Y0, Y12, Y12 + VBROADCASTI128 32(AX), Y13 + VBROADCASTI128 96(AX), Y14 + VPSHUFB Y10, Y13, Y13 + VPSHUFB Y10, Y14, Y10 + VPXOR Y11, Y13, Y11 + VPXOR Y9, Y10, Y9 + VBROADCASTI128 48(AX), Y13 + VBROADCASTI128 112(AX), Y10 + VPSHUFB Y12, Y13, Y13 + VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y11, Y13, Y1) + XOR3WAY( $0x00, Y9, Y10, Y2) + +skip_m01: + VPXOR Y1, Y3, Y3 + VPXOR Y2, Y4, Y4 + VMOVDQU Y1, (R9) + VMOVDQU Y2, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y3, (R10) + VMOVDQU Y4, 32(R10) + ADDQ $0x40, R10 + BTQ $0x02, SI + JC skip_m23 + VPSRLQ $0x04, Y7, Y2 + VPAND Y0, Y7, Y1 + VPAND Y0, Y2, Y2 + VBROADCASTI128 (CX), Y3 + VBROADCASTI128 64(CX), Y4 + VPSHUFB Y1, Y3, Y3 + VPSHUFB Y1, Y4, Y1 + VBROADCASTI128 16(CX), Y4 + VBROADCASTI128 80(CX), Y9 + VPSHUFB Y2, Y4, Y4 + VPSHUFB Y2, Y9, Y2 + VPXOR Y3, Y4, Y3 + VPXOR Y1, Y2, Y1 + VPAND Y8, Y0, Y2 + VPSRLQ $0x04, Y8, Y4 + VPAND Y0, Y4, Y4 + VBROADCASTI128 32(CX), Y9 + VBROADCASTI128 96(CX), Y10 + VPSHUFB Y2, Y9, Y9 + VPSHUFB Y2, Y10, Y2 + VPXOR Y3, Y9, Y3 + VPXOR Y1, Y2, Y1 + VBROADCASTI128 48(CX), Y9 + VBROADCASTI128 112(CX), Y2 + VPSHUFB Y4, Y9, Y9 + VPSHUFB Y4, Y2, Y2 + XOR3WAY( $0x00, Y3, Y9, Y5) + XOR3WAY( $0x00, Y1, Y2, Y6) + +skip_m23: + VPXOR Y5, Y7, Y7 + VPXOR Y6, Y8, Y8 + VMOVDQU Y5, (R11) + VMOVDQU Y6, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y7, (BX) + VMOVDQU Y8, 32(BX) + ADDQ $0x40, BX + SUBQ $0x40, DI + JNZ loop + VZEROUPPER + RET + +// func ifftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·ifftDIT2_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 
96(AX), X5 + XORPS X6, X6 + MOVQ $0x0000000f, CX + MOVQ CX, X7 + PSHUFB X6, X7 + MOVQ x_len+8(FP), CX + MOVQ x_base+0(FP), DX + MOVQ y_base+24(FP), BX + +loop: + MOVUPS (DX), X6 + MOVUPS 32(DX), X8 + MOVUPS (BX), X9 + MOVUPS 32(BX), X10 + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, (BX) + MOVUPS X10, 32(BX) + MOVAPS X9, X11 + PSRLQ $0x04, X11 + MOVAPS X9, X9 + PAND X7, X9 + PAND X7, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X9, X12 + PSHUFB X9, X13 + MOVUPS X2, X9 + MOVUPS X3, X14 + PSHUFB X11, X9 + PSHUFB X11, X14 + PXOR X9, X12 + PXOR X14, X13 + MOVAPS X10, X9 + MOVAPS X10, X10 + PAND X7, X9 + PSRLQ $0x04, X10 + PAND X7, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X9, X11 + PSHUFB X9, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS 48(AX), X11 + MOVUPS 112(AX), X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + PXOR X12, X6 + PXOR X13, X8 + MOVUPS X6, (DX) + MOVUPS X8, 32(DX) + MOVUPS 16(DX), X6 + MOVUPS 48(DX), X8 + MOVUPS 16(BX), X9 + MOVUPS 48(BX), X10 + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, 16(BX) + MOVUPS X10, 48(BX) + MOVAPS X9, X11 + PSRLQ $0x04, X11 + MOVAPS X9, X9 + PAND X7, X9 + PAND X7, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X9, X12 + PSHUFB X9, X13 + MOVUPS X2, X9 + MOVUPS X3, X14 + PSHUFB X11, X9 + PSHUFB X11, X14 + PXOR X9, X12 + PXOR X14, X13 + MOVAPS X10, X9 + MOVAPS X10, X10 + PAND X7, X9 + PSRLQ $0x04, X10 + PAND X7, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X9, X11 + PSHUFB X9, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS 48(AX), X11 + MOVUPS 112(AX), X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + PXOR X12, X6 + PXOR X13, X8 + MOVUPS X6, 16(DX) + MOVUPS X8, 48(DX) + ADDQ $0x40, DX + ADDQ $0x40, BX + SUBQ $0x40, CX + JNZ loop + RET + +// func fftDIT2_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·fftDIT2_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 96(AX), X5 + XORPS X6, X6 + MOVQ $0x0000000f, CX + MOVQ CX, X7 + PSHUFB X6, X7 + MOVQ x_len+8(FP), CX + MOVQ x_base+0(FP), DX + MOVQ y_base+24(FP), BX + +loop: + MOVUPS (BX), X9 + MOVUPS 32(BX), X10 + MOVAPS X9, X8 + PSRLQ $0x04, X8 + MOVAPS X9, X6 + PAND X7, X6 + PAND X7, X8 + MOVUPS X0, X11 + MOVUPS X1, X12 + PSHUFB X6, X11 + PSHUFB X6, X12 + MOVUPS X2, X6 + MOVUPS X3, X13 + PSHUFB X8, X6 + PSHUFB X8, X13 + PXOR X6, X11 + PXOR X13, X12 + MOVAPS X10, X6 + MOVAPS X10, X8 + PAND X7, X6 + PSRLQ $0x04, X8 + PAND X7, X8 + MOVUPS X4, X13 + MOVUPS X5, X14 + PSHUFB X6, X13 + PSHUFB X6, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 48(AX), X13 + MOVUPS 112(AX), X14 + PSHUFB X8, X13 + PSHUFB X8, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS (DX), X6 + MOVUPS 32(DX), X8 + PXOR X11, X6 + PXOR X12, X8 + MOVUPS X6, (DX) + MOVUPS X8, 32(DX) + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, (BX) + MOVUPS X10, 32(BX) + MOVUPS 16(BX), X9 + MOVUPS 48(BX), X10 + MOVAPS X9, X8 + PSRLQ $0x04, X8 + MOVAPS X9, X6 + PAND X7, X6 + PAND X7, X8 + MOVUPS X0, X11 + MOVUPS X1, X12 + PSHUFB X6, X11 + PSHUFB X6, X12 + MOVUPS X2, X6 + MOVUPS X3, X13 + PSHUFB X8, X6 + PSHUFB X8, X13 + PXOR X6, X11 + PXOR X13, X12 + MOVAPS X10, X6 + MOVAPS X10, X8 + PAND X7, X6 + PSRLQ $0x04, X8 + PAND X7, X8 + MOVUPS X4, X13 + MOVUPS X5, X14 + PSHUFB X6, X13 + PSHUFB X6, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 48(AX), X13 + MOVUPS 112(AX), X14 + PSHUFB X8, X13 + PSHUFB X8, X14 + PXOR X13, X11 + PXOR X14, X12 + MOVUPS 16(DX), X6 + MOVUPS 48(DX), X8 + PXOR X11, X6 + PXOR 
X12, X8 + MOVUPS X6, 16(DX) + MOVUPS X8, 48(DX) + PXOR X6, X9 + PXOR X8, X10 + MOVUPS X9, 16(BX) + MOVUPS X10, 48(BX) + ADDQ $0x40, DX + ADDQ $0x40, BX + SUBQ $0x40, CX + JNZ loop + RET + +// func mulgf16_ssse3(x []byte, y []byte, table *[128]uint8) +// Requires: SSE, SSE2, SSSE3 +TEXT ·mulgf16_ssse3(SB), NOSPLIT, $0-56 + MOVQ table+48(FP), AX + MOVUPS (AX), X0 + MOVUPS 64(AX), X1 + MOVUPS 16(AX), X2 + MOVUPS 80(AX), X3 + MOVUPS 32(AX), X4 + MOVUPS 96(AX), X5 + MOVUPS 48(AX), X6 + MOVUPS 112(AX), X7 + MOVQ x_len+8(FP), AX + MOVQ x_base+0(FP), CX + MOVQ y_base+24(FP), DX + XORPS X8, X8 + MOVQ $0x0000000f, BX + MOVQ BX, X9 + PSHUFB X8, X9 + +loop: + MOVUPS (DX), X8 + MOVUPS 32(DX), X10 + MOVAPS X8, X11 + PSRLQ $0x04, X11 + MOVAPS X8, X8 + PAND X9, X8 + PAND X9, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X8, X12 + PSHUFB X8, X13 + MOVUPS X2, X8 + MOVUPS X3, X14 + PSHUFB X11, X8 + PSHUFB X11, X14 + PXOR X8, X12 + PXOR X14, X13 + MOVAPS X10, X8 + MOVAPS X10, X10 + PAND X9, X8 + PSRLQ $0x04, X10 + PAND X9, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X8, X11 + PSHUFB X8, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X6, X11 + MOVUPS X7, X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X12, (CX) + MOVUPS X13, 32(CX) + MOVUPS 16(DX), X8 + MOVUPS 48(DX), X10 + MOVAPS X8, X11 + PSRLQ $0x04, X11 + MOVAPS X8, X8 + PAND X9, X8 + PAND X9, X11 + MOVUPS X0, X12 + MOVUPS X1, X13 + PSHUFB X8, X12 + PSHUFB X8, X13 + MOVUPS X2, X8 + MOVUPS X3, X14 + PSHUFB X11, X8 + PSHUFB X11, X14 + PXOR X8, X12 + PXOR X14, X13 + MOVAPS X10, X8 + MOVAPS X10, X10 + PAND X9, X8 + PSRLQ $0x04, X10 + PAND X9, X10 + MOVUPS X4, X11 + MOVUPS X5, X14 + PSHUFB X8, X11 + PSHUFB X8, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X6, X11 + MOVUPS X7, X14 + PSHUFB X10, X11 + PSHUFB X10, X14 + PXOR X11, X12 + PXOR X14, X13 + MOVUPS X12, 16(CX) + MOVUPS X13, 48(CX) + ADDQ $0x40, CX + ADDQ $0x40, DX + SUBQ $0x40, AX + JNZ loop + RET diff --git a/galois_noasm.go b/galois_noasm.go index 7ef78f8..47e24d7 100644 --- a/galois_noasm.go +++ b/galois_noasm.go @@ -7,8 +7,6 @@ package reedsolomon -import "encoding/binary" - func galMulSlice(c byte, in, out []byte, o *options) { out = out[:len(in)] if c == 1 { @@ -34,25 +32,38 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { } // simple slice xor -func sliceXor(in, out []byte, _ *options) { - for len(out) >= 32 { - inS := in[:32] - v0 := binary.LittleEndian.Uint64(out[:]) ^ binary.LittleEndian.Uint64(inS[:]) - v1 := binary.LittleEndian.Uint64(out[8:]) ^ binary.LittleEndian.Uint64(inS[8:]) - v2 := binary.LittleEndian.Uint64(out[16:]) ^ binary.LittleEndian.Uint64(inS[16:]) - v3 := binary.LittleEndian.Uint64(out[24:]) ^ binary.LittleEndian.Uint64(inS[24:]) - binary.LittleEndian.PutUint64(out[:], v0) - binary.LittleEndian.PutUint64(out[8:], v1) - binary.LittleEndian.PutUint64(out[16:], v2) - binary.LittleEndian.PutUint64(out[24:], v3) - out = out[32:] - in = in[32:] - } - for n, input := range in { - out[n] ^= input - } +func sliceXor(in, out []byte, o *options) { + sliceXorGo(in, out, o) } func init() { defaultOptions.useAVX512 = false } + +// 4-way butterfly +func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 4-way butterfly +func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) { + fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o) +} + +// 2-way butterfly forward +func fftDIT2(x, y []byte, log_m ffe, o *options) { + // Reference 
version:
+	refMulAdd(x, y, log_m)
+	sliceXorGo(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	sliceXorGo(x, y, o)
+	refMulAdd(x, y, log_m)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+	refMul(x, y, log_m)
+}
diff --git a/galois_ppc64le.go b/galois_ppc64le.go
index 52e8c23..415828a 100644
--- a/galois_ppc64le.go
+++ b/galois_ppc64le.go
@@ -72,3 +72,31 @@ func sliceXor(in, out []byte, o *options) {
 		out[n] ^= input
 	}
 }
+
+// 4-way butterfly
+func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	ifftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 4-way butterfly
+func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+	fftDIT4Ref(work, dist, log_m01, log_m23, log_m02, o)
+}
+
+// 2-way butterfly forward
+func fftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	refMulAdd(x, y, log_m)
+	sliceXor(x, y, o)
+}
+
+// 2-way butterfly inverse
+func ifftDIT2(x, y []byte, log_m ffe, o *options) {
+	// Reference version:
+	sliceXor(x, y, o)
+	refMulAdd(x, y, log_m)
+}
+
+func mulgf16(x, y []byte, log_m ffe, o *options) {
+	refMul(x, y, log_m)
+}
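// The reference 2-way butterflies above are mutual inverses: fftDIT2 does
// x ^= y*log_m followed by y ^= x, and ifftDIT2 applies the same two steps
// in reverse order. A minimal in-package sketch (butterflyRoundTrip is a
// hypothetical helper, not part of the patch; it only assumes the
// fftDIT2/ifftDIT2 fallbacks and the ffe/options types shown in this diff,
// plus the standard-library "bytes" package for the comparison):
func butterflyRoundTrip(x, y []byte, logM ffe, o *options) bool {
	x0 := append([]byte(nil), x...)
	y0 := append([]byte(nil), y...)

	fftDIT2(x, y, logM, o)  // x ^= y*logM, then y ^= x
	ifftDIT2(x, y, logM, o) // y ^= x, then x ^= y*logM

	// Both buffers are back to their original contents.
	return bytes.Equal(x, x0) && bytes.Equal(y, y0)
}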
diff --git a/leopard.go b/leopard.go
index 9181dfc..dc53a00 100644
--- a/leopard.go
+++ b/leopard.go
@@ -17,6 +17,8 @@ import (
 	"math/bits"
 	"sync"
 	"unsafe"
+
+	"github.com/klauspost/cpuid/v2"
 )
 
 // reedSolomonFF16 is like reedSolomon but for more than 256 total shards.
@@ -25,6 +27,8 @@ type reedSolomonFF16 struct {
 	ParityShards int // Number of parity shards, should not be modified.
 	Shards       int // Total number of shards. Calculated, and should not be modified.
 
+	workPool sync.Pool
+
 	o options
 }
 
@@ -77,9 +81,15 @@ var (
 var mul16LUTs *[order]mul16LUT
 
 type mul16LUT struct {
-	LUT [4 * 16]ffe
+	// Contains Lo product as a single lookup.
+	// Should be XORed with Hi lookup for result.
+	Lo [256]ffe
+	Hi [256]ffe
 }
 
+// Stores lookup for avx2
+var multiply256LUT *[order][8 * 16]byte
+
 func (r *reedSolomonFF16) Encode(shards [][]byte) error {
 	if len(shards) != r.Shards {
 		return ErrTooFewShards
@@ -98,11 +108,23 @@ func (r *reedSolomonFF16) encode(shards [][]byte) error {
 	}
 
 	m := ceilPow2(r.ParityShards)
-
-	work := make([][]byte, m*2)
-	for i := range work {
-		work[i] = make([]byte, shardSize)
+	var work [][]byte
+	if w, ok := r.workPool.Get().([][]byte); ok {
+		work = w
 	}
+	if cap(work) >= m*2 {
+		work = work[:m*2]
+	} else {
+		work = make([][]byte, m*2)
+	}
+	for i := range work {
+		if cap(work[i]) < shardSize {
+			work[i] = make([]byte, shardSize)
+		} else {
+			work[i] = work[i][:shardSize]
+		}
+	}
+	defer r.workPool.Put(work)
 
 	mtrunc := m
 	if r.DataShards < mtrunc {
@@ -245,7 +267,7 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
 		return err
 	}
 
-	shardSize := len(shards[0])
+	shardSize := shardSize(shards)
 	if shardSize%64 != 0 {
 		return ErrShardSize
 	}
@@ -278,16 +300,29 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
 	fwht(errLocs[:], order, order)
 
-	work := make([][]byte, n)
-	for i := range work {
-		work[i] = make([]byte, shardSize)
+	var work [][]byte
+	if w, ok := r.workPool.Get().([][]byte); ok {
+		work = w
 	}
+	if cap(work) >= n {
+		work = work[:n]
+	} else {
+		work = make([][]byte, n)
+	}
+	for i := range work {
+		if cap(work[i]) < shardSize {
+			work[i] = make([]byte, shardSize)
+		} else {
+			work[i] = work[i][:shardSize]
+		}
+	}
+	defer r.workPool.Put(work)
 
 	// work <- recovery data
 
 	for i := 0; i < r.ParityShards; i++ {
 		if len(shards[i+r.DataShards]) != 0 {
-			mul(work[i], shards[i+r.DataShards], errLocs[i])
+			mulgf16(work[i], shards[i+r.DataShards], errLocs[i], &r.o)
 		} else {
 			memclr(work[i])
 		}
@@ -300,7 +335,7 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
 
 	for i := 0; i < r.DataShards; i++ {
 		if len(shards[i]) != 0 {
-			mul(work[m+i], shards[i], errLocs[m+i])
+			mulgf16(work[m+i], shards[i], errLocs[m+i], &r.o)
 		} else {
 			memclr(work[m+i])
 		}
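// The encode and reconstruct hunks above use the same get-or-grow pattern
// around r.workPool. A hypothetical standalone version of that pattern
// (getWork is not part of the patch; sync.Pool and the shard layout are as
// in leopard.go):
func getWork(pool *sync.Pool, n, shardSize int) [][]byte {
	var work [][]byte
	if w, ok := pool.Get().([][]byte); ok {
		work = w
	}
	if cap(work) >= n {
		work = work[:n] // reuse the pooled outer slice
	} else {
		work = make([][]byte, n)
	}
	for i := range work {
		if cap(work[i]) < shardSize {
			work[i] = make([]byte, shardSize) // grow this shard buffer
		} else {
			work[i] = work[i][:shardSize] // reuse existing capacity
		}
	}
	// Callers hand the buffers back with pool.Put(work) when done,
	// mirroring the deferred r.workPool.Put(work) above.
	return work
}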
@@ -353,13 +388,12 @@ func (r *reedSolomonFF16) reconstruct(shards [][]byte, recoverAll bool) error {
 		}
 		if i >= r.DataShards {
 			// Parity shard.
-			mul(shards[i], work[i-r.DataShards], modulus-errLocs[i-r.DataShards])
+			mulgf16(shards[i], work[i-r.DataShards], modulus-errLocs[i-r.DataShards], &r.o)
 		} else {
 			// Data shard.
-			mul(shards[i], work[i+m], modulus-errLocs[i+m])
+			mulgf16(shards[i], work[i+m], modulus-errLocs[i+m], &r.o)
 		}
 	}
-
 	return nil
 }
@@ -453,7 +487,7 @@ func fftDIT(work [][]byte, mtrunc, m int, skewLUT []ffe, o *options) {
 }
 
 // 4-way butterfly
-func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+func fftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
 	// First layer:
 	if log_m02 == modulus {
 		sliceXor(work[0], work[dist*2], o)
@@ -477,13 +511,6 @@ func fftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options)
 	}
 }
 
-// 2-way butterfly
-func fftDIT2(x, y []byte, log_m ffe, o *options) {
-	// Reference version:
-	refMulAdd(x, y, log_m)
-	sliceXor(x, y, o)
-}
-
 // Unrolled IFFT for encoder
 func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m int, skewLUT []ffe, o *options) {
 	// I tried rolling the memcpy/memset into the first layer of the FFT and
@@ -556,8 +583,7 @@ func ifftDITEncoder(data [][]byte, mtrunc int, work [][]byte, xorRes [][]byte, m
 	}
 }
 
-// 4-way butterfly
-func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
+func ifftDIT4Ref(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options) {
 	// First layer:
 	if log_m01 == modulus {
 		sliceXor(work[0], work[dist], o)
@@ -581,31 +607,24 @@ func ifftDIT4(work [][]byte, dist int, log_m01, log_m23, log_m02 ffe, o *options
 	}
 }
 
-// 2-way butterfly
-func ifftDIT2(x, y []byte, log_m ffe, o *options) {
-	// Reference version:
-	sliceXor(x, y, o)
-	refMulAdd(x, y, log_m)
-}
-
 // Reference version of muladd: x[] ^= y[] * log_m
 func refMulAdd(x, y []byte, log_m ffe) {
-	lut := mul16LUTs[log_m].LUT
+	lut := &mul16LUTs[log_m]
 
-	for off := 0; off < len(x); off += 64 {
-		for i := 0; i < 32; i++ {
-			lo := y[off+i]
-			hi := y[off+i+32]
+	for len(x) >= 64 {
+		// Assert sizes for no bounds checks in loop
+		hiA := y[32:64]
+		loA := y[:32]
+		dst := x[:64] // Needed, but not checked...
+		for i, lo := range loA {
+			hi := hiA[i]
+			prod := lut.Lo[lo] ^ lut.Hi[hi]
-			prod :=
-				lut[(lo&15)] ^
-					lut[(lo>>4)+16] ^
-					lut[(hi&15)+32] ^
-					lut[(hi>>4)+48]
-
-			x[off+i] ^= byte(prod)
-			x[off+i+32] ^= byte(prod >> 8)
+			dst[i] ^= byte(prod)
+			dst[i+32] ^= byte(prod >> 8)
 		}
+		x = x[64:]
+		y = y[64:]
 	}
 }
@@ -622,24 +641,17 @@ func slicesXor(v1, v2 [][]byte, o *options) {
 	}
 }
 
-func mul(x, y []byte, log_m ffe) {
-	refMul(x, y, log_m)
-}
-
 // Reference version of mul: x[] = y[] * log_m
 func refMul(x, y []byte, log_m ffe) {
-	lut := mul16LUTs[log_m].LUT
+	lut := &mul16LUTs[log_m]
 
 	for off := 0; off < len(x); off += 64 {
-		for i := 0; i < 32; i++ {
-			lo := y[off+i]
-			hi := y[off+i+32]
-
-			prod :=
-				lut[(lo&15)] ^
-					lut[(lo>>4)+16] ^
-					lut[(hi&15)+32] ^
-					lut[(hi>>4)+48]
+		loA := y[off : off+32]
+		hiA := y[off+32:]
+		hiA = hiA[:len(loA)]
+		for i, lo := range loA {
+			hi := hiA[i]
+			prod := lut.Lo[lo] ^ lut.Hi[hi]
 
 			x[off+i] = byte(prod)
 			x[off+i+32] = byte(prod >> 8)
@@ -843,10 +855,9 @@ func initMul16LUT() {
 
 	// For each log_m multiplicand:
 	for log_m := 0; log_m < order; log_m++ {
-		lut := &mul16LUTs[log_m]
-
+		var tmp [64]ffe
 		for nibble, shift := 0, 0; nibble < 4; {
-			nibble_lut := lut.LUT[nibble*16:]
+			nibble_lut := tmp[nibble*16:]
 
 			for xnibble := 0; xnibble < 16; xnibble++ {
 				prod := mulLog(ffe(xnibble<<shift), ffe(log_m))
 				nibble_lut[xnibble] = prod
 			}
 			nibble++
 			shift += 4
 		}
+		lut := &mul16LUTs[log_m]
+		for i := range lut.Lo[:] {
+			lut.Lo[i] = tmp[i&15] ^ tmp[((i>>4)+16)]
+			lut.Hi[i] = tmp[((i&15)+32)] ^ tmp[((i>>4)+48)]
+		}
+	}
+	if cpuid.CPU.Has(cpuid.SSSE3) || cpuid.CPU.Has(cpuid.AVX2) || cpuid.CPU.Has(cpuid.AVX512F) {
+		multiply256LUT = &[order][16 * 8]byte{}
+
+		for logM := range multiply256LUT[:] {
+			// For each 4 bits of the finite field width in bits:
+			shift := 0
+			for i := 0; i < 4; i++ {
+				// Construct 16 entry LUT for PSHUFB
+				prodLo := multiply256LUT[logM][i*16 : i*16+16]
+				prodHi := multiply256LUT[logM][4*16+i*16 : 4*16+i*16+16]
+				for x := range prodLo[:] {
+					prod := mulLog(ffe(x<<shift), ffe(logM))
+					prodLo[x] = byte(prod)
+					prodHi[x] = byte(prod >> 8)
+				}
+				shift += 4
+			}
+		}
 	}
 }
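// The tables built in initMul16LUT above all encode the same product
// y * exp(log_m) in GF(2^16), split by the 4-bit nibbles of y. The removed
// reference code combined four 16-entry lookups per byte pair; mul16LUT.Lo
// and mul16LUT.Hi pre-fold those into two 256-entry lookups, and
// multiply256LUT keeps the four 16-entry nibble tables (low and high product
// bytes separately) in the layout the PSHUFB code paths expect. A
// hypothetical scalar sketch of that relationship (mulLUTExample is not part
// of the patch; lo and hi are the low and high bytes of one 16-bit symbol):
func mulLUTExample(lut *mul16LUT, lo, hi byte) (prodLo, prodHi byte) {
	// Equivalent to the four 16-entry lookups in the removed code:
	//   t[lo&15] ^ t[(lo>>4)+16] ^ t[(hi&15)+32] ^ t[(hi>>4)+48]
	prod := lut.Lo[lo] ^ lut.Hi[hi]
	return byte(prod), byte(prod >> 8)
}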
diff --git a/reedsolomon_test.go b/reedsolomon_test.go
index 93031ed..6004985 100644
--- a/reedsolomon_test.go
+++ b/reedsolomon_test.go
@@ -14,6 +14,7 @@ import (
 	"math/rand"
 	"os"
 	"runtime"
+	"strconv"
 	"testing"
 )
 
@@ -192,7 +193,7 @@ func TestEncoding(t *testing.T) {
 var testSizes = [][2]int{
 	{1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0},
 	{1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20},
-	{256, 1},
+	{256, 20}, {500, 300}, {2945, 129},
 }
 var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055}
 var testDataSizesShort = []int{10, 10001, 100003}
@@ -208,6 +209,9 @@ func testEncoding(t *testing.T, o ...Option) {
 		}
 		for _, perShard := range sz {
 			if data+parity > 256 {
+				if perShard > 1000 {
+					t.Skip("long tests not needed. Not length sensitive")
+				}
 				// Round up to 64 bytes.
 				perShard = (perShard + 63) &^ 63
 			}
@@ -1004,6 +1008,22 @@ func BenchmarkEncode2x1x1M(b *testing.B) {
 	benchmarkEncode(b, 2, 1, 1024*1024)
 }
 
+// Benchmark 800 data slices with 200 parity slices
+func BenchmarkEncode800x200(b *testing.B) {
+	for size := 64; size <= 1<<20; size *= 4 {
+		b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
+			benchmarkEncode(b, 800, 200, size)
+		})
+	}
+}
+
+func BenchmarkEncodeLeopard(b *testing.B) {
+	size := (64 << 20) / 800 / 64 * 64
+	b.Run(strconv.Itoa(size), func(b *testing.B) {
+		benchmarkEncode(b, 800, 200, size)
+	})
+}
+
 func BenchmarkEncode10x2x10000(b *testing.B) {
 	benchmarkEncode(b, 10, 2, 10000)
 }
@@ -1097,6 +1117,15 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) {
 	}
 }
 
+// Benchmark 800 data slices with 200 parity slices
+func BenchmarkVerify800x200(b *testing.B) {
+	for size := 64; size <= 1<<20; size *= 4 {
+		b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
+			benchmarkVerify(b, 800, 200, size)
+		})
+	}
+}
+
 // Benchmark 10 data slices with 2 parity slices holding 10000 bytes each
 func BenchmarkVerify10x2x10000(b *testing.B) {
 	benchmarkVerify(b, 10, 2, 10000)
 }
@@ -1177,6 +1206,15 @@ func BenchmarkReconstruct10x2x10000(b *testing.B) {
 	benchmarkReconstruct(b, 10, 2, 10000)
 }
 
+// Benchmark 800 data slices with 200 parity slices
+func BenchmarkReconstruct800x200(b *testing.B) {
+	for size := 64; size <= 1<<20; size *= 4 {
+		b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
+			benchmarkReconstruct(b, 800, 200, size)
+		})
+	}
+}
+
 // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
 func BenchmarkReconstruct50x5x50000(b *testing.B) {
 	benchmarkReconstruct(b, 50, 5, 100000)
 }
@@ -1252,6 +1290,15 @@ func BenchmarkReconstructData10x2x10000(b *testing.B) {
 	benchmarkReconstructData(b, 10, 2, 10000)
 }
 
+// Benchmark 800 data slices with 200 parity slices
+func BenchmarkReconstructData800x200(b *testing.B) {
+	for size := 64; size <= 1<<20; size *= 4 {
+		b.Run(fmt.Sprintf("%v", size), func(b *testing.B) {
+			benchmarkReconstructData(b, 800, 200, size)
+		})
+	}
+}
+
 // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each
 func BenchmarkReconstructData50x5x50000(b *testing.B) {
 	benchmarkReconstructData(b, 50, 5, 100000)