diff --git a/_gen/gen.go b/_gen/gen.go index 4755c72..36709e8 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -2,7 +2,8 @@ // +build generate //go:generate go run gen.go -out ../galois_gen_amd64.s -stubs ../galois_gen_amd64.go -pkg=reedsolomon -//go:generate gofmt -w ../galois_gen_switch_amd64.go +//go:generate go fmt ../galois_gen_switch_amd64.go +//go:generate go fmt ../galois_gen_amd64.go package main @@ -36,14 +37,15 @@ func main() { Constraint(buildtags.Not("nogen").ToConstraint()) Constraint(buildtags.Term("gc").ToConstraint()) - const perLoopBits = 5 + const perLoopBits = 6 const perLoop = 1 << perLoopBits for i := 1; i <= inputMax; i++ { for j := 1; j <= outputMax; j++ { - //genMulAvx2(fmt.Sprintf("mulAvxTwoXor_%dx%d", i, j), i, j, true) genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%d", i, j), i, j, false) genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64", i, j), i, j, false) + genMulAvx2(fmt.Sprintf("mulAvxTwo_%dx%dXor", i, j), i, j, true) + genMulAvx2Sixty64(fmt.Sprintf("mulAvxTwo_%dx%d_64Xor", i, j), i, j, true) } } f, err := os.Create("../galois_gen_switch_amd64.go") @@ -62,19 +64,26 @@ func main() { package reedsolomon -import "fmt" +import ( + "fmt" +) `) - w.WriteString("const avx2CodeGen = true\n") - w.WriteString(fmt.Sprintf("const maxAvx2Inputs = %d\nconst maxAvx2Outputs = %d\n", inputMax, outputMax)) + w.WriteString(fmt.Sprintf(`const ( +avx2CodeGen = true +maxAvx2Inputs = %d +maxAvx2Outputs = %d +minAvx2Size = %d +avxSizeMask = maxInt - (minAvx2Size-1) +)`, inputMax, outputMax, perLoop)) w.WriteString(` func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop-start + n := (stop-start) & avxSizeMask + `) - w.WriteString(fmt.Sprintf("n = (n>>%d)<<%d\n\n", perLoopBits, perLoopBits)) w.WriteString(`switch len(in) { `) for in, defs := range switchDefs[:] { @@ -88,6 +97,25 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { w.WriteString(`} panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop-start) & avxSizeMask + +`) + + w.WriteString(`switch len(in) { +`) + for in, defs := range switchDefsX[:] { + w.WriteString(fmt.Sprintf(" case %d:\n switch len(out) {\n", in+1)) + for out, def := range defs[:] { + w.WriteString(fmt.Sprintf(" case %d:\n", out+1)) + w.WriteString(def) + } + w.WriteString("}\n") + } + w.WriteString(`} + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} `) Generate() } @@ -129,12 +157,21 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { } } + x := "" + if xor { + x = "Xor" + } + TEXT(name, attr.NOSPLIT, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) // SWITCH DEFINITION: - s := fmt.Sprintf(" mulAvxTwo_%dx%d(matrix, in, out, start, n)\n", inputs, outputs) + s := fmt.Sprintf(" mulAvxTwo_%dx%d%s(matrix, in, out, start, n)\n", inputs, outputs, x) s += fmt.Sprintf("\t\t\t\treturn n\n") - switchDefs[inputs-1][outputs-1] = s + if xor { + switchDefsX[inputs-1][outputs-1] = s + } else { + switchDefs[inputs-1][outputs-1] = s + } if loadNone { Comment("Loading no tables to registers") @@ -197,7 +234,6 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { if err != nil { panic(err) } - outBase := addr.Addr outSlicePtr := GP64() MOVQ(addr.Addr, outSlicePtr) for i := range dst { @@ -241,13 +277,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { SHRQ(U8(perLoopBits), length) } Label(name + "_loop") - if xor { + + 
// Load data before loop or during first iteration? + // No clear winner. + preloadInput := xor && false + if preloadInput { Commentf("Load %d outputs", outputs) - } else { - Commentf("Clear %d outputs", outputs) - } - for i := range dst { - if xor { + for i := range dst { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) if prefetchDst > 0 { @@ -256,13 +292,11 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { continue } ptr := GP64() - MOVQ(outBase, ptr) + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } - } else { - VPXOR(dst[i], dst[i], dst[i]) } } @@ -279,6 +313,22 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { VPAND(lowMask, inLow, inLow) VPAND(lowMask, inHigh, inHigh) for j := range dst { + //Commentf(" xor:%v i: %v", xor, i) + if !preloadInput && xor && i == 0 { + if regDst { + VMOVDQU(Mem{Base: dstPtr[j]}, dst[j]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: dstPtr[j], Disp: prefetchDst}) + } + } else { + ptr := GP64() + MOVQ(Mem{Base: outSlicePtr, Disp: j * 24}, ptr) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[j]) + if prefetchDst > 0 { + PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) + } + } + } if loadNone { VMOVDQU(Mem{Base: matrixBase, Disp: 64 * (i*outputs + j)}, lookLow) VMOVDQU(Mem{Base: matrixBase, Disp: 32 + 64*(i*outputs+j)}, lookHigh) @@ -288,8 +338,13 @@ func genMulAvx2(name string, inputs int, outputs int, xor bool) { VPSHUFB(inLow, inLo[i*outputs+j], lookLow) VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) } - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow, dst[j], dst[j]) + if i == 0 && !xor { + // We don't have any existing data, write directly. + VPXOR(lookLow, lookHigh, dst[j]) + } else { + VPXOR(lookLow, lookHigh, lookLow) + VPXOR(lookLow, dst[j], dst[j]) + } } } Commentf("Store %d outputs", outputs) @@ -340,35 +395,42 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { // Load shuffle masks on every use. var loadNone bool // Use registers for destination registers. - var regDst = false + var regDst = true var reloadLength = false // lo, hi, 1 in, 1 out, 2 tmp, 1 mask - est := total*2 + outputs + 5 + est := total*4 + outputs + 7 if outputs == 1 { // We don't need to keep a copy of the input if only 1 output. est -= 2 } - if true || est > 16 { + if est > 16 { loadNone = true // We run out of GP registers first, now. if inputs+outputs > 13 { regDst = false } // Save one register by reloading length. 
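For reference, a minimal pure-Go sketch of the contract the generated kernels follow (names such as mulLow, mulHigh and mulSliceRef are illustrative, not identifiers from the package): each kernel multiplies every input byte by a fixed coefficient using two 16-entry nibble tables, the scalar analogue of the VPSHUFB/VPXOR sequence above. The plain variants now write the first product directly into the destination register instead of clearing it first, while the new Xor variants load the existing output and accumulate into it.

// Scalar sketch only; assumes mulLow[n] = mul(c, n) and mulHigh[n] = mul(c, n<<4)
// in GF(2^8) for a fixed coefficient c.
func mulSliceRef(mulLow, mulHigh *[16]byte, in, out []byte, xor bool) {
	for i, b := range in {
		p := mulLow[b&0x0f] ^ mulHigh[b>>4] // low/high nibble lookups combined with XOR
		if xor {
			out[i] ^= p // ...Xor kernels: add into the existing output
		} else {
			out[i] = p // plain kernels: overwrite, no clear step needed
		}
	}
}
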
- if true || inputs+outputs > 12 && regDst { + if inputs+outputs > 12 && regDst { reloadLength = true } } TEXT(name, 0, fmt.Sprintf("func(matrix []byte, in [][]byte, out [][]byte, start, n int)")) - + x := "" + if xor { + x = "Xor" + } // SWITCH DEFINITION: - s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) - s += fmt.Sprintf(" mulAvxTwo_%dx%d_64(matrix, in, out, start, n)\n", inputs, outputs) + //s := fmt.Sprintf("n = (n>>%d)<<%d\n", perLoopBits, perLoopBits) + s := fmt.Sprintf(" mulAvxTwo_%dx%d_64%s(matrix, in, out, start, n)\n", inputs, outputs, x) s += fmt.Sprintf("\t\t\t\treturn n\n") - switchDefs[inputs-1][outputs-1] = s + if xor { + switchDefsX[inputs-1][outputs-1] = s + } else { + switchDefs[inputs-1][outputs-1] = s + } if loadNone { Comment("Loading no tables to registers") @@ -474,33 +536,31 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VPBROADCASTB(lowMask.AsX(), lowMask) if reloadLength { + Commentf("Reload length to save a register") length = Load(Param("n"), GP64()) SHRQ(U8(perLoopBits), length) } Label(name + "_loop") + if xor { Commentf("Load %d outputs", outputs) - } else { - Commentf("Clear %d outputs", outputs) - } - for i := range dst { - if xor { + for i := range dst { if regDst { VMOVDQU(Mem{Base: dstPtr[i]}, dst[i]) + VMOVDQU(Mem{Base: dstPtr[i], Disp: 32}, dst2[i]) if prefetchDst > 0 { PREFETCHT0(Mem{Base: dstPtr[i], Disp: prefetchDst}) } continue } ptr := GP64() - MOVQ(outBase, ptr) + MOVQ(Mem{Base: outSlicePtr, Disp: i * 24}, ptr) VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1}, dst[i]) + VMOVDQU(Mem{Base: ptr, Index: offset, Scale: 1, Disp: 32}, dst2[i]) + if prefetchDst > 0 { PREFETCHT0(Mem{Base: ptr, Disp: prefetchDst, Index: offset, Scale: 1}) } - } else { - VPXOR(dst[i], dst[i], dst[i]) - VPXOR(dst2[i], dst2[i], dst2[i]) } } @@ -536,10 +596,16 @@ func genMulAvx2Sixty64(name string, inputs int, outputs int, xor bool) { VPSHUFB(inHigh, inHi[i*outputs+j], lookHigh) VPSHUFB(in2High, inHi[i*outputs+j], lookHigh2) } - VPXOR(lookLow, lookHigh, lookLow) - VPXOR(lookLow2, lookHigh2, lookLow2) - VPXOR(lookLow, dst[j], dst[j]) - VPXOR(lookLow2, dst2[j], dst2[j]) + if i == 0 && !xor { + // We don't have any existing data, write directly. 
+ VPXOR(lookLow, lookHigh, dst[j]) + VPXOR(lookLow2, lookHigh2, dst2[j]) + } else { + VPXOR(lookLow, lookHigh, lookLow) + VPXOR(lookLow2, lookHigh2, lookLow2) + VPXOR(lookLow, dst[j], dst[j]) + VPXOR(lookLow2, dst2[j], dst2[j]) + } } } Commentf("Store %d outputs", outputs) diff --git a/galois.go b/galois.go index bc4de4f..30e9e03 100644 --- a/galois.go +++ b/galois.go @@ -901,7 +901,7 @@ func galExp(a byte, n int) byte { return expTable[logResult] } -func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte { +func genAvx2Matrix(matrixRows [][]byte, inputs, inIdx, outputs int, dst []byte) []byte { if !avx2CodeGen { panic("codegen not enabled") } @@ -915,7 +915,7 @@ func genAvx2Matrix(matrixRows [][]byte, inputs, outputs int, dst []byte) []byte dst = dst[:wantBytes] } for i, row := range matrixRows[:outputs] { - for j, idx := range row[:inputs] { + for j, idx := range row[inIdx : inIdx+inputs] { dstIdx := (j*outputs + i) * 64 dstPart := dst[dstIdx:] dstPart = dstPart[:64] diff --git a/galoisAvx512_amd64.go b/galoisAvx512_amd64.go index 0f240b7..79207e6 100644 --- a/galoisAvx512_amd64.go +++ b/galoisAvx512_amd64.go @@ -225,8 +225,9 @@ func galMulAVX512LastInput(inputOffset int, inputEnd int, outputOffset int, outp // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { // Process using no goroutines + outputCount := len(outputs) start, end := 0, r.o.perRound if end > byteCount { end = byteCount @@ -272,7 +273,8 @@ func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, // Perform the same as codeSomeShards, but taking advantage of // AVX512 parallelism for up to 4x faster execution as compared to AVX2 -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { + outputCount := len(outputs) var wg sync.WaitGroup do := byteCount / r.o.maxGoroutines if do < r.o.minSplitSize { diff --git a/galoisAvx512_amd64_test.go b/galoisAvx512_amd64_test.go index 18acec8..6792e98 100644 --- a/galoisAvx512_amd64_test.go +++ b/galoisAvx512_amd64_test.go @@ -331,9 +331,9 @@ func testCodeSomeShardsAvx512WithLength(t *testing.T, ds, ps, l int, parallel bo } if parallel { - r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShardsAvx512P(r.parity, shards[:r.DataShards], shards[r.DataShards:], len(shards[0])) } else { - r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShardsAvx512(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) } correct, _ := r.Verify(shards) diff --git a/galois_amd64.go b/galois_amd64.go index 03754d2..d722e31 100644 --- a/galois_amd64.go +++ b/galois_amd64.go @@ -107,6 +107,9 @@ func galMulSliceXor(c byte, in, out []byte, o *options) { in = in[done:] out = out[done:] } + if len(in) == 0 { + return + } out = out[:len(in)] mt := mulTable[c][:256] for i := range in { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 15d4522..817c7ea 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -15,6 +15,14 @@ 
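For reference, a small self-contained illustration (hypothetical shard counts, not from the patch) of the table layout genAvx2Matrix now produces: each coefficient matrixRows[i][inIdx+j] expands into 64 bytes of nibble tables at offset (j*outputs+i)*64, and the new inIdx argument offsets the columns, presumably so the flattened tables can be rebuilt for one batch of input shards at a time.

package main

import "fmt"

func main() {
	// Hypothetical shape: 2 output rows, a batch of 4 inputs starting at matrix column 6.
	const inputs, outputs, inIdx = 4, 2, 6
	for i := 0; i < outputs; i++ { // output row
		for j := 0; j < inputs; j++ { // input column within the batch
			off := (j*outputs + i) * 64 // 32 bytes low-nibble table + 32 bytes high-nibble table
			fmt.Printf("matrixRows[%d][%d] -> dst[%d:%d]\n", i, inIdx+j, off, off+64)
		}
	}
}
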
func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x1Xor takes 1 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x1_64Xor takes 1 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x2 takes 1 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -25,6 +33,14 @@ func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x2Xor takes 1 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x2_64Xor takes 1 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x3 takes 1 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -35,41 +51,77 @@ func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_1x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x3Xor takes 1 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_1x3_64Xor takes 1 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x4 takes 1 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x4Xor takes 1 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x5 takes 1 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x5Xor takes 1 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x6 takes 1 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x6Xor takes 1 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x7 takes 1 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x7Xor takes 1 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x8 takes 1 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x8Xor takes 1 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x9 takes 1 inputs and produces 9 outputs. 
// The output is initialized to 0. //go:noescape func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x9Xor takes 1 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_1x10 takes 1 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_1x10Xor takes 1 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x1 takes 2 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -80,6 +132,14 @@ func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x1Xor takes 2 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x1_64Xor takes 2 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x2 takes 2 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -90,6 +150,14 @@ func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x2Xor takes 2 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x2_64Xor takes 2 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x3 takes 2 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -100,41 +168,77 @@ func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_2x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x3Xor takes 2 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_2x3_64Xor takes 2 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x4 takes 2 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x4Xor takes 2 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x5 takes 2 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x5Xor takes 2 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x6 takes 2 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x6Xor takes 2 inputs and produces 6 outputs. 
+//go:noescape +func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x7 takes 2 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x7Xor takes 2 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x8 takes 2 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x8Xor takes 2 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x9 takes 2 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x9Xor takes 2 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_2x10 takes 2 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_2x10Xor takes 2 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x1 takes 3 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -145,6 +249,14 @@ func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x1Xor takes 3 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x1_64Xor takes 3 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x2 takes 3 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -155,6 +267,14 @@ func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x2Xor takes 3 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x2_64Xor takes 3 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x3 takes 3 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -165,41 +285,77 @@ func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_3x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x3Xor takes 3 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_3x3_64Xor takes 3 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x4 takes 3 inputs and produces 4 outputs. // The output is initialized to 0. 
//go:noescape func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x4Xor takes 3 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x5 takes 3 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x5Xor takes 3 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x6 takes 3 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x6Xor takes 3 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x7 takes 3 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x7Xor takes 3 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x8 takes 3 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x8Xor takes 3 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x9 takes 3 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x9Xor takes 3 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_3x10 takes 3 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_3x10Xor takes 3 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x1 takes 4 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -210,6 +366,14 @@ func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x1Xor takes 4 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x1_64Xor takes 4 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x2 takes 4 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -220,6 +384,14 @@ func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x2Xor takes 4 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x2_64Xor takes 4 inputs and produces 2 outputs. 
+//go:noescape +func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x3 takes 4 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -230,41 +402,77 @@ func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_4x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x3Xor takes 4 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_4x3_64Xor takes 4 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x4 takes 4 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x4Xor takes 4 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x5 takes 4 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x5Xor takes 4 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x6 takes 4 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x6Xor takes 4 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_4x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x7 takes 4 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x7Xor takes 4 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x8 takes 4 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x8Xor takes 4 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x9 takes 4 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x9Xor takes 4 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_4x10 takes 4 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_4x10Xor takes 4 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x1 takes 5 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -275,6 +483,14 @@ func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x1Xor takes 5 inputs and produces 1 outputs. 
+//go:noescape +func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x1_64Xor takes 5 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x2 takes 5 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -285,6 +501,14 @@ func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x2Xor takes 5 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x2_64Xor takes 5 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x3 takes 5 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -295,41 +519,77 @@ func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_5x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x3Xor takes 5 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_5x3_64Xor takes 5 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x4 takes 5 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x4Xor takes 5 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x5 takes 5 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x5Xor takes 5 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x6 takes 5 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x6Xor takes 5 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x7 takes 5 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x7Xor takes 5 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x8 takes 5 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x8Xor takes 5 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x9 takes 5 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x9Xor takes 5 inputs and produces 9 outputs. 
+//go:noescape +func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_5x10 takes 5 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_5x10Xor takes 5 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x1 takes 6 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -340,6 +600,14 @@ func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x1Xor takes 6 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x1_64Xor takes 6 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x2 takes 6 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -350,6 +618,14 @@ func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x2Xor takes 6 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x2_64Xor takes 6 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x3 takes 6 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -360,41 +636,77 @@ func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_6x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x3Xor takes 6 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_6x3_64Xor takes 6 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x4 takes 6 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x4Xor takes 6 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x5 takes 6 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x5Xor takes 6 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x6 takes 6 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x6Xor takes 6 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x7 takes 6 inputs and produces 7 outputs. // The output is initialized to 0. 
//go:noescape func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x7Xor takes 6 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x8 takes 6 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x8Xor takes 6 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x9 takes 6 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x9Xor takes 6 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_6x10 takes 6 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_6x10Xor takes 6 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x1 takes 7 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -405,6 +717,14 @@ func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x1Xor takes 7 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x1_64Xor takes 7 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x2 takes 7 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -415,6 +735,14 @@ func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x2Xor takes 7 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x2_64Xor takes 7 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x3 takes 7 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -425,41 +753,77 @@ func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_7x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x3Xor takes 7 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_7x3_64Xor takes 7 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x4 takes 7 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x4Xor takes 7 inputs and produces 4 outputs. 
+//go:noescape +func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x5 takes 7 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x5Xor takes 7 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x6 takes 7 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x6Xor takes 7 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x7 takes 7 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x7Xor takes 7 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x8 takes 7 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x8Xor takes 7 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x9 takes 7 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x9Xor takes 7 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_7x10 takes 7 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_7x10Xor takes 7 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x1 takes 8 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -470,6 +834,14 @@ func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x1Xor takes 8 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x1_64Xor takes 8 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x2 takes 8 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -480,6 +852,14 @@ func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x2Xor takes 8 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x2_64Xor takes 8 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x3 takes 8 inputs and produces 3 outputs. // The output is initialized to 0. 
//go:noescape @@ -490,41 +870,77 @@ func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_8x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x3Xor takes 8 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_8x3_64Xor takes 8 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x4 takes 8 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x4Xor takes 8 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x5 takes 8 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x5Xor takes 8 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x6 takes 8 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x6Xor takes 8 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x7 takes 8 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x7Xor takes 8 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x8 takes 8 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x8Xor takes 8 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x9 takes 8 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x9Xor takes 8 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_8x10 takes 8 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_8x10Xor takes 8 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x1 takes 9 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -535,6 +951,14 @@ func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x1Xor takes 9 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x1_64Xor takes 9 inputs and produces 1 outputs. 
+//go:noescape +func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x2 takes 9 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -545,6 +969,14 @@ func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x2Xor takes 9 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x2_64Xor takes 9 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x3 takes 9 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -555,41 +987,77 @@ func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x3Xor takes 9 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_9x3_64Xor takes 9 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x4 takes 9 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x4Xor takes 9 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x5 takes 9 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x5Xor takes 9 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x6 takes 9 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x6Xor takes 9 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x7 takes 9 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x7Xor takes 9 inputs and produces 7 outputs. +//go:noescape +func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x8 takes 9 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x8Xor takes 9 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x9 takes 9 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x9Xor takes 9 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_9x10 takes 9 inputs and produces 10 outputs. // The output is initialized to 0. 
//go:noescape func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_9x10Xor takes 9 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x1 takes 10 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape @@ -600,6 +1068,14 @@ func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x1Xor takes 10 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x1_64Xor takes 10 inputs and produces 1 outputs. +//go:noescape +func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x2 takes 10 inputs and produces 2 outputs. // The output is initialized to 0. //go:noescape @@ -610,6 +1086,14 @@ func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x2_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x2Xor takes 10 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x2_64Xor takes 10 inputs and produces 2 outputs. +//go:noescape +func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x3 takes 10 inputs and produces 3 outputs. // The output is initialized to 0. //go:noescape @@ -620,37 +1104,73 @@ func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) //go:noescape func mulAvxTwo_10x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x3Xor takes 10 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x3_64Xor takes 10 inputs and produces 3 outputs. +//go:noescape +func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x4 takes 10 inputs and produces 4 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x4Xor takes 10 inputs and produces 4 outputs. +//go:noescape +func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x5 takes 10 inputs and produces 5 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x5Xor takes 10 inputs and produces 5 outputs. +//go:noescape +func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x6 takes 10 inputs and produces 6 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x6Xor takes 10 inputs and produces 6 outputs. +//go:noescape +func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x7 takes 10 inputs and produces 7 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x7Xor takes 10 inputs and produces 7 outputs. 
+//go:noescape +func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x8 takes 10 inputs and produces 8 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x8Xor takes 10 inputs and produces 8 outputs. +//go:noescape +func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x9 takes 10 inputs and produces 9 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// mulAvxTwo_10x9Xor takes 10 inputs and produces 9 outputs. +//go:noescape +func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) + // mulAvxTwo_10x10 takes 10 inputs and produces 10 outputs. // The output is initialized to 0. //go:noescape func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) + +// mulAvxTwo_10x10Xor takes 10 inputs and produces 10 outputs. +//go:noescape +func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index ab699ac..36e885f 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -36,19 +36,15 @@ TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_1x1_loop: - // Clear 1 outputs - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 1 outputs - VMOVDQU (CX), Y4 + VMOVDQU (CX), Y2 ADDQ $0x20, CX - VPSRLQ $0x04, Y4, Y5 + VPSRLQ $0x04, Y2, Y4 + VPAND Y3, Y2, Y2 VPAND Y3, Y4, Y4 - VPAND Y3, Y5, Y5 - VPSHUFB Y4, Y0, Y4 - VPSHUFB Y5, Y1, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPSHUFB Y2, Y0, Y2 + VPSHUFB Y4, Y1, Y4 + VPXOR Y2, Y4, Y2 // Store 1 outputs VMOVDQU Y2, (DX) @@ -65,68 +61,182 @@ mulAvxTwo_1x1_end: // func mulAvxTwo_1x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 6 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_1x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), SI - SHRQ $0x06, SI + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 mulAvxTwo_1x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VMOVDQU (CX), Y2 + VMOVDQU 
32(CX), Y3 + ADDQ $0x40, CX + VPSRLQ $0x04, Y2, Y6 + VPSRLQ $0x04, Y3, Y5 + VPAND Y4, Y2, Y2 + VPAND Y4, Y3, Y3 + VPAND Y4, Y6, Y6 + VPAND Y4, Y5, Y5 + VPSHUFB Y2, Y0, Y2 + VPSHUFB Y3, Y0, Y3 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y5, Y1, Y5 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 // Store 1 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX // Prepare for next loop - ADDQ $0x40, BX - DECQ SI + DECQ AX JNZ mulAvxTwo_1x1_64_loop VZEROUPPER mulAvxTwo_1x1_64_end: RET +// func mulAvxTwo_1x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 6 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_1x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y4 + ADDQ $0x20, CX + VPSRLQ $0x04, Y4, Y5 + VPAND Y3, Y4, Y4 + VPAND Y3, Y5, Y5 + VMOVDQU (DX), Y2 + VPSHUFB Y4, Y0, Y4 + VPSHUFB Y5, Y1, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 1 outputs + VMOVDQU Y2, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1Xor_loop + VZEROUPPER + +mulAvxTwo_1x1Xor_end: + RET + +// func mulAvxTwo_1x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ out_base+48(FP), DX + MOVQ (DX), DX + MOVQ start+72(FP), BX + + // Add start offset to output + ADDQ BX, DX + + // Add start offset to input + ADDQ BX, CX + MOVQ $0x0000000f, BX + MOVQ BX, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DX), Y2 + VMOVDQU 32(DX), Y3 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + ADDQ $0x40, CX + VPSRLQ $0x04, Y5, Y6 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y5, Y5 + VPAND Y4, Y7, Y7 + VPAND Y4, Y6, Y6 + VPAND Y4, Y8, Y8 + VPSHUFB Y5, Y0, Y5 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y6, Y1, Y6 + VPSHUFB Y8, Y1, Y8 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 1 outputs + VMOVDQU Y2, (DX) + VMOVDQU Y3, 32(DX) + ADDQ $0x40, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_1x1_64Xor_end: + RET + // func mulAvxTwo_1x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 @@ -160,24 +270,18 @@ TEXT ·mulAvxTwo_1x2(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_1x2_loop: - // Clear 2 outputs - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 2 outputs - VMOVDQU (CX), Y9 + VMOVDQU (CX), Y8 ADDQ $0x20, CX - VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y8, Y9 + VPAND 
Y6, Y8, Y8 VPAND Y6, Y9, Y9 - VPAND Y6, Y10, Y10 - VPSHUFB Y9, Y0, Y7 - VPSHUFB Y10, Y1, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 - VPSHUFB Y9, Y2, Y7 - VPSHUFB Y10, Y3, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPSHUFB Y8, Y0, Y5 + VPSHUFB Y9, Y1, Y7 + VPXOR Y5, Y7, Y4 + VPSHUFB Y8, Y2, Y5 + VPSHUFB Y9, Y3, Y7 + VPXOR Y5, Y7, Y5 // Store 2 outputs VMOVDQU Y4, (BX) @@ -197,38 +301,179 @@ mulAvxTwo_1x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 11 YMM used + // Destination kept in GP registers + // Full registers estimated 17 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_1x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X4 + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), SI - SHRQ $0x06, SI mulAvxTwo_1x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y4, Y7, Y7 + VPAND Y4, Y9, Y9 + VPAND Y4, Y8, Y8 + VPAND Y4, Y10, Y10 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y0 + VPXOR Y3, Y5, Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y6 + VPSHUFB Y9, Y2, Y3 + VPSHUFB Y7, Y2, Y2 + VPSHUFB Y10, Y6, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y2, Y6, Y2 + VPXOR Y3, Y5, Y3 + + // Store 2 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2_64_loop + VZEROUPPER + +mulAvxTwo_1x2_64_end: + RET + +// func mulAvxTwo_1x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 11 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + ADDQ SI, DX + + // Add start offset to input + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (CX), Y9 + ADDQ $0x20, CX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (BX), Y4 + VPSHUFB Y9, Y0, Y7 + VPSHUFB Y10, Y1, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (DX), Y5 + VPSHUFB Y9, Y2, Y7 + VPSHUFB Y10, Y3, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 2 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + VMOVDQU Y5, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x2Xor_loop + VZEROUPPER + 
+mulAvxTwo_1x2Xor_end: + RET + +// func mulAvxTwo_1x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + MOVQ $0x0000000f, DI + MOVQ DI, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (BX), Y2 + VMOVDQU 32(BX), Y3 // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -257,20 +502,19 @@ mulAvxTwo_1x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) - MOVQ 24(DX), DI - VMOVDQU Y2, (DI)(BX*1) - VMOVDQU Y3, 32(DI)(BX*1) + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (BX) + VMOVDQU Y3, 32(BX) + ADDQ $0x40, BX // Prepare for next loop - ADDQ $0x40, BX - DECQ SI - JNZ mulAvxTwo_1x2_64_loop + DECQ AX + JNZ mulAvxTwo_1x2_64Xor_loop VZEROUPPER -mulAvxTwo_1x2_64_end: +mulAvxTwo_1x2_64Xor_end: RET // func mulAvxTwo_1x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -310,29 +554,21 @@ TEXT ·mulAvxTwo_1x3(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_1x3_loop: - // Clear 3 outputs - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 3 outputs - VMOVDQU (CX), Y12 + VMOVDQU (CX), Y11 ADDQ $0x20, CX - VPSRLQ $0x04, Y12, Y13 + VPSRLQ $0x04, Y11, Y12 + VPAND Y9, Y11, Y11 VPAND Y9, Y12, Y12 - VPAND Y9, Y13, Y13 - VPSHUFB Y12, Y0, Y10 - VPSHUFB Y13, Y1, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 - VPSHUFB Y12, Y2, Y10 - VPSHUFB Y13, Y3, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 - VPSHUFB Y12, Y4, Y10 - VPSHUFB Y13, Y5, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPSHUFB Y11, Y0, Y8 + VPSHUFB Y12, Y1, Y10 + VPXOR Y8, Y10, Y6 + VPSHUFB Y11, Y2, Y8 + VPSHUFB Y12, Y3, Y10 + VPXOR Y8, Y10, Y7 + VPSHUFB Y11, Y4, Y8 + VPSHUFB Y12, Y5, Y10 + VPXOR Y8, Y10, Y8 // Store 3 outputs VMOVDQU Y6, (BX) @@ -354,40 +590,207 @@ mulAvxTwo_1x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 14 YMM used + // Destination kept in GP registers + // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_1x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), AX - MOVQ out_base+48(FP), DX - MOVQ out_base+48(FP), DX - MOVQ start+72(FP), BX + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX // Add start offset to input - ADDQ BX, AX - MOVQ $0x0000000f, SI - MOVQ SI, X6 + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), SI - SHRQ $0x06, SI 
mulAvxTwo_1x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y6, Y9, Y9 + VPAND Y6, Y11, Y11 + VPAND Y6, Y10, Y10 + VPAND Y6, Y12, Y12 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y0 + VPXOR Y5, Y7, Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y2 + VPXOR Y5, Y7, Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y8 + VPSHUFB Y11, Y4, Y5 + VPSHUFB Y9, Y4, Y4 + VPSHUFB Y12, Y8, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y4, Y8, Y4 + VPXOR Y5, Y7, Y5 + + // Store 3 outputs + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3_64_loop + VZEROUPPER + +mulAvxTwo_1x3_64_end: + RET + +// func mulAvxTwo_1x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), CX + MOVQ out_base+48(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, BX + ADDQ DI, SI + ADDQ DI, DX + + // Add start offset to input + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_1x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (BX), Y6 + VPSHUFB Y12, Y0, Y10 + VPSHUFB Y13, Y1, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (SI), Y7 + VPSHUFB Y12, Y2, Y10 + VPSHUFB Y13, Y3, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (DX), Y8 + VPSHUFB Y12, Y4, Y10 + VPSHUFB Y13, Y5, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 3 outputs + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + VMOVDQU Y8, (DX) + ADDQ $0x20, DX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x3Xor_loop + VZEROUPPER + +mulAvxTwo_1x3Xor_end: + RET + +// func mulAvxTwo_1x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_1x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), BX + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, SI + ADDQ R8, DI + ADDQ R8, BX + + // Add start offset to input + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X6 + VPBROADCASTB X6, Y6 + 
+mulAvxTwo_1x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (SI), Y0 + VMOVDQU 32(SI), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -426,23 +829,22 @@ mulAvxTwo_1x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (DX), DI - VMOVDQU Y0, (DI)(BX*1) - VMOVDQU Y1, 32(DI)(BX*1) - MOVQ 24(DX), DI - VMOVDQU Y2, (DI)(BX*1) - VMOVDQU Y3, 32(DI)(BX*1) - MOVQ 48(DX), DI - VMOVDQU Y4, (DI)(BX*1) - VMOVDQU Y5, 32(DI)(BX*1) + VMOVDQU Y0, (SI) + VMOVDQU Y1, 32(SI) + ADDQ $0x40, SI + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX // Prepare for next loop - ADDQ $0x40, BX - DECQ SI - JNZ mulAvxTwo_1x3_64_loop + DECQ AX + JNZ mulAvxTwo_1x3_64Xor_loop VZEROUPPER -mulAvxTwo_1x3_64_end: +mulAvxTwo_1x3_64Xor_end: RET // func mulAvxTwo_1x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -478,36 +880,112 @@ TEXT ·mulAvxTwo_1x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_1x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y4, Y6, Y6 + VPAND Y4, Y7, Y7 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y0 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y1 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y2 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y7, Y5, Y5 + VPXOR Y3, Y5, Y3 + // Store 4 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x4_loop + VZEROUPPER + +mulAvxTwo_1x4_end: + RET + +// func mulAvxTwo_1x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 17 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), BX + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, BX + + // Add start offset to input + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_1x4Xor_loop: // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX VPSRLQ $0x04, Y7, Y8 VPAND Y4, Y7, Y7 VPAND Y4, Y8, Y8 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y5 VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 VPXOR Y5, Y6, Y5 VPXOR Y5, Y2, Y2 + VMOVDQU (BX), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 @@ 
-527,10 +1005,10 @@ mulAvxTwo_1x4_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x4_loop + JNZ mulAvxTwo_1x4Xor_loop VZEROUPPER -mulAvxTwo_1x4_end: +mulAvxTwo_1x4Xor_end: RET // func mulAvxTwo_1x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -568,43 +1046,128 @@ TEXT ·mulAvxTwo_1x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_1x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y5, Y7, Y7 + VPAND Y5, Y8, Y8 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y0 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y1 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y2 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y3 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y4, Y4 + VPSHUFB Y8, Y6, Y6 + VPXOR Y4, Y6, Y4 + // Store 5 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x5_loop + VZEROUPPER + +mulAvxTwo_1x5_end: + RET + +// func mulAvxTwo_1x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), BX + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, BX + + // Add start offset to input + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_1x5Xor_loop: // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX VPSRLQ $0x04, Y8, Y9 VPAND Y5, Y8, Y8 VPAND Y5, Y9, Y9 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y6 VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 VPXOR Y6, Y7, Y6 VPXOR Y6, Y3, Y3 + VMOVDQU (BX), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 @@ -626,10 +1189,10 @@ mulAvxTwo_1x5_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x5_loop + JNZ mulAvxTwo_1x5Xor_loop VZEROUPPER -mulAvxTwo_1x5_end: +mulAvxTwo_1x5Xor_end: RET // func mulAvxTwo_1x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -669,50 +1232,144 @@ TEXT ·mulAvxTwo_1x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_1x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 32 bytes 
from input 0 to 6 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y9, Y9 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y0 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y1 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y2 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y3 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y4 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y5, Y5 + VPSHUFB Y9, Y7, Y7 + VPXOR Y5, Y7, Y5 + // Store 6 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x6_loop + VZEROUPPER + +mulAvxTwo_1x6_end: + RET + +// func mulAvxTwo_1x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), BX + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, BX + + // Add start offset to input + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_1x6Xor_loop: // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX VPSRLQ $0x04, Y9, Y10 VPAND Y6, Y9, Y9 VPAND Y6, Y10, Y10 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 VPXOR Y7, Y8, Y7 VPXOR Y7, Y4, Y4 + VMOVDQU (BX), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 @@ -736,10 +1393,10 @@ mulAvxTwo_1x6_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x6_loop + JNZ mulAvxTwo_1x6Xor_loop VZEROUPPER -mulAvxTwo_1x6_end: +mulAvxTwo_1x6Xor_end: RET // func mulAvxTwo_1x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -781,57 +1438,160 @@ TEXT ·mulAvxTwo_1x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_1x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y7, Y9, Y9 + VPAND Y7, Y10, Y10 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, 
Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y0 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y1 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y2 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y3 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y4 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y5 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y6, Y6 + VPSHUFB Y10, Y8, Y8 + VPXOR Y6, Y8, Y6 + // Store 7 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x7_loop + VZEROUPPER + +mulAvxTwo_1x7_end: + RET + +// func mulAvxTwo_1x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), BX + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, BX + + // Add start offset to input + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_1x7Xor_loop: // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX VPSRLQ $0x04, Y10, Y11 VPAND Y7, Y10, Y10 VPAND Y7, Y11, Y11 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y8 VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 VPXOR Y8, Y9, Y8 VPXOR Y8, Y5, Y5 + VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 @@ -857,10 +1617,10 @@ mulAvxTwo_1x7_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x7_loop + JNZ mulAvxTwo_1x7Xor_loop VZEROUPPER -mulAvxTwo_1x7_end: +mulAvxTwo_1x7Xor_end: RET // func mulAvxTwo_1x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -904,64 +1664,176 @@ TEXT ·mulAvxTwo_1x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_1x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 + // Load and process 32 bytes from 
input 0 to 8 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y8, Y10, Y10 + VPAND Y8, Y11, Y11 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y0 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y1 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y2 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y3 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y4 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y5 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y6 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y7, Y7 + VPSHUFB Y11, Y9, Y9 + VPXOR Y7, Y9, Y7 + // Store 8 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x8_loop + VZEROUPPER + +mulAvxTwo_1x8_end: + RET + +// func mulAvxTwo_1x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 29 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), BX + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, BX + + // Add start offset to input + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_1x8Xor_loop: // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX VPSRLQ $0x04, Y11, Y12 VPAND Y8, Y11, Y11 VPAND Y8, Y12, Y12 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y9 VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y5, Y5 + VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 VPXOR Y9, Y10, Y9 VPXOR Y9, Y6, Y6 + VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 @@ -989,10 +1861,10 @@ mulAvxTwo_1x8_loop: // Prepare 
for next loop DECQ AX - JNZ mulAvxTwo_1x8_loop + JNZ mulAvxTwo_1x8Xor_loop VZEROUPPER -mulAvxTwo_1x8_end: +mulAvxTwo_1x8Xor_end: RET // func mulAvxTwo_1x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1038,71 +1910,192 @@ TEXT ·mulAvxTwo_1x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_1x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y9, Y11, Y11 + VPAND Y9, Y12, Y12 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y0 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y1 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y2 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y3 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y4 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y5 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y6 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y7 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y8, Y8 + VPSHUFB Y12, Y10, Y10 + VPXOR Y8, Y10, Y8 + // Store 9 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x9_loop + VZEROUPPER + +mulAvxTwo_1x9_end: + RET + +// func mulAvxTwo_1x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), BX + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, BX + + // Add start offset to input + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_1x9Xor_loop: // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX VPSRLQ $0x04, Y12, Y13 VPAND Y9, Y12, Y12 VPAND Y9, Y13, Y13 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y10 VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y10 
VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y5, Y5 + VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y6, Y6 + VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 VPXOR Y10, Y11, Y10 VPXOR Y10, Y7, Y7 + VMOVDQU (BX), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 @@ -1132,10 +2125,10 @@ mulAvxTwo_1x9_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x9_loop + JNZ mulAvxTwo_1x9Xor_loop VZEROUPPER -mulAvxTwo_1x9_end: +mulAvxTwo_1x9Xor_end: RET // func mulAvxTwo_1x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1183,78 +2176,208 @@ TEXT ·mulAvxTwo_1x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_1x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y10, Y12, Y12 + VPAND Y10, Y13, Y13 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y0 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y1 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y2 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y3 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y4 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y5 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y6 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y7 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y8 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y9, Y9 + VPSHUFB Y13, Y11, Y11 + VPXOR Y9, Y11, Y9 + // Store 10 outputs + VMOVDQU Y0, (SI) + ADDQ $0x20, SI + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + VMOVDQU Y5, (R11) + ADDQ $0x20, R11 + VMOVDQU Y6, (R12) + ADDQ $0x20, R12 + VMOVDQU Y7, (R13) + ADDQ $0x20, R13 + VMOVDQU Y8, (R14) + ADDQ $0x20, R14 + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_1x10_loop + VZEROUPPER + +mulAvxTwo_1x10_end: + RET + +// func mulAvxTwo_1x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_1x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ 
n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_1x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), DX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), DI + MOVQ 48(BX), R8 + MOVQ 72(BX), R9 + MOVQ 96(BX), R10 + MOVQ 120(BX), R11 + MOVQ 144(BX), R12 + MOVQ 168(BX), R13 + MOVQ 192(BX), R14 + MOVQ 216(BX), BX + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, BX + + // Add start offset to input + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_1x10Xor_loop: // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX VPSRLQ $0x04, Y13, Y14 VPAND Y10, Y13, Y13 VPAND Y10, Y14, Y14 + VMOVDQU (SI), Y0 VMOVDQU (CX), Y11 VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 + VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -1286,10 +2409,10 @@ mulAvxTwo_1x10_loop: // Prepare for next loop DECQ AX - JNZ mulAvxTwo_1x10_loop + JNZ mulAvxTwo_1x10Xor_loop VZEROUPPER -mulAvxTwo_1x10_end: +mulAvxTwo_1x10Xor_end: RET // func mulAvxTwo_2x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -1325,9 +2448,6 @@ TEXT ·mulAvxTwo_2x1(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_2x1_loop: - // Clear 1 outputs - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -1336,8 +2456,7 @@ mulAvxTwo_2x1_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 @@ -1365,91 +2484,243 @@ mulAvxTwo_2x1_end: // func mulAvxTwo_2x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x1_64(SB), $0-88 - // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 8 YMM used - MOVQ n+80(FP), AX - MOVQ matrix_base+0(FP), CX - SHRQ $0x06, AX - TESTQ AX, AX - JZ mulAvxTwo_2x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + // Loading all tables to registers + // 
Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX // Add start offset to input ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X2 - VPBROADCASTB X2, Y2 - MOVQ n+80(FP), DI - SHRQ $0x06, DI + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 mulAvxTwo_2x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 ADDQ $0x40, DX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU (CX), Y3 - VMOVDQU 32(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX - VPSRLQ $0x04, Y6, Y7 - VPSRLQ $0x04, Y5, Y8 - VPAND Y2, Y6, Y6 - VPAND Y2, Y5, Y5 - VPAND Y2, Y7, Y7 - VPAND Y2, Y8, Y8 - VMOVDQU 64(CX), Y3 - VMOVDQU 96(CX), Y4 - VPSHUFB Y5, Y3, Y5 - VPSHUFB Y6, Y3, Y3 - VPSHUFB Y8, Y4, Y6 - VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 // Store 1 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX // Prepare for next loop - ADDQ $0x40, SI - DECQ DI + DECQ AX JNZ mulAvxTwo_2x1_64_loop VZEROUPPER mulAvxTwo_2x1_64_end: RET +// func mulAvxTwo_2x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 8 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VMOVDQU (BX), Y4 + VPSHUFB Y6, Y0, Y6 + VPSHUFB Y7, Y1, Y7 + VPXOR 
Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y6 + ADDQ $0x20, CX + VPSRLQ $0x04, Y6, Y7 + VPAND Y5, Y6, Y6 + VPAND Y5, Y7, Y7 + VPSHUFB Y6, Y2, Y6 + VPSHUFB Y7, Y3, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 1 outputs + VMOVDQU Y4, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1Xor_loop + VZEROUPPER + +mulAvxTwo_2x1Xor_end: + RET + +// func mulAvxTwo_2x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x1_64Xor(SB), $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x1_64Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ out_base+48(FP), BX + MOVQ (BX), BX + MOVQ start+72(FP), SI + + // Add start offset to output + ADDQ SI, BX + + // Add start offset to input + ADDQ SI, DX + ADDQ SI, CX + MOVQ $0x0000000f, SI + MOVQ SI, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (BX), Y4 + VMOVDQU 32(BX), Y5 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y7 + VMOVDQU 32(DX), Y9 + ADDQ $0x40, DX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y0, Y7 + VPSHUFB Y9, Y0, Y9 + VPSHUFB Y8, Y1, Y8 + VPSHUFB Y10, Y1, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y9 + ADDQ $0x40, CX + VPSRLQ $0x04, Y7, Y8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y7, Y7 + VPAND Y6, Y9, Y9 + VPAND Y6, Y8, Y8 + VPAND Y6, Y10, Y10 + VPSHUFB Y7, Y2, Y7 + VPSHUFB Y9, Y2, Y9 + VPSHUFB Y8, Y3, Y8 + VPSHUFB Y10, Y3, Y10 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 1 outputs + VMOVDQU Y4, (BX) + VMOVDQU Y5, 32(BX) + ADDQ $0x40, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x1_64Xor_end: + RET + // func mulAvxTwo_2x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 @@ -1489,10 +2760,6 @@ TEXT ·mulAvxTwo_2x2(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_2x2_loop: - // Clear 2 outputs - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -1501,12 +2768,10 @@ mulAvxTwo_2x2_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 @@ -1541,40 +2806,38 @@ mulAvxTwo_2x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 15 YMM used + // Destination kept in GP registers + // Full registers estimated 25 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - 
MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI // Add start offset to input - ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X4 + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), DI - SHRQ $0x06, DI mulAvxTwo_2x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -1587,25 +2850,21 @@ mulAvxTwo_2x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -1634,22 +2893,222 @@ mulAvxTwo_2x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) - MOVQ 24(BX), R8 - VMOVDQU Y2, (R8)(SI*1) - VMOVDQU Y3, 32(R8)(SI*1) + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI // Prepare for next loop - ADDQ $0x40, SI - DECQ DI + DECQ AX JNZ mulAvxTwo_2x2_64_loop VZEROUPPER mulAvxTwo_2x2_64_end: RET +// func mulAvxTwo_2x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x2Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 15 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), CX + MOVQ out_base+48(FP), BX + MOVQ (BX), SI + MOVQ 24(BX), BX + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + ADDQ DI, BX + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (SI), Y8 + VPSHUFB Y13, Y0, Y11 + VPSHUFB Y14, Y1, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU (BX), Y9 + VPSHUFB Y13, Y2, Y11 + VPSHUFB Y14, Y3, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (CX), Y13 + ADDQ $0x20, CX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VPSHUFB Y13, Y4, Y11 + VPSHUFB Y14, Y5, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VPSHUFB Y13, 
Y6, Y11 + VPSHUFB Y14, Y7, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 2 outputs + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + VMOVDQU Y9, (BX) + ADDQ $0x20, BX + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2Xor_loop + VZEROUPPER + +mulAvxTwo_2x2Xor_end: + RET + +// func mulAvxTwo_2x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), SI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + ADDQ R8, SI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (SI), Y2 + VMOVDQU 32(SI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (SI) + VMOVDQU Y3, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x2_64Xor_end: + RET + // func mulAvxTwo_2x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 @@ -1683,11 +3142,6 @@ TEXT ·mulAvxTwo_2x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_2x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -1698,20 +3152,17 @@ mulAvxTwo_2x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + 
VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 @@ -1758,42 +3209,40 @@ mulAvxTwo_2x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 20 YMM used + // Destination kept in GP registers + // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_2x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), AX - MOVQ out_base+48(FP), BX - MOVQ out_base+48(FP), BX - MOVQ start+72(FP), SI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI // Add start offset to input - ADDQ SI, DX - ADDQ SI, AX - MOVQ $0x0000000f, DI - MOVQ DI, X6 + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), DI - SHRQ $0x06, DI mulAvxTwo_2x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -1806,35 +3255,29 @@ mulAvxTwo_2x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -1873,25 +3316,269 @@ mulAvxTwo_2x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (BX), R8 - VMOVDQU Y0, (R8)(SI*1) - VMOVDQU Y1, 32(R8)(SI*1) - MOVQ 24(BX), R8 - VMOVDQU Y2, (R8)(SI*1) - VMOVDQU Y3, 32(R8)(SI*1) - MOVQ 48(BX), R8 - VMOVDQU Y4, (R8)(SI*1) - VMOVDQU Y5, 32(R8)(SI*1) + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI // Prepare for next loop - ADDQ $0x40, SI - DECQ DI + DECQ AX JNZ mulAvxTwo_2x3_64_loop VZEROUPPER mulAvxTwo_2x3_64_end: RET +// func mulAvxTwo_2x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ 
start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_2x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (SI), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3Xor_loop + VZEROUPPER + +mulAvxTwo_2x3Xor_end: + RET + +// func mulAvxTwo_2x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_2x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), SI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, DI + ADDQ R9, R8 + ADDQ R9, SI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + VMOVDQU (SI), Y4 + VMOVDQU 32(SI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ 
$0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y4, (SI) + VMOVDQU Y5, 32(SI) + ADDQ $0x40, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_2x3_64Xor_end: + RET + // func mulAvxTwo_2x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 @@ -1927,12 +3614,6 @@ TEXT ·mulAvxTwo_2x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_2x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -1943,26 +3624,22 @@ mulAvxTwo_2x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 @@ -2013,6 +3690,125 @@ mulAvxTwo_2x4_loop: mulAvxTwo_2x4_end: RET +// func mulAvxTwo_2x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 25 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), SI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, SI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_2x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + 
VPXOR Y5, Y2, Y2 + VMOVDQU (SI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x4Xor_loop + VZEROUPPER + +mulAvxTwo_2x4Xor_end: + RET + // func mulAvxTwo_2x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 @@ -2050,13 +3846,6 @@ TEXT ·mulAvxTwo_2x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_2x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -2067,32 +3856,27 @@ mulAvxTwo_2x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 @@ -2151,6 +3935,142 @@ mulAvxTwo_2x5_loop: mulAvxTwo_2x5_end: RET +// func mulAvxTwo_2x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), SI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, SI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_2x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 
64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (SI), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x5Xor_loop + VZEROUPPER + +mulAvxTwo_2x5Xor_end: + RET + // func mulAvxTwo_2x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 @@ -2190,14 +4110,6 @@ TEXT ·mulAvxTwo_2x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_2x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -2208,38 +4120,32 @@ mulAvxTwo_2x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 @@ -2306,6 +4212,159 @@ mulAvxTwo_2x6_loop: mulAvxTwo_2x6_end: RET +// func mulAvxTwo_2x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ 
out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), SI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, SI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_2x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (SI), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x6Xor_loop + VZEROUPPER + +mulAvxTwo_2x6Xor_end: + RET + // func mulAvxTwo_2x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 @@ -2347,15 +4406,6 @@ TEXT ·mulAvxTwo_2x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_2x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -2366,44 +4416,37 @@ mulAvxTwo_2x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB 
Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 @@ -2478,6 +4521,176 @@ mulAvxTwo_2x7_loop: mulAvxTwo_2x7_end: RET +// func mulAvxTwo_2x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), SI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, SI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_2x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (SI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 
+ VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x7Xor_loop + VZEROUPPER + +mulAvxTwo_2x7Xor_end: + RET + // func mulAvxTwo_2x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 @@ -2521,16 +4734,6 @@ TEXT ·mulAvxTwo_2x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_2x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -2541,50 +4744,42 @@ mulAvxTwo_2x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 @@ -2667,6 +4862,193 @@ mulAvxTwo_2x8_loop: mulAvxTwo_2x8_end: RET +// func mulAvxTwo_2x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 45 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), SI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, SI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_2x8Xor_loop: + // Load and process 32 bytes from 
input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (SI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x8Xor_loop + VZEROUPPER + +mulAvxTwo_2x8Xor_end: + RET + // func mulAvxTwo_2x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 @@ -2712,17 +5094,6 @@ TEXT ·mulAvxTwo_2x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_2x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -2733,56 +5104,47 @@ mulAvxTwo_2x9_loop: VMOVDQU 32(CX), Y11 
VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 @@ -2873,6 +5235,210 @@ mulAvxTwo_2x9_loop: mulAvxTwo_2x9_end: RET +// func mulAvxTwo_2x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), SI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, SI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_2x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), 
Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (SI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x9Xor_loop + VZEROUPPER + +mulAvxTwo_2x9Xor_end: + RET + // func mulAvxTwo_2x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 @@ -2920,18 +5486,6 @@ TEXT ·mulAvxTwo_2x10(SB), NOSPLIT, $8-88 VPBROADCASTB X10, Y10 mulAvxTwo_2x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -2942,62 +5496,52 @@ mulAvxTwo_2x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 
- VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 @@ -3096,6 +5640,227 @@ mulAvxTwo_2x10_loop: mulAvxTwo_2x10_end: RET +// func mulAvxTwo_2x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_2x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 55 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_2x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), DX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_2x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU (R15), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU (SI), Y9 + VMOVDQU 
576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_2x10Xor_loop + VZEROUPPER + +mulAvxTwo_2x10Xor_end: + RET + // func mulAvxTwo_3x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 @@ -3133,9 +5898,6 @@ TEXT ·mulAvxTwo_3x1(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_3x1_loop: - // Clear 1 outputs - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -3144,8 +5906,7 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 @@ -3185,40 +5946,38 @@ mulAvxTwo_3x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 10 YMM used + // Destination kept in GP registers + // Full registers estimated 18 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI // Add 
start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX MOVQ $0x0000000f, R8 MOVQ R8, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -3231,15 +5990,13 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -3258,9 +6015,9 @@ mulAvxTwo_3x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -3279,19 +6036,213 @@ mulAvxTwo_3x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 + DECQ AX JNZ mulAvxTwo_3x1_64_loop VZEROUPPER mulAvxTwo_3x1_64_end: RET +// func mulAvxTwo_3x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 10 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), CX + MOVQ out_base+48(FP), SI + MOVQ (SI), SI + MOVQ start+72(FP), DI + + // Add start offset to output + ADDQ DI, SI + + // Add start offset to input + ADDQ DI, DX + ADDQ DI, BX + ADDQ DI, CX + MOVQ $0x0000000f, DI + MOVQ DI, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_3x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VMOVDQU (SI), Y6 + VPSHUFB Y8, Y0, Y8 + VPSHUFB Y9, Y1, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y2, Y8 + VPSHUFB Y9, Y3, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (CX), Y8 + ADDQ $0x20, CX + VPSRLQ $0x04, Y8, Y9 + VPAND Y7, Y8, Y8 + VPAND Y7, Y9, Y9 + VPSHUFB Y8, Y4, Y8 + VPSHUFB Y9, Y5, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 1 outputs + VMOVDQU Y6, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1Xor_loop + VZEROUPPER + +mulAvxTwo_3x1Xor_end: + RET + +// func mulAvxTwo_3x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept 
in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, DX + MOVQ $0x0000000f, R8 + MOVQ R8, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_3x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (DI), Y0 + VMOVDQU 32(DI), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (DI) + VMOVDQU Y1, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x1_64Xor_end: + RET + // func mulAvxTwo_3x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 @@ -3325,10 +6276,6 @@ TEXT ·mulAvxTwo_3x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_3x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -3339,14 +6286,12 @@ mulAvxTwo_3x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -3404,42 +6349,40 @@ mulAvxTwo_3x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 19 YMM used + // Destination kept in GP registers + // Full registers estimated 33 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ 
out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX - MOVQ $0x0000000f, R8 - MOVQ R8, X4 + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3452,25 +6395,21 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3499,9 +6438,9 @@ mulAvxTwo_3x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -3530,22 +6469,276 @@ mulAvxTwo_3x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) - MOVQ 24(SI), R9 - VMOVDQU Y2, (R9)(DI*1) - VMOVDQU Y3, 32(R9)(DI*1) + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 + DECQ AX JNZ mulAvxTwo_3x2_64_loop VZEROUPPER mulAvxTwo_3x2_64_end: RET +// func mulAvxTwo_3x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 19 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_3x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (DI), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 
128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2Xor_loop + VZEROUPPER + +mulAvxTwo_3x2Xor_end: + RET + +// func mulAvxTwo_3x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), DI + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + ADDQ R9, DI + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (DI), Y2 + VMOVDQU 32(DI), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 
352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (DI) + VMOVDQU Y3, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x2_64Xor_end: + RET + // func mulAvxTwo_3x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 @@ -3581,11 +6774,6 @@ TEXT ·mulAvxTwo_3x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_3x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -3596,20 +6784,17 @@ mulAvxTwo_3x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -3681,44 +6866,42 @@ mulAvxTwo_3x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 26 YMM used + // Destination kept in GP registers + // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_3x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), AX - MOVQ out_base+48(FP), SI - MOVQ out_base+48(FP), SI - MOVQ start+72(FP), DI + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI // Add start offset to input - ADDQ DI, DX - ADDQ DI, BX - ADDQ DI, AX - MOVQ $0x0000000f, R8 - MOVQ R8, X6 + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R8 - SHRQ $0x06, R8 mulAvxTwo_3x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3731,35 +6914,29 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 
bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3798,9 +6975,9 @@ mulAvxTwo_3x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -3839,25 +7016,339 @@ mulAvxTwo_3x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (SI), R9 - VMOVDQU Y0, (R9)(DI*1) - VMOVDQU Y1, 32(R9)(DI*1) - MOVQ 24(SI), R9 - VMOVDQU Y2, (R9)(DI*1) - VMOVDQU Y3, 32(R9)(DI*1) - MOVQ 48(SI), R9 - VMOVDQU Y4, (R9)(DI*1) - VMOVDQU Y5, 32(R9)(DI*1) + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI // Prepare for next loop - ADDQ $0x40, DI - DECQ R8 + DECQ AX JNZ mulAvxTwo_3x3_64_loop VZEROUPPER mulAvxTwo_3x3_64_end: RET +// func mulAvxTwo_3x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_3x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (DI), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // 
Store 3 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x3Xor_loop + VZEROUPPER + +mulAvxTwo_3x3Xor_end: + RET + +// func mulAvxTwo_3x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_3x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), DI + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R8 + ADDQ R10, R9 + ADDQ R10, DI + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + VMOVDQU (DI), Y4 + VMOVDQU 32(DI), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + 
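Note: the arithmetic inside every one of these loops is the same nibble-table GF(2^8) multiply. The input bytes are split into low and high nibbles (VPAND with the broadcast 0x0f mask, VPSRLQ $0x04 followed by another VPAND), each nibble indexes a 16-entry lookup table via VPSHUFB, and the two lookups are combined with VPXOR. A rough per-byte sketch in Go, with illustrative names (lowTbl/highTbl stand for the two 32-byte table halves the kernels load at consecutive offsets of the matrix, presumably duplicated across both 16-byte lanes since VPSHUFB looks up per lane):

	// Per-byte equivalent of one VPAND/VPSRLQ/VPSHUFB/VPXOR group (sketch only).
	func mulByte(lowTbl, highTbl *[16]byte, in byte) byte {
		lo := in & 0x0f                 // VPAND with the 0x0f broadcast
		hi := in >> 4                   // VPSRLQ $0x04 followed by VPAND
		return lowTbl[lo] ^ highTbl[hi] // two VPSHUFB lookups, combined with VPXOR
	}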
VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y4, (DI) + VMOVDQU Y5, 32(DI) + ADDQ $0x40, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_3x3_64Xor_end: + RET + // func mulAvxTwo_3x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 @@ -3895,12 +7386,6 @@ TEXT ·mulAvxTwo_3x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_3x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -3911,26 +7396,22 @@ mulAvxTwo_3x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -4012,6 +7493,158 @@ mulAvxTwo_3x4_loop: mulAvxTwo_3x4_end: RET +// func mulAvxTwo_3x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 33 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), DI + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, DI + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_3x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (DI), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + 
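Note: the output pointers are fetched with a fixed 24-byte stride (the MOVQ (DI), R8 / 24(DI), R9 / 48(DI), R10 sequences above), which is simply the size of a Go slice header on amd64, so out[j]'s data pointer sits 24*j bytes into the out header array. The older kernels reloaded each destination pointer from those headers inside the loop (MOVQ (SI), R9 followed by an indexed store), while the reworked ones with "Destination kept in GP registers" hold one pointer per output in a register and bump it after each store. An illustrative helper, not part of the generated file:

	// Displacement of out[j]'s data pointer within the [][]byte header array,
	// matching the 24-byte steps used by the MOVQ sequences above.
	func outPtrDisp(j int) int {
		return 24 * j // a Go slice header (ptr, len, cap) is 24 bytes on amd64
	}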
VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x4Xor_loop + VZEROUPPER + +mulAvxTwo_3x4Xor_end: + RET + // func mulAvxTwo_3x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 @@ -4051,13 +7684,6 @@ TEXT ·mulAvxTwo_3x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_3x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -4068,32 +7694,27 @@ mulAvxTwo_3x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -4189,6 +7810,181 @@ mulAvxTwo_3x5_loop: mulAvxTwo_3x5_end: RET +// func mulAvxTwo_3x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 40 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), DI + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, DI + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_3x5Xor_loop: + // Load and 
process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (DI), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x5Xor_loop + VZEROUPPER + +mulAvxTwo_3x5Xor_end: + RET + // func mulAvxTwo_3x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 @@ -4230,14 +8026,6 @@ TEXT ·mulAvxTwo_3x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_3x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -4248,38 +8036,32 @@ mulAvxTwo_3x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 
VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -4389,6 +8171,204 @@ mulAvxTwo_3x6_loop: mulAvxTwo_3x6_end: RET +// func mulAvxTwo_3x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), DI + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, DI + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_3x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (DI), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ 
$0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x6Xor_loop + VZEROUPPER + +mulAvxTwo_3x6Xor_end: + RET + // func mulAvxTwo_3x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 @@ -4432,15 +8412,6 @@ TEXT ·mulAvxTwo_3x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_3x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -4451,44 +8422,37 @@ mulAvxTwo_3x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -4612,6 +8576,227 @@ mulAvxTwo_3x7_loop: mulAvxTwo_3x7_end: RET +// func mulAvxTwo_3x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 54 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), DI + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, 
R12 + ADDQ R14, R13 + ADDQ R14, DI + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_3x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (DI), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + 
ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x7Xor_loop + VZEROUPPER + +mulAvxTwo_3x7Xor_end: + RET + // func mulAvxTwo_3x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 @@ -4657,16 +8842,6 @@ TEXT ·mulAvxTwo_3x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_3x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -4677,50 +8852,42 @@ mulAvxTwo_3x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -4858,6 +9025,250 @@ mulAvxTwo_3x8_loop: mulAvxTwo_3x8_end: RET +// func mulAvxTwo_3x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 61 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), DI + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, DI + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_3x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + 
VPXOR Y9, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (DI), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + 
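Note: the constant displacements on CX follow a simple layout. The 64-byte table pair for input i and output j starts at 64*(i*outputs+j), with the low-nibble half first and the high-nibble half 32 bytes later; for example, input 2 of the 3x8 kernels starts at 1024(CX) because 64*(2*8+0) = 1024. A small sketch of that indexing (illustrative helper, not part of the generated file):

	// Offsets of the low- and high-nibble tables for (input i, output j) in the
	// flattened matrix, matching the displacements on CX in these kernels.
	func tableOffsets(i, j, outputs int) (low, high int) {
		low = 64 * (i*outputs + j)
		return low, low + 32
	}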
VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x8Xor_loop + VZEROUPPER + +mulAvxTwo_3x8Xor_end: + RET + // func mulAvxTwo_3x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 @@ -4905,17 +9316,6 @@ TEXT ·mulAvxTwo_3x9(SB), NOSPLIT, $8-88 VPBROADCASTB X9, Y9 mulAvxTwo_3x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -4926,56 +9326,47 @@ mulAvxTwo_3x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -5127,6 +9518,273 @@ mulAvxTwo_3x9_loop: mulAvxTwo_3x9_end: RET +// func mulAvxTwo_3x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_3x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_3x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, 
Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 
1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_3x9Xor_loop + VZEROUPPER + +mulAvxTwo_3x9Xor_end: + RET + // func mulAvxTwo_3x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 @@ -5178,18 +9836,6 @@ TEXT ·mulAvxTwo_3x10(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_3x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -5200,62 +9846,52 @@ mulAvxTwo_3x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 @@ -5421,6 +10057,298 @@ mulAvxTwo_3x10_loop: mulAvxTwo_3x10_end: RET +// func mulAvxTwo_3x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_3x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 75 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ 
AX, AX + JZ mulAvxTwo_3x10Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), AX + MOVQ out_base+48(FP), SI + MOVQ (SI), DI + MOVQ 24(SI), R8 + MOVQ 48(SI), R9 + MOVQ 72(SI), R10 + MOVQ 96(SI), R11 + MOVQ 120(SI), R12 + MOVQ 144(SI), R13 + MOVQ 168(SI), R14 + MOVQ 192(SI), R15 + MOVQ 216(SI), SI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, SI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_3x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU (DI), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU (R12), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU (R13), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU (R14), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU (R15), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU (SI), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR 
Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (AX), Y13 + ADDQ $0x20, AX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + VMOVDQU Y0, (DI) + ADDQ $0x20, DI + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + VMOVDQU Y5, (R12) + ADDQ $0x20, R12 + VMOVDQU Y6, (R13) + ADDQ $0x20, R13 + VMOVDQU Y7, (R14) + ADDQ $0x20, R14 + VMOVDQU Y8, (R15) + ADDQ $0x20, R15 + VMOVDQU Y9, (SI) + ADDQ $0x20, SI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_3x10Xor_loop + VZEROUPPER + +mulAvxTwo_3x10Xor_end: + RET + // func mulAvxTwo_4x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 @@ -5462,9 +10390,6 @@ TEXT ·mulAvxTwo_4x1(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_4x1_loop: - // Clear 1 outputs - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -5473,8 +10398,7 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y0, Y10 VPSHUFB Y11, Y1, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 @@ -5525,42 +10449,40 @@ mulAvxTwo_4x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 12 YMM used + // Destination kept in GP registers + // Full registers estimated 22 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), 
BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX MOVQ $0x0000000f, R9 MOVQ R9, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 mulAvxTwo_4x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5573,15 +10495,13 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5600,9 +10520,9 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5621,9 +10541,9 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -5642,19 +10562,251 @@ mulAvxTwo_4x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 + DECQ AX JNZ mulAvxTwo_4x1_64_loop VZEROUPPER mulAvxTwo_4x1_64_end: RET +// func mulAvxTwo_4x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 12 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), CX + MOVQ out_base+48(FP), DI + MOVQ (DI), DI + MOVQ start+72(FP), R8 + + // Add start offset to output + ADDQ R8, DI + + // Add start offset to input + ADDQ R8, DX + ADDQ R8, BX + ADDQ R8, SI + ADDQ R8, CX + MOVQ $0x0000000f, R8 + MOVQ R8, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_4x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VMOVDQU (DI), Y8 + VPSHUFB Y10, Y0, Y10 + VPSHUFB Y11, Y1, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y10 + 
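Note: the only difference between mulAvxTwo_4x1 and mulAvxTwo_4x1Xor here is how the first product reaches the destination register. The non-Xor kernel writes the first combined lookup straight into Y8 (VPXOR Y10, Y11, Y8), while the Xor kernel first loads the current output bytes (VMOVDQU (DI), Y8) and XORs every product into them, so it accumulates into out instead of overwriting it. A scalar sketch of the two behaviours, with illustrative names:

	// mulSlice overwrites out with the products; mulSliceXor accumulates into
	// whatever out already holds, mirroring the new *Xor kernels (sketch only).
	func mulSlice(lowTbl, highTbl *[16]byte, in, out []byte) {
		for i, c := range in {
			out[i] = lowTbl[c&0x0f] ^ highTbl[c>>4]
		}
	}

	func mulSliceXor(lowTbl, highTbl *[16]byte, in, out []byte) {
		for i, c := range in {
			out[i] ^= lowTbl[c&0x0f] ^ highTbl[c>>4]
		}
	}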
ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y2, Y10 + VPSHUFB Y11, Y3, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y4, Y10 + VPSHUFB Y11, Y5, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (CX), Y10 + ADDQ $0x20, CX + VPSRLQ $0x04, Y10, Y11 + VPAND Y9, Y10, Y10 + VPAND Y9, Y11, Y11 + VPSHUFB Y10, Y6, Y10 + VPSHUFB Y11, Y7, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 1 outputs + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1Xor_loop + VZEROUPPER + +mulAvxTwo_4x1Xor_end: + RET + +// func mulAvxTwo_4x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R9 + MOVQ R9, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R8), Y0 + VMOVDQU 32(R8), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R8) + VMOVDQU Y1, 32(R8) + ADDQ $0x40, R8 + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x1_64Xor_end: + RET + // func mulAvxTwo_4x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 @@ -5690,10 +10842,6 @@ TEXT ·mulAvxTwo_4x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_4x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -5704,14 +10852,12 @@ mulAvxTwo_4x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -5788,44 +10934,42 @@ mulAvxTwo_4x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 23 YMM used + // Destination kept in GP registers + // Full registers estimated 41 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX - MOVQ $0x0000000f, R9 - MOVQ R9, X4 + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 mulAvxTwo_4x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5838,25 +10982,21 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5885,9 +11025,9 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5916,9 +11056,9 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ 
$0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -5947,22 +11087,330 @@ mulAvxTwo_4x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) - MOVQ 24(DI), R10 - VMOVDQU Y2, (R10)(R8*1) - VMOVDQU Y3, 32(R10)(R8*1) + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 + DECQ AX JNZ mulAvxTwo_4x2_64_loop VZEROUPPER mulAvxTwo_4x2_64_end: RET +// func mulAvxTwo_4x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 23 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_4x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R8), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2Xor_loop + VZEROUPPER + +mulAvxTwo_4x2Xor_end: + RET + +// func mulAvxTwo_4x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, 
AX + JZ mulAvxTwo_4x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R8 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + ADDQ R10, R8 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R8), Y2 + VMOVDQU 32(R8), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R8) + VMOVDQU Y3, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x2_64Xor_end: + RET + // func mulAvxTwo_4x3(matrix []byte, in [][]byte, out [][]byte, 
start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 @@ -6000,11 +11448,6 @@ TEXT ·mulAvxTwo_4x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_4x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -6015,20 +11458,17 @@ mulAvxTwo_4x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -6125,46 +11565,44 @@ mulAvxTwo_4x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 32 YMM used + // Destination kept in GP registers + // Full registers estimated 58 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_4x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), AX - MOVQ out_base+48(FP), DI - MOVQ out_base+48(FP), DI - MOVQ start+72(FP), R8 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 // Add start offset to input - ADDQ R8, DX - ADDQ R8, BX - ADDQ R8, SI - ADDQ R8, AX - MOVQ $0x0000000f, R9 - MOVQ R9, X6 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R9 - SHRQ $0x06, R9 mulAvxTwo_4x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6177,35 +11615,29 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6244,9 +11676,9 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 
32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6285,9 +11717,9 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -6326,25 +11758,409 @@ mulAvxTwo_4x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (DI), R10 - VMOVDQU Y0, (R10)(R8*1) - VMOVDQU Y1, 32(R10)(R8*1) - MOVQ 24(DI), R10 - VMOVDQU Y2, (R10)(R8*1) - VMOVDQU Y3, 32(R10)(R8*1) - MOVQ 48(DI), R10 - VMOVDQU Y4, (R10)(R8*1) - VMOVDQU Y5, 32(R10)(R8*1) + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 // Prepare for next loop - ADDQ $0x40, R8 - DECQ R9 + DECQ AX JNZ mulAvxTwo_4x3_64_loop VZEROUPPER mulAvxTwo_4x3_64_end: RET +// func mulAvxTwo_4x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 32 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_4x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R8), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y6 + 
ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3Xor_loop + VZEROUPPER + +mulAvxTwo_4x3Xor_end: + RET + +// func mulAvxTwo_4x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 58 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_4x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R8 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R9 + ADDQ R11, R10 + ADDQ R11, R8 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + VMOVDQU (R8), Y4 + VMOVDQU 32(R8), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + 
VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y4, (R8) + VMOVDQU Y5, 32(R8) + ADDQ $0x40, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_4x3_64Xor_end: + RET + // func mulAvxTwo_4x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 @@ -6384,12 +12200,6 @@ TEXT ·mulAvxTwo_4x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_4x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -6400,26 +12210,22 @@ mulAvxTwo_4x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -6532,6 +12338,191 @@ mulAvxTwo_4x4_loop: mulAvxTwo_4x4_end: RET +// func mulAvxTwo_4x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 41 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x4Xor_end + MOVQ in_base+24(FP), DX + 
MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R8 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R8 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_4x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R8), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x4Xor_loop + VZEROUPPER + +mulAvxTwo_4x4Xor_end: + RET + // func mulAvxTwo_4x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT 
·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 @@ -6573,13 +12564,6 @@ TEXT ·mulAvxTwo_4x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_4x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -6590,32 +12574,27 @@ mulAvxTwo_4x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -6748,6 +12727,220 @@ mulAvxTwo_4x5_loop: mulAvxTwo_4x5_end: RET +// func mulAvxTwo_4x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R8 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R8 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_4x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R8), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 
+ VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x5Xor_loop + VZEROUPPER + +mulAvxTwo_4x5Xor_end: + RET + // func mulAvxTwo_4x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 @@ -6791,14 +12984,6 @@ TEXT ·mulAvxTwo_4x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_4x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -6809,38 +12994,32 @@ mulAvxTwo_4x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -6993,6 +13172,249 @@ mulAvxTwo_4x6_loop: mulAvxTwo_4x6_end: RET +// func mulAvxTwo_4x6Xor(matrix 
[]byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 59 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R8 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R8 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_4x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R8), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + 
VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x6Xor_loop + VZEROUPPER + +mulAvxTwo_4x6Xor_end: + RET + // func mulAvxTwo_4x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 @@ -7038,15 +13460,6 @@ TEXT ·mulAvxTwo_4x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_4x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -7057,44 +13470,37 @@ mulAvxTwo_4x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -7267,6 +13673,278 @@ mulAvxTwo_4x7_loop: mulAvxTwo_4x7_end: RET +// func mulAvxTwo_4x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + 
MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R8 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R8 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_4x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R8), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, 
Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x7Xor_loop + VZEROUPPER + +mulAvxTwo_4x7Xor_end: + RET + // func mulAvxTwo_4x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 @@ -7314,16 +13992,6 @@ TEXT ·mulAvxTwo_4x8(SB), NOSPLIT, $8-88 VPBROADCASTB X8, Y8 mulAvxTwo_4x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -7334,50 +14002,42 @@ mulAvxTwo_4x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -7570,6 +14230,307 @@ mulAvxTwo_4x8_loop: mulAvxTwo_4x8_end: RET +// func 
mulAvxTwo_4x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 77 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_4x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 
+ + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, (R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_4x8Xor_loop + VZEROUPPER + +mulAvxTwo_4x8Xor_end: + RET + // func mulAvxTwo_4x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 @@ -7621,17 +14582,6 @@ TEXT ·mulAvxTwo_4x9(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_4x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -7642,56 +14592,47 @@ mulAvxTwo_4x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, 
Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 @@ -7904,6 +14845,338 @@ mulAvxTwo_4x9_loop: mulAvxTwo_4x9_end: RET +// func mulAvxTwo_4x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 86 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x9Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), AX + MOVQ out_base+48(FP), DI + MOVQ (DI), R8 + MOVQ 24(DI), R9 + MOVQ 48(DI), R10 + MOVQ 72(DI), R11 + MOVQ 96(DI), R12 + MOVQ 120(DI), R13 + MOVQ 144(DI), R14 + MOVQ 168(DI), R15 + MOVQ 192(DI), DI + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, DI + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_4x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU (R8), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU (R12), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU (R13), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU (R14), Y6 + VMOVDQU 
384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU (R15), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU (DI), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (AX), Y12 + ADDQ $0x20, AX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), 
Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + VMOVDQU Y0, (R8) + ADDQ $0x20, R8 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + VMOVDQU Y4, (R12) + ADDQ $0x20, R12 + VMOVDQU Y5, (R13) + ADDQ $0x20, R13 + VMOVDQU Y6, (R14) + ADDQ $0x20, R14 + VMOVDQU Y7, (R15) + ADDQ $0x20, R15 + VMOVDQU Y8, (DI) + ADDQ $0x20, DI + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_4x9Xor_loop + VZEROUPPER + +mulAvxTwo_4x9Xor_end: + RET + // func mulAvxTwo_4x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 @@ -7933,18 +15206,6 @@ TEXT ·mulAvxTwo_4x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_4x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -7955,62 +15216,52 @@ mulAvxTwo_4x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB 
Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -8244,6 +15495,354 @@ mulAvxTwo_4x10_loop: mulAvxTwo_4x10_end: RET +// func mulAvxTwo_4x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_4x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_4x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), DX + MOVQ out_base+48(FP), R8 + MOVQ start+72(FP), R9 + + // Add start offset to input + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_4x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R8), R10 + VMOVDQU (R10)(R9*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R8), R10 + VMOVDQU (R10)(R9*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R8), R10 + VMOVDQU (R10)(R9*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R8), R10 + VMOVDQU (R10)(R9*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R8), R10 + VMOVDQU (R10)(R9*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R8), R10 + VMOVDQU (R10)(R9*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R8), R10 + VMOVDQU (R10)(R9*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R8), R10 + VMOVDQU (R10)(R9*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R8), R10 + VMOVDQU (R10)(R9*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R8), R10 + VMOVDQU (R10)(R9*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB 
Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, 
Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R8), R10 + VMOVDQU Y0, (R10)(R9*1) + MOVQ 24(R8), R10 + VMOVDQU Y1, (R10)(R9*1) + MOVQ 48(R8), R10 + VMOVDQU Y2, (R10)(R9*1) + MOVQ 72(R8), R10 + VMOVDQU Y3, (R10)(R9*1) + MOVQ 96(R8), R10 + VMOVDQU Y4, (R10)(R9*1) + MOVQ 120(R8), R10 + VMOVDQU Y5, (R10)(R9*1) + MOVQ 144(R8), R10 + VMOVDQU Y6, (R10)(R9*1) + MOVQ 168(R8), R10 + VMOVDQU Y7, (R10)(R9*1) + MOVQ 192(R8), R10 + VMOVDQU Y8, (R10)(R9*1) + MOVQ 216(R8), R10 + VMOVDQU Y9, (R10)(R9*1) + + // Prepare for next loop + ADDQ $0x20, R9 + DECQ AX + JNZ mulAvxTwo_4x10Xor_loop + VZEROUPPER + +mulAvxTwo_4x10Xor_end: + RET + // func mulAvxTwo_5x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 @@ -8289,9 +15888,6 @@ TEXT ·mulAvxTwo_5x1(SB), NOSPLIT, $0-88 VPBROADCASTB X11, Y11 mulAvxTwo_5x1_loop: - // Clear 1 outputs - VPXOR Y10, Y10, Y10 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -8300,8 +15896,7 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y0, Y12 VPSHUFB Y13, Y1, Y13 - VPXOR Y12, Y13, Y12 - VPXOR Y12, Y10, Y10 + VPXOR Y12, Y13, Y10 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y12 @@ -8363,44 +15958,42 @@ mulAvxTwo_5x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 14 YMM used + // Destination kept in GP registers + // Full registers estimated 26 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, AX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX MOVQ $0x0000000f, R10 MOVQ R10, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 mulAvxTwo_5x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8413,15 +16006,13 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8440,9 +16031,9 @@ 
mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8461,9 +16052,9 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8482,9 +16073,9 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -8503,19 +16094,289 @@ mulAvxTwo_5x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + DECQ AX JNZ mulAvxTwo_5x1_64_loop VZEROUPPER mulAvxTwo_5x1_64_end: RET +// func mulAvxTwo_5x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 14 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), CX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R8 + MOVQ start+72(FP), R9 + + // Add start offset to output + ADDQ R9, R8 + + // Add start offset to input + ADDQ R9, DX + ADDQ R9, BX + ADDQ R9, SI + ADDQ R9, DI + ADDQ R9, CX + MOVQ $0x0000000f, R9 + MOVQ R9, X11 + VPBROADCASTB X11, Y11 + +mulAvxTwo_5x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VMOVDQU (R8), Y10 + VPSHUFB Y12, Y0, Y12 + VPSHUFB Y13, Y1, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y2, Y12 + VPSHUFB Y13, Y3, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y4, Y12 + VPSHUFB Y13, Y5, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y6, Y12 + VPSHUFB Y13, Y7, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (CX), Y12 + ADDQ $0x20, CX + VPSRLQ $0x04, Y12, Y13 + VPAND Y11, Y12, Y12 + VPAND Y11, Y13, Y13 + VPSHUFB Y12, Y8, Y12 + VPSHUFB Y13, Y9, Y13 + VPXOR Y12, Y13, Y12 + VPXOR Y12, Y10, Y10 + + // Store 1 outputs + VMOVDQU Y10, (R8) + ADDQ 
$0x20, R8 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1Xor_loop + VZEROUPPER + +mulAvxTwo_5x1Xor_end: + RET + +// func mulAvxTwo_5x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 26 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R10 + MOVQ R10, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R9), Y0 + VMOVDQU 32(R9), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R9) + VMOVDQU Y1, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x1_64Xor_end: + RET + // func mulAvxTwo_5x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT 
·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 @@ -8553,10 +16414,6 @@ TEXT ·mulAvxTwo_5x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_5x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -8567,14 +16424,12 @@ mulAvxTwo_5x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -8670,46 +16525,44 @@ mulAvxTwo_5x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 27 YMM used + // Destination kept in GP registers + // Full registers estimated 49 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, AX - MOVQ $0x0000000f, R10 - MOVQ R10, X4 + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 mulAvxTwo_5x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8722,25 +16575,21 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8769,9 +16618,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8800,9 +16649,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8831,9 +16680,9 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 
64 bytes from input 4 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -8862,22 +16711,384 @@ mulAvxTwo_5x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) - MOVQ 24(R8), R11 - VMOVDQU Y2, (R11)(R9*1) - VMOVDQU Y3, 32(R11)(R9*1) + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + DECQ AX JNZ mulAvxTwo_5x2_64_loop VZEROUPPER mulAvxTwo_5x2_64_end: RET +// func mulAvxTwo_5x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 27 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_5x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R9), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, 
Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2Xor_loop + VZEROUPPER + +mulAvxTwo_5x2Xor_end: + RET + +// func mulAvxTwo_5x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R9 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + ADDQ R11, R9 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R9), Y2 + VMOVDQU 32(R9), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + 
VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R9) + VMOVDQU Y3, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x2_64Xor_end: + RET + // func mulAvxTwo_5x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 @@ -8917,11 +17128,6 @@ TEXT ·mulAvxTwo_5x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_5x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -8932,20 +17138,17 @@ mulAvxTwo_5x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -9067,48 +17270,46 @@ mulAvxTwo_5x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 38 YMM used + // Destination kept in GP registers + // Full registers estimated 70 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_5x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), AX - MOVQ out_base+48(FP), R8 - MOVQ out_base+48(FP), R8 - MOVQ start+72(FP), R9 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 // Add start offset to input - ADDQ R9, DX - ADDQ R9, BX - ADDQ R9, SI - ADDQ R9, DI - ADDQ R9, AX - MOVQ $0x0000000f, R10 - MOVQ R10, X6 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R10 - SHRQ $0x06, R10 mulAvxTwo_5x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 
bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9121,35 +17322,29 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9188,9 +17383,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9229,9 +17424,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9270,9 +17465,9 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -9311,25 +17506,479 @@ mulAvxTwo_5x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R8), R11 - VMOVDQU Y0, (R11)(R9*1) - VMOVDQU Y1, 32(R11)(R9*1) - MOVQ 24(R8), R11 - VMOVDQU Y2, (R11)(R9*1) - VMOVDQU Y3, 32(R11)(R9*1) - MOVQ 48(R8), R11 - VMOVDQU Y4, (R11)(R9*1) - VMOVDQU Y5, 32(R11)(R9*1) + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 // Prepare for next loop - ADDQ $0x40, R9 - DECQ R10 + DECQ AX JNZ mulAvxTwo_5x3_64_loop VZEROUPPER mulAvxTwo_5x3_64_end: RET +// func mulAvxTwo_5x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_5x3Xor_loop: + // Load and 
process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R9), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x3Xor_loop + VZEROUPPER + +mulAvxTwo_5x3Xor_end: + RET + +// func mulAvxTwo_5x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_5x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R9 + MOVQ start+72(FP), R12 
+ + // Add start offset to output + ADDQ R12, R10 + ADDQ R12, R11 + ADDQ R12, R9 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + VMOVDQU (R9), Y4 + VMOVDQU 32(R9), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + 
VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y4, (R9) + VMOVDQU Y5, 32(R9) + ADDQ $0x40, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_5x3_64Xor_end: + RET + // func mulAvxTwo_5x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 @@ -9371,12 +18020,6 @@ TEXT ·mulAvxTwo_5x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_5x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -9387,26 +18030,22 @@ mulAvxTwo_5x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -9550,6 +18189,224 @@ mulAvxTwo_5x4_loop: mulAvxTwo_5x4_end: RET +// func mulAvxTwo_5x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 49 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R9 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R9 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + 
ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_5x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R9), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, 
(R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x4Xor_loop + VZEROUPPER + +mulAvxTwo_5x4Xor_end: + RET + // func mulAvxTwo_5x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 @@ -9593,13 +18450,6 @@ TEXT ·mulAvxTwo_5x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_5x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -9610,32 +18460,27 @@ mulAvxTwo_5x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -9805,6 +18650,259 @@ mulAvxTwo_5x5_loop: mulAvxTwo_5x5_end: RET +// func mulAvxTwo_5x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 60 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R9 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R9 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_5x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R9), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR 
Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x5Xor_loop + VZEROUPPER + +mulAvxTwo_5x5Xor_end: + RET + // func mulAvxTwo_5x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 @@ -9850,14 +18948,6 @@ TEXT ·mulAvxTwo_5x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_5x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - 
VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -9868,38 +18958,32 @@ mulAvxTwo_5x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -10095,6 +19179,294 @@ mulAvxTwo_5x6_loop: mulAvxTwo_5x6_end: RET +// func mulAvxTwo_5x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 71 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R9 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R9 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_5x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R9), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB 
Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x6Xor_loop + VZEROUPPER + +mulAvxTwo_5x6Xor_end: 
+ RET + // func mulAvxTwo_5x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 @@ -10142,15 +19514,6 @@ TEXT ·mulAvxTwo_5x7(SB), NOSPLIT, $8-88 VPBROADCASTB X7, Y7 mulAvxTwo_5x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -10161,44 +19524,37 @@ mulAvxTwo_5x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -10420,6 +19776,329 @@ mulAvxTwo_5x7_loop: mulAvxTwo_5x7_end: RET +// func mulAvxTwo_5x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_5x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R15), Y5 + VMOVDQU 
320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, 
Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_5x7Xor_loop + VZEROUPPER + +mulAvxTwo_5x7Xor_end: + RET + // func mulAvxTwo_5x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 @@ -10471,16 +20150,6 @@ TEXT ·mulAvxTwo_5x8(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_5x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -10491,50 +20160,42 @@ mulAvxTwo_5x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (BX), Y11 @@ -10782,6 +20443,366 @@ mulAvxTwo_5x8_loop: mulAvxTwo_5x8_end: RET +// func mulAvxTwo_5x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 93 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x8Xor_end + MOVQ in_base+24(FP), AX + 
MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), AX + MOVQ out_base+48(FP), R8 + MOVQ (R8), R9 + MOVQ 24(R8), R10 + MOVQ 48(R8), R11 + MOVQ 72(R8), R12 + MOVQ 96(R8), R13 + MOVQ 120(R8), R14 + MOVQ 144(R8), R15 + MOVQ 168(R8), R8 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R8 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_5x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU (R9), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU (R13), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU (R14), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU (R15), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU (R8), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 
1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (AX), Y11 + ADDQ $0x20, AX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + VMOVDQU Y0, (R9) + ADDQ $0x20, R9 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + VMOVDQU Y4, (R13) + ADDQ $0x20, R13 + VMOVDQU Y5, 
(R14) + ADDQ $0x20, R14 + VMOVDQU Y6, (R15) + ADDQ $0x20, R15 + VMOVDQU Y7, (R8) + ADDQ $0x20, R8 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_5x8Xor_loop + VZEROUPPER + +mulAvxTwo_5x8Xor_end: + RET + // func mulAvxTwo_5x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 @@ -10813,17 +20834,6 @@ TEXT ·mulAvxTwo_5x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_5x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -10834,56 +20844,47 @@ mulAvxTwo_5x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -11158,6 +21159,389 @@ mulAvxTwo_5x9_loop: mulAvxTwo_5x9_end: RET +// func mulAvxTwo_5x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 104 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_5x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 
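+ // 5x9 keeps its destinations on the stack: each output pointer is reloaded from out_base (R9) and indexed by the running offset (R10) before the existing output bytes are XORed with the new products.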
+ VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + 
VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), 
R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x9Xor_loop + VZEROUPPER + +mulAvxTwo_5x9Xor_end: + RET + // func mulAvxTwo_5x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 @@ -11189,18 +21573,6 @@ TEXT ·mulAvxTwo_5x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_5x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -11211,62 +21583,52 @@ mulAvxTwo_5x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -11567,6 +21929,423 @@ mulAvxTwo_5x10_loop: mulAvxTwo_5x10_end: RET +// func mulAvxTwo_5x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_5x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 115 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_5x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), DX + MOVQ out_base+48(FP), R9 + MOVQ start+72(FP), R10 + + // Add start offset to input + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_5x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R9), R11 + VMOVDQU (R11)(R10*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB 
Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R9), R11 + VMOVDQU (R11)(R10*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R9), R11 + VMOVDQU (R11)(R10*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R9), R11 + VMOVDQU (R11)(R10*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R9), R11 + VMOVDQU (R11)(R10*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R9), R11 + VMOVDQU (R11)(R10*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R9), R11 + VMOVDQU (R11)(R10*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R9), R11 + VMOVDQU (R11)(R10*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R9), R11 + VMOVDQU (R11)(R10*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R9), R11 + VMOVDQU (R11)(R10*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), 
Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, 
Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R9), R11 + VMOVDQU Y0, (R11)(R10*1) + MOVQ 24(R9), R11 + VMOVDQU Y1, (R11)(R10*1) + MOVQ 48(R9), R11 + VMOVDQU Y2, (R11)(R10*1) + MOVQ 72(R9), R11 + VMOVDQU Y3, (R11)(R10*1) + MOVQ 96(R9), R11 + VMOVDQU Y4, (R11)(R10*1) + MOVQ 120(R9), R11 + VMOVDQU Y5, (R11)(R10*1) + MOVQ 144(R9), R11 + VMOVDQU Y6, (R11)(R10*1) + MOVQ 168(R9), R11 + VMOVDQU Y7, (R11)(R10*1) + MOVQ 192(R9), R11 + VMOVDQU Y8, (R11)(R10*1) + MOVQ 216(R9), R11 + VMOVDQU Y9, (R11)(R10*1) + + // Prepare for next loop + ADDQ $0x20, R10 + DECQ AX + JNZ mulAvxTwo_5x10Xor_loop + VZEROUPPER + +mulAvxTwo_5x10Xor_end: + RET + // func mulAvxTwo_6x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 @@ -11616,9 +22395,6 @@ TEXT ·mulAvxTwo_6x1(SB), NOSPLIT, $0-88 VPBROADCASTB X13, Y13 mulAvxTwo_6x1_loop: - // Clear 1 outputs - VPXOR Y12, Y12, Y12 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (DX), Y14 ADDQ $0x20, DX @@ -11627,8 +22403,7 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y0, Y14 VPSHUFB Y15, Y1, Y15 - VPXOR Y14, Y15, Y14 - VPXOR Y14, Y12, Y12 + VPXOR Y14, Y15, Y12 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y14 @@ -11701,46 +22476,44 @@ mulAvxTwo_6x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 16 YMM used + // Destination kept in GP registers + // Full registers estimated 30 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, AX + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX MOVQ $0x0000000f, R11 MOVQ R11, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 mulAvxTwo_6x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11753,15 +22526,13 @@ 
mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11780,9 +22551,9 @@ mulAvxTwo_6x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11801,9 +22572,9 @@ mulAvxTwo_6x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11822,9 +22593,9 @@ mulAvxTwo_6x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11843,9 +22614,9 @@ mulAvxTwo_6x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -11864,19 +22635,327 @@ mulAvxTwo_6x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 + DECQ AX JNZ mulAvxTwo_6x1_64_loop VZEROUPPER mulAvxTwo_6x1_64_end: RET +// func mulAvxTwo_6x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x1Xor(SB), NOSPLIT, $0-88 + // Loading all tables to registers + // Destination kept in GP registers + // Full registers estimated 16 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1Xor_end + VMOVDQU (CX), Y0 + VMOVDQU 32(CX), Y1 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + MOVQ in_base+24(FP), CX + MOVQ (CX), DX + MOVQ 24(CX), BX + MOVQ 48(CX), SI + MOVQ 72(CX), DI + MOVQ 96(CX), R8 + MOVQ 120(CX), CX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R9 + MOVQ start+72(FP), R10 + + // Add start offset to output + ADDQ R10, R9 + + // Add start offset to input + ADDQ R10, DX + ADDQ R10, BX + ADDQ R10, SI + ADDQ R10, DI + ADDQ R10, R8 + ADDQ R10, CX + MOVQ $0x0000000f, R10 + MOVQ R10, X13 + VPBROADCASTB X13, Y13 + +mulAvxTwo_6x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (DX), Y14 + ADDQ $0x20, DX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VMOVDQU (R9), Y12 + VPSHUFB Y14, Y0, Y14 + VPSHUFB Y15, Y1, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (BX), Y14 + ADDQ $0x20, BX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y2, Y14 + VPSHUFB Y15, Y3, Y15 + VPXOR Y14, Y15, Y14 + 
VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (SI), Y14 + ADDQ $0x20, SI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y4, Y14 + VPSHUFB Y15, Y5, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (DI), Y14 + ADDQ $0x20, DI + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y6, Y14 + VPSHUFB Y15, Y7, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R8), Y14 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y8, Y14 + VPSHUFB Y15, Y9, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (CX), Y14 + ADDQ $0x20, CX + VPSRLQ $0x04, Y14, Y15 + VPAND Y13, Y14, Y14 + VPAND Y13, Y15, Y15 + VPSHUFB Y14, Y10, Y14 + VPSHUFB Y15, Y11, Y15 + VPXOR Y14, Y15, Y14 + VPXOR Y14, Y12, Y12 + + // Store 1 outputs + VMOVDQU Y12, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1Xor_loop + VZEROUPPER + +mulAvxTwo_6x1Xor_end: + RET + +// func mulAvxTwo_6x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 30 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R10 + MOVQ start+72(FP), R11 + + // Add start offset to output + ADDQ R11, R10 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R11 + MOVQ R11, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R10), Y0 + VMOVDQU 32(R10), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + 
VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R10) + VMOVDQU Y1, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x1_64Xor_end: + RET + // func mulAvxTwo_6x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 @@ -11916,10 +22995,6 @@ TEXT ·mulAvxTwo_6x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_6x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -11930,14 +23005,12 @@ mulAvxTwo_6x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -12052,48 +23125,46 @@ mulAvxTwo_6x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 31 YMM used + // Destination kept in GP registers + // Full registers estimated 57 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, AX - MOVQ $0x0000000f, R11 - MOVQ R11, X4 + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 mulAvxTwo_6x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - 
VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12106,25 +23177,21 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12153,9 +23220,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12184,9 +23251,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12215,9 +23282,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12246,9 +23313,9 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -12277,22 +23344,438 @@ mulAvxTwo_6x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) - MOVQ 24(R9), R12 - VMOVDQU Y2, (R12)(R10*1) - VMOVDQU Y3, 32(R12)(R10*1) + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 + DECQ AX JNZ mulAvxTwo_6x2_64_loop VZEROUPPER mulAvxTwo_6x2_64_end: RET +// func mulAvxTwo_6x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 31 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_6x2Xor_loop: + // Load and process 32 
bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R10), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2Xor_loop + VZEROUPPER + +mulAvxTwo_6x2Xor_end: + RET + +// func mulAvxTwo_6x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R10 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + ADDQ R12, R10 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X4 + VPBROADCASTB X4, Y4 + 
+mulAvxTwo_6x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R10), Y2 + VMOVDQU 32(R10), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + 
VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R10) + VMOVDQU Y3, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x2_64Xor_end: + RET + // func mulAvxTwo_6x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 @@ -12334,11 +23817,6 @@ TEXT ·mulAvxTwo_6x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_6x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -12349,20 +23827,17 @@ mulAvxTwo_6x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -12509,50 +23984,48 @@ mulAvxTwo_6x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 44 YMM used + // Destination kept in GP registers + // Full registers estimated 82 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_6x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), AX - MOVQ out_base+48(FP), R9 - MOVQ out_base+48(FP), R9 - MOVQ start+72(FP), R10 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 // Add start offset to input - ADDQ R10, DX - ADDQ R10, BX - ADDQ R10, SI - ADDQ R10, DI - ADDQ R10, R8 - ADDQ R10, AX - MOVQ $0x0000000f, R11 - MOVQ R11, X6 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R11 - SHRQ $0x06, R11 mulAvxTwo_6x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12565,35 +24038,29 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, 
Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12632,9 +24099,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12673,9 +24140,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12714,9 +24181,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12755,9 +24222,9 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -12796,25 +24263,549 @@ mulAvxTwo_6x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R9), R12 - VMOVDQU Y0, (R12)(R10*1) - VMOVDQU Y1, 32(R12)(R10*1) - MOVQ 24(R9), R12 - VMOVDQU Y2, (R12)(R10*1) - VMOVDQU Y3, 32(R12)(R10*1) - MOVQ 48(R9), R12 - VMOVDQU Y4, (R12)(R10*1) - VMOVDQU Y5, 32(R12)(R10*1) + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 // Prepare for next loop - ADDQ $0x40, R10 - DECQ R11 + DECQ AX JNZ mulAvxTwo_6x3_64_loop VZEROUPPER mulAvxTwo_6x3_64_end: RET +// func mulAvxTwo_6x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 44 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X3 + VPBROADCASTB X3, Y3 + 
+mulAvxTwo_6x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R10), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3Xor_loop + VZEROUPPER + +mulAvxTwo_6x3Xor_end: + RET + +// func mulAvxTwo_6x3_64Xor(matrix []byte, in [][]byte, 
out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 82 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_6x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R10 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R11 + ADDQ R13, R12 + ADDQ R13, R10 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + VMOVDQU (R10), Y4 + VMOVDQU 32(R10), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, 
Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y4, (R10) + VMOVDQU Y5, 32(R10) + ADDQ $0x40, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_6x3_64Xor_end: + RET + // func mulAvxTwo_6x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 @@ -12858,12 +24849,6 @@ TEXT ·mulAvxTwo_6x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_6x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -12874,26 
+24859,22 @@ mulAvxTwo_6x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -13068,6 +25049,257 @@ mulAvxTwo_6x4_loop: mulAvxTwo_6x4_end: RET +// func mulAvxTwo_6x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 57 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R10 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R10 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_6x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R10), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + 
VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x4Xor_loop + VZEROUPPER + +mulAvxTwo_6x4Xor_end: + RET + // func mulAvxTwo_6x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 @@ -13113,13 +25345,6 @@ TEXT ·mulAvxTwo_6x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_6x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -13130,32 +25355,27 @@ mulAvxTwo_6x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and 
process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -13362,6 +25582,298 @@ mulAvxTwo_6x5_loop: mulAvxTwo_6x5_end: RET +// func mulAvxTwo_6x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 70 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R10 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R10 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_6x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R10), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load 
and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x5Xor_loop + VZEROUPPER + +mulAvxTwo_6x5Xor_end: + RET + // func mulAvxTwo_6x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 @@ -13409,14 +25921,6 @@ TEXT ·mulAvxTwo_6x6(SB), NOSPLIT, $8-88 VPBROADCASTB X6, Y6 mulAvxTwo_6x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -13427,38 +25931,32 @@ mulAvxTwo_6x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, 
Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -13697,6 +26195,339 @@ mulAvxTwo_6x6_loop: mulAvxTwo_6x6_end: RET +// func mulAvxTwo_6x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 83 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_6x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes 
from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 
2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_6x6Xor_loop + VZEROUPPER + +mulAvxTwo_6x6Xor_end: + RET + // func mulAvxTwo_6x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 @@ -13748,15 +26579,6 @@ TEXT ·mulAvxTwo_6x7(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_6x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -13767,44 +26589,37 @@ mulAvxTwo_6x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 @@ -14075,6 +26890,382 @@ mulAvxTwo_6x7_loop: mulAvxTwo_6x7_end: RET +// func mulAvxTwo_6x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 96 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x7Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), AX + MOVQ out_base+48(FP), R9 + MOVQ (R9), R10 + MOVQ 24(R9), R11 + MOVQ 48(R9), R12 + MOVQ 72(R9), R13 + MOVQ 96(R9), R14 + MOVQ 120(R9), R15 + MOVQ 144(R9), R9 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R9 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_6x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU (R10), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, 
Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU (R13), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU (R14), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU (R15), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU (R9), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + 
VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (AX), Y10 + ADDQ $0x20, AX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + VMOVDQU Y0, (R10) + ADDQ $0x20, R10 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + VMOVDQU Y3, (R13) + ADDQ $0x20, R13 + VMOVDQU Y4, (R14) + ADDQ $0x20, R14 + VMOVDQU Y5, (R15) + ADDQ $0x20, R15 + VMOVDQU Y6, (R9) + ADDQ $0x20, R9 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_6x7Xor_loop + VZEROUPPER + +mulAvxTwo_6x7Xor_end: + RET + // func mulAvxTwo_6x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 @@ -14108,16 +27299,6 @@ TEXT ·mulAvxTwo_6x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_6x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -14128,50 +27309,42 @@ mulAvxTwo_6x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR 
Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -14475,6 +27648,412 @@ mulAvxTwo_6x8_loop: mulAvxTwo_6x8_end: RET +// func mulAvxTwo_6x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 109 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_6x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 
1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, 
Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x8Xor_loop + VZEROUPPER + +mulAvxTwo_6x8Xor_end: + RET + // func mulAvxTwo_6x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 @@ -14508,17 +28087,6 @@ TEXT ·mulAvxTwo_6x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_6x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ 
-14529,56 +28097,47 @@ mulAvxTwo_6x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -14914,6 +28473,452 @@ mulAvxTwo_6x9_loop: mulAvxTwo_6x9_end: RET +// func mulAvxTwo_6x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 122 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_6x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R10), R12 + 
VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, 
Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 
+ VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x9Xor_loop + VZEROUPPER + +mulAvxTwo_6x9Xor_end: + RET + // func mulAvxTwo_6x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 @@ -14947,18 +28952,6 @@ TEXT ·mulAvxTwo_6x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_6x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -14969,62 +28962,52 @@ mulAvxTwo_6x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -15392,6 +29375,492 @@ mulAvxTwo_6x10_loop: mulAvxTwo_6x10_end: RET +// func mulAvxTwo_6x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_6x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 135 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), 
CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_6x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), DX + MOVQ out_base+48(FP), R10 + MOVQ start+72(FP), R11 + + // Add start offset to input + ADDQ R11, BX + ADDQ R11, SI + ADDQ R11, DI + ADDQ R11, R8 + ADDQ R11, R9 + ADDQ R11, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_6x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R10), R12 + VMOVDQU (R12)(R11*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R10), R12 + VMOVDQU (R12)(R11*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R10), R12 + VMOVDQU (R12)(R11*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R10), R12 + VMOVDQU (R12)(R11*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R10), R12 + VMOVDQU (R12)(R11*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R10), R12 + VMOVDQU (R12)(R11*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R10), R12 + VMOVDQU (R12)(R11*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R10), R12 + VMOVDQU (R12)(R11*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R10), R12 + VMOVDQU (R12)(R11*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R10), R12 + VMOVDQU (R12)(R11*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + 
VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 
bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R10), R12 + VMOVDQU Y0, (R12)(R11*1) + MOVQ 24(R10), R12 + VMOVDQU Y1, (R12)(R11*1) + MOVQ 48(R10), R12 + VMOVDQU Y2, (R12)(R11*1) + MOVQ 72(R10), R12 + VMOVDQU Y3, (R12)(R11*1) + MOVQ 96(R10), R12 + VMOVDQU Y4, (R12)(R11*1) + MOVQ 120(R10), R12 + VMOVDQU Y5, (R12)(R11*1) + MOVQ 144(R10), R12 + VMOVDQU Y6, (R12)(R11*1) + MOVQ 168(R10), R12 + VMOVDQU Y7, (R12)(R11*1) + MOVQ 192(R10), R12 + VMOVDQU Y8, (R12)(R11*1) + MOVQ 216(R10), R12 + VMOVDQU Y9, 
(R12)(R11*1) + + // Prepare for next loop + ADDQ $0x20, R11 + DECQ AX + JNZ mulAvxTwo_6x10Xor_loop + VZEROUPPER + +mulAvxTwo_6x10Xor_end: + RET + // func mulAvxTwo_7x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 @@ -15431,9 +29900,6 @@ TEXT ·mulAvxTwo_7x1(SB), NOSPLIT, $0-88 VPBROADCASTB X1, Y1 mulAvxTwo_7x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX @@ -15444,8 +29910,7 @@ mulAvxTwo_7x1_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPXOR Y2, Y3, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -15541,48 +30006,46 @@ mulAvxTwo_7x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 18 YMM used + // Destination kept in GP registers + // Full registers estimated 34 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 // Add start offset to input - ADDQ R11, DX - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, AX + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX MOVQ $0x0000000f, R12 MOVQ R12, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 mulAvxTwo_7x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15595,15 +30058,13 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15622,9 +30083,9 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15643,9 +30104,9 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15664,9 +30125,9 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU 
(R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15685,9 +30146,9 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15706,9 +30167,9 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -15727,19 +30188,365 @@ mulAvxTwo_7x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 + DECQ AX JNZ mulAvxTwo_7x1_64_loop VZEROUPPER mulAvxTwo_7x1_64_end: RET +// func mulAvxTwo_7x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 18 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_7x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, 
Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1Xor_loop + VZEROUPPER + +mulAvxTwo_7x1Xor_end: + RET + +// func mulAvxTwo_7x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 34 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R11 + MOVQ start+72(FP), R12 + + // Add start offset to output + ADDQ R12, R11 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R12 + MOVQ R12, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R11), Y0 + VMOVDQU 32(R11), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, 
Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R11) + VMOVDQU Y1, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x1_64Xor_end: + RET + // func mulAvxTwo_7x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 @@ -15781,10 +30588,6 @@ TEXT ·mulAvxTwo_7x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_7x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -15795,14 +30598,12 @@ mulAvxTwo_7x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -15936,50 +30737,48 @@ mulAvxTwo_7x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 35 YMM used + // Destination kept in GP registers + // Full registers estimated 65 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 // Add start offset to input - ADDQ R11, DX - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, AX - MOVQ $0x0000000f, R12 - MOVQ R12, X4 + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 mulAvxTwo_7x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), 
Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -15992,25 +30791,21 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16039,9 +30834,9 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16070,9 +30865,9 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16101,9 +30896,9 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16132,9 +30927,9 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16163,9 +30958,9 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -16194,22 +30989,492 @@ mulAvxTwo_7x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) - MOVQ 24(R10), R13 - VMOVDQU Y2, (R13)(R11*1) - VMOVDQU Y3, 32(R13)(R11*1) + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 + DECQ AX JNZ mulAvxTwo_7x2_64_loop VZEROUPPER mulAvxTwo_7x2_64_end: RET +// func mulAvxTwo_7x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 35 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI 
+ ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_7x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R11), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2Xor_loop + VZEROUPPER + +mulAvxTwo_7x2Xor_end: + RET + +// func mulAvxTwo_7x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ 
matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R11 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + ADDQ R13, R11 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R11), Y2 + VMOVDQU 32(R11), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, 
Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R11) + VMOVDQU Y3, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x2_64Xor_end: + RET + // func mulAvxTwo_7x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 @@ -16253,11 +31518,6 @@ TEXT ·mulAvxTwo_7x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_7x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -16268,20 +31528,17 @@ mulAvxTwo_7x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -16453,52 +31710,50 @@ mulAvxTwo_7x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 50 YMM used + // Destination kept in GP registers + // Full registers estimated 94 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_7x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 
144(AX), AX - MOVQ out_base+48(FP), R10 - MOVQ out_base+48(FP), R10 - MOVQ start+72(FP), R11 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 // Add start offset to input - ADDQ R11, DX - ADDQ R11, BX - ADDQ R11, SI - ADDQ R11, DI - ADDQ R11, R8 - ADDQ R11, R9 - ADDQ R11, AX - MOVQ $0x0000000f, R12 - MOVQ R12, X6 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R12 - SHRQ $0x06, R12 mulAvxTwo_7x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16511,35 +31766,29 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16578,9 +31827,9 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16619,9 +31868,9 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16660,9 +31909,9 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16701,9 +31950,9 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16742,9 +31991,9 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - 
ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -16783,25 +32032,619 @@ mulAvxTwo_7x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R10), R13 - VMOVDQU Y0, (R13)(R11*1) - VMOVDQU Y1, 32(R13)(R11*1) - MOVQ 24(R10), R13 - VMOVDQU Y2, (R13)(R11*1) - VMOVDQU Y3, 32(R13)(R11*1) - MOVQ 48(R10), R13 - VMOVDQU Y4, (R13)(R11*1) - VMOVDQU Y5, 32(R13)(R11*1) + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 // Prepare for next loop - ADDQ $0x40, R11 - DECQ R12 + DECQ AX JNZ mulAvxTwo_7x3_64_loop VZEROUPPER mulAvxTwo_7x3_64_end: RET +// func mulAvxTwo_7x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 50 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_7x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R11), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + 
VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3Xor_loop + VZEROUPPER + +mulAvxTwo_7x3Xor_end: + RET + +// func mulAvxTwo_7x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 94 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_7x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R11 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R12 + ADDQ R14, R13 + ADDQ R14, R11 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_7x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + VMOVDQU (R11), Y4 + VMOVDQU 32(R11), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, 
Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + 
VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y4, (R11) + VMOVDQU Y5, 32(R11) + ADDQ $0x40, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_7x3_64Xor_end: + RET + // func mulAvxTwo_7x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 @@ -16847,12 +32690,6 @@ TEXT ·mulAvxTwo_7x4(SB), NOSPLIT, $0-88 VPBROADCASTB X4, Y4 mulAvxTwo_7x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -16863,26 +32700,22 @@ mulAvxTwo_7x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, 
Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -17088,6 +32921,290 @@ mulAvxTwo_7x4_loop: mulAvxTwo_7x4_end: RET +// func mulAvxTwo_7x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x4Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 65 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R11 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R11 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_7x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R11), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, 
Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x4Xor_loop + VZEROUPPER + +mulAvxTwo_7x4Xor_end: + RET + // func mulAvxTwo_7x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 @@ -17135,13 +33252,6 @@ TEXT ·mulAvxTwo_7x5(SB), NOSPLIT, $8-88 VPBROADCASTB X5, Y5 mulAvxTwo_7x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -17152,32 +33262,27 @@ mulAvxTwo_7x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, 
Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -17421,6 +33526,337 @@ mulAvxTwo_7x5_loop: mulAvxTwo_7x5_end: RET +// func mulAvxTwo_7x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 80 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_7x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, 
Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + 
VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_7x5Xor_loop + VZEROUPPER + +mulAvxTwo_7x5Xor_end: + RET + // func mulAvxTwo_7x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 @@ -17472,14 +33908,6 @@ TEXT ·mulAvxTwo_7x6(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_7x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -17490,38 +33918,32 @@ mulAvxTwo_7x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 @@ -17803,6 +34225,386 @@ mulAvxTwo_7x6_loop: mulAvxTwo_7x6_end: RET +// func mulAvxTwo_7x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 95 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x6Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), AX + MOVQ out_base+48(FP), R10 + MOVQ (R10), R11 + MOVQ 24(R10), R12 + MOVQ 48(R10), R13 + MOVQ 72(R10), R14 + MOVQ 96(R10), R15 + MOVQ 120(R10), R10 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R10 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_7x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU (R11), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU (R14), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU (R15), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), 
Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU (R10), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), 
Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (AX), Y9 + ADDQ $0x20, AX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + VMOVDQU Y0, (R11) + ADDQ $0x20, R11 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + VMOVDQU Y3, (R14) + ADDQ $0x20, R14 + VMOVDQU Y4, (R15) + ADDQ $0x20, R15 + VMOVDQU Y5, (R10) + ADDQ $0x20, R10 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_7x6Xor_loop + VZEROUPPER + +mulAvxTwo_7x6Xor_end: + RET + // func mulAvxTwo_7x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 @@ -17838,15 +34640,6 @@ TEXT ·mulAvxTwo_7x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_7x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -17857,44 +34650,37 @@ mulAvxTwo_7x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, 
Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -18215,6 +35001,423 @@ mulAvxTwo_7x7_loop: mulAvxTwo_7x7_end: RET +// func mulAvxTwo_7x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_7x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, 
Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + 
VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x7Xor_loop + VZEROUPPER + +mulAvxTwo_7x7Xor_end: + RET + // func mulAvxTwo_7x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 @@ -18250,16 +35453,6 @@ TEXT ·mulAvxTwo_7x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_7x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -18270,50 +35463,42 @@ mulAvxTwo_7x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 
- VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -18672,6 +35857,469 @@ mulAvxTwo_7x8_loop: mulAvxTwo_7x8_end: RET +// func mulAvxTwo_7x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 125 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_7x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 
+ VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 
+ VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + 
MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x8Xor_loop + VZEROUPPER + +mulAvxTwo_7x8Xor_end: + RET + // func mulAvxTwo_7x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 @@ -18707,17 +36355,6 @@ TEXT ·mulAvxTwo_7x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_7x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -18728,56 +36365,47 @@ mulAvxTwo_7x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -19174,6 +36802,515 @@ mulAvxTwo_7x9_loop: mulAvxTwo_7x9_end: RET +// func mulAvxTwo_7x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 140 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_7x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), 
Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + 
VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 
+ ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, (R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x9Xor_loop + VZEROUPPER + +mulAvxTwo_7x9Xor_end: + RET + // func mulAvxTwo_7x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 @@ -19209,18 +37346,6 @@ TEXT ·mulAvxTwo_7x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 
mulAvxTwo_7x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -19231,62 +37356,52 @@ mulAvxTwo_7x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -19721,6 +37836,561 @@ mulAvxTwo_7x10_loop: mulAvxTwo_7x10_end: RET +// func mulAvxTwo_7x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_7x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 155 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_7x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), DX + MOVQ out_base+48(FP), R11 + MOVQ start+72(FP), R12 + + // Add start offset to input + ADDQ R12, BX + ADDQ R12, SI + ADDQ R12, DI + ADDQ R12, R8 + ADDQ R12, R9 + ADDQ R12, R10 + ADDQ R12, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_7x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R11), R13 + VMOVDQU (R13)(R12*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R11), R13 + VMOVDQU (R13)(R12*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R11), R13 + VMOVDQU (R13)(R12*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R11), R13 + VMOVDQU (R13)(R12*1), Y3 + 
VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R11), R13 + VMOVDQU (R13)(R12*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R11), R13 + VMOVDQU (R13)(R12*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R11), R13 + VMOVDQU (R13)(R12*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R11), R13 + VMOVDQU (R13)(R12*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R11), R13 + VMOVDQU (R13)(R12*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R11), R13 + VMOVDQU (R13)(R12*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, 
Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, 
Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R11), R13 + VMOVDQU Y0, (R13)(R12*1) + MOVQ 24(R11), R13 + VMOVDQU Y1, (R13)(R12*1) + MOVQ 48(R11), R13 + VMOVDQU Y2, 
(R13)(R12*1) + MOVQ 72(R11), R13 + VMOVDQU Y3, (R13)(R12*1) + MOVQ 96(R11), R13 + VMOVDQU Y4, (R13)(R12*1) + MOVQ 120(R11), R13 + VMOVDQU Y5, (R13)(R12*1) + MOVQ 144(R11), R13 + VMOVDQU Y6, (R13)(R12*1) + MOVQ 168(R11), R13 + VMOVDQU Y7, (R13)(R12*1) + MOVQ 192(R11), R13 + VMOVDQU Y8, (R13)(R12*1) + MOVQ 216(R11), R13 + VMOVDQU Y9, (R13)(R12*1) + + // Prepare for next loop + ADDQ $0x20, R12 + DECQ AX + JNZ mulAvxTwo_7x10Xor_loop + VZEROUPPER + +mulAvxTwo_7x10Xor_end: + RET + // func mulAvxTwo_8x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 @@ -19762,9 +38432,6 @@ TEXT ·mulAvxTwo_8x1(SB), NOSPLIT, $0-88 VPBROADCASTB X1, Y1 mulAvxTwo_8x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX @@ -19775,8 +38442,7 @@ mulAvxTwo_8x1_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPXOR Y2, Y3, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -19885,50 +38551,48 @@ mulAvxTwo_8x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 20 YMM used + // Destination kept in GP registers + // Full registers estimated 38 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 // Add start offset to input - ADDQ R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX MOVQ $0x0000000f, R13 MOVQ R13, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 mulAvxTwo_8x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -19941,15 +38605,13 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -19968,9 +38630,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -19989,9 +38651,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and 
process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -20010,9 +38672,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -20031,9 +38693,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -20052,9 +38714,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -20073,9 +38735,9 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -20094,19 +38756,403 @@ mulAvxTwo_8x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 + DECQ AX JNZ mulAvxTwo_8x1_64_loop VZEROUPPER mulAvxTwo_8x1_64_end: RET +// func mulAvxTwo_8x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 20 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_8x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 
3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1Xor_loop + VZEROUPPER + +mulAvxTwo_8x1Xor_end: + RET + +// func mulAvxTwo_8x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 38 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R12 + MOVQ start+72(FP), R13 + + // Add start offset to output + ADDQ R13, R12 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R13 + MOVQ R13, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R12), Y0 + VMOVDQU 32(R12), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ 
$0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R12) + VMOVDQU Y1, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x1_64Xor_end: + RET + // func mulAvxTwo_8x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 @@ -20150,10 +39196,6 @@ TEXT ·mulAvxTwo_8x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_8x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -20164,14 +39206,12 @@ mulAvxTwo_8x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -20324,52 +39364,50 @@ mulAvxTwo_8x2_end: // 
Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 39 YMM used + // Destination kept in GP registers + // Full registers estimated 73 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 // Add start offset to input - ADDQ R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX - MOVQ $0x0000000f, R13 - MOVQ R13, X4 + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 mulAvxTwo_8x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20382,25 +39420,21 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20429,9 +39463,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20460,9 +39494,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20491,9 +39525,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20522,9 +39556,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 VPSRLQ $0x04, Y9, 
Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20553,9 +39587,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20584,9 +39618,9 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -20615,22 +39649,546 @@ mulAvxTwo_8x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) - MOVQ 24(R11), R14 - VMOVDQU Y2, (R14)(R12*1) - VMOVDQU Y3, 32(R14)(R12*1) + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 + DECQ AX JNZ mulAvxTwo_8x2_64_loop VZEROUPPER mulAvxTwo_8x2_64_end: RET +// func mulAvxTwo_8x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 39 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_8x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R12), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + 
VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2Xor_loop + VZEROUPPER + +mulAvxTwo_8x2Xor_end: + RET + +// func mulAvxTwo_8x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R12 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + ADDQ R14, R12 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R12), Y2 + VMOVDQU 32(R12), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 
+ VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes 
from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R12) + VMOVDQU Y3, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x2_64Xor_end: + RET + // func mulAvxTwo_8x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 @@ -20676,11 +40234,6 @@ TEXT ·mulAvxTwo_8x3(SB), NOSPLIT, $0-88 VPBROADCASTB X3, Y3 mulAvxTwo_8x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -20691,20 +40244,17 @@ mulAvxTwo_8x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -20901,54 +40451,52 @@ mulAvxTwo_8x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x3_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 56 YMM used + // Destination kept in GP registers + // Full registers estimated 106 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_8x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), AX - MOVQ out_base+48(FP), R11 - MOVQ out_base+48(FP), R11 - MOVQ start+72(FP), R12 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 // Add start offset to input - ADDQ 
R12, DX - ADDQ R12, BX - ADDQ R12, SI - ADDQ R12, DI - ADDQ R12, R8 - ADDQ R12, R9 - ADDQ R12, R10 - ADDQ R12, AX - MOVQ $0x0000000f, R13 - MOVQ R13, X6 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R13 - SHRQ $0x06, R13 mulAvxTwo_8x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -20961,35 +40509,29 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21028,9 +40570,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21069,9 +40611,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21110,9 +40652,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21151,9 +40693,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21192,9 +40734,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y11 - VMOVDQU 32(R10), Y13 - ADDQ $0x40, R10 + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21233,9 +40775,9 @@ mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -21274,25 +40816,689 @@ 
mulAvxTwo_8x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R11), R14 - VMOVDQU Y0, (R14)(R12*1) - VMOVDQU Y1, 32(R14)(R12*1) - MOVQ 24(R11), R14 - VMOVDQU Y2, (R14)(R12*1) - VMOVDQU Y3, 32(R14)(R12*1) - MOVQ 48(R11), R14 - VMOVDQU Y4, (R14)(R12*1) - VMOVDQU Y5, 32(R14)(R12*1) + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 // Prepare for next loop - ADDQ $0x40, R12 - DECQ R13 + DECQ AX JNZ mulAvxTwo_8x3_64_loop VZEROUPPER mulAvxTwo_8x3_64_end: RET +// func mulAvxTwo_8x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 56 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_8x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R12), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB 
Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3Xor_loop + VZEROUPPER + +mulAvxTwo_8x3Xor_end: + RET + +// func mulAvxTwo_8x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x3_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 106 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_8x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R12 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R13 + ADDQ R15, R14 + ADDQ R15, R12 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + 
VPBROADCASTB X6, Y6 + +mulAvxTwo_8x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 + VMOVDQU (R12), Y4 + VMOVDQU 32(R12), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + 
VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + 
VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y4, (R12) + VMOVDQU Y5, 32(R12) + ADDQ $0x40, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_8x3_64Xor_end: + RET + // func mulAvxTwo_8x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 @@ -21340,12 +41546,6 @@ TEXT ·mulAvxTwo_8x4(SB), NOSPLIT, $8-88 VPBROADCASTB X4, Y4 mulAvxTwo_8x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -21356,26 +41556,22 @@ mulAvxTwo_8x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -21612,6 +41808,323 @@ mulAvxTwo_8x4_loop: mulAvxTwo_8x4_end: RET +// func mulAvxTwo_8x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 73 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_8x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + 
VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 
+ VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_8x4Xor_loop + VZEROUPPER + +mulAvxTwo_8x4Xor_end: + RET + // func mulAvxTwo_8x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 @@ -21663,13 +42176,6 @@ TEXT ·mulAvxTwo_8x5(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_8x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -21680,32 +42186,27 @@ mulAvxTwo_8x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 @@ -21986,6 +42487,378 @@ mulAvxTwo_8x5_loop: mulAvxTwo_8x5_end: RET +// func mulAvxTwo_8x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 90 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x5Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), AX + MOVQ out_base+48(FP), R11 + MOVQ (R11), R12 + MOVQ 24(R11), R13 + MOVQ 48(R11), R14 + MOVQ 72(R11), R15 + MOVQ 96(R11), R11 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R11 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_8x5Xor_loop: + // Load and process 
32 bytes from input 0 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU (R12), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU (R14), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU (R15), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU (R11), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + 
VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (AX), Y8 + ADDQ $0x20, AX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + VMOVDQU Y0, (R12) + ADDQ $0x20, R12 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + VMOVDQU Y2, (R14) + ADDQ $0x20, R14 + VMOVDQU Y3, (R15) + ADDQ $0x20, R15 + VMOVDQU Y4, (R11) + ADDQ $0x20, R11 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_8x5Xor_loop + VZEROUPPER + +mulAvxTwo_8x5Xor_end: + RET + // func mulAvxTwo_8x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 @@ -22023,14 +42896,6 @@ TEXT ·mulAvxTwo_8x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_8x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs 
VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -22041,38 +42906,32 @@ mulAvxTwo_8x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -22398,6 +43257,422 @@ mulAvxTwo_8x6_loop: mulAvxTwo_8x6_end: RET +// func mulAvxTwo_8x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 107 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_8x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 
+ VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR 
Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x6Xor_loop + VZEROUPPER + +mulAvxTwo_8x6Xor_end: + RET + // func mulAvxTwo_8x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 @@ -22435,15 +43710,6 @@ TEXT ·mulAvxTwo_8x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_8x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -22454,44 +43720,37 @@ mulAvxTwo_8x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, 
Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -22861,6 +44120,474 @@ mulAvxTwo_8x7_loop: mulAvxTwo_8x7_end: RET +// func mulAvxTwo_8x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 124 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_8x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, 
Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, 
Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 
+ VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x7Xor_loop + VZEROUPPER + +mulAvxTwo_8x7Xor_end: + RET + // func mulAvxTwo_8x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 @@ -22898,16 +44625,6 @@ TEXT ·mulAvxTwo_8x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_8x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -22918,50 +44635,42 @@ mulAvxTwo_8x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -23375,6 +45084,526 @@ mulAvxTwo_8x8_loop: mulAvxTwo_8x8_end: RET +// func mulAvxTwo_8x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 141 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_8x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 
160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + 
VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from 
input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x8Xor_loop + VZEROUPPER + +mulAvxTwo_8x8Xor_end: + RET + // func mulAvxTwo_8x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 @@ -23412,17 +45641,6 @@ TEXT ·mulAvxTwo_8x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_8x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -23433,56 +45651,47 @@ mulAvxTwo_8x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, 
Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -23940,6 +46149,578 @@ mulAvxTwo_8x9_loop: mulAvxTwo_8x9_end: RET +// func mulAvxTwo_8x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 158 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_8x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU 
(R14)(R13*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + 
VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR 
Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 
144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x9Xor_loop + VZEROUPPER + +mulAvxTwo_8x9Xor_end: + RET + // func mulAvxTwo_8x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 @@ -23977,18 +46758,6 @@ TEXT ·mulAvxTwo_8x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_8x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -23999,62 +46768,52 @@ mulAvxTwo_8x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -24556,6 +47315,630 @@ mulAvxTwo_8x10_loop: mulAvxTwo_8x10_end: RET +// func mulAvxTwo_8x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_8x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 175 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_8x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), DX + MOVQ out_base+48(FP), R12 + MOVQ start+72(FP), R13 + + // Add start offset to input + ADDQ R13, BX + ADDQ R13, SI + ADDQ R13, DI + ADDQ R13, R8 + ADDQ R13, R9 + ADDQ R13, R10 + ADDQ R13, R11 + ADDQ R13, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_8x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R12), R14 + VMOVDQU (R14)(R13*1), Y0 + 
VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R12), R14 + VMOVDQU (R14)(R13*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R12), R14 + VMOVDQU (R14)(R13*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R12), R14 + VMOVDQU (R14)(R13*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R12), R14 + VMOVDQU (R14)(R13*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R12), R14 + VMOVDQU (R14)(R13*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R12), R14 + VMOVDQU (R14)(R13*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R12), R14 + VMOVDQU (R14)(R13*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R12), R14 + VMOVDQU (R14)(R13*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R12), R14 + VMOVDQU (R14)(R13*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + 
VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + 
VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + 
VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R12), R14 + VMOVDQU Y0, (R14)(R13*1) + MOVQ 24(R12), R14 + VMOVDQU Y1, (R14)(R13*1) + MOVQ 48(R12), R14 + VMOVDQU Y2, (R14)(R13*1) + MOVQ 72(R12), R14 + VMOVDQU Y3, (R14)(R13*1) + MOVQ 96(R12), R14 + VMOVDQU Y4, (R14)(R13*1) + MOVQ 120(R12), R14 + VMOVDQU Y5, (R14)(R13*1) + MOVQ 144(R12), R14 + VMOVDQU Y6, (R14)(R13*1) + MOVQ 168(R12), R14 + VMOVDQU Y7, (R14)(R13*1) + MOVQ 192(R12), R14 + VMOVDQU Y8, (R14)(R13*1) + MOVQ 216(R12), R14 + VMOVDQU Y9, (R14)(R13*1) + + // Prepare for next loop + ADDQ $0x20, R13 + DECQ AX + JNZ mulAvxTwo_8x10Xor_loop + VZEROUPPER + +mulAvxTwo_8x10Xor_end: + RET + // func mulAvxTwo_9x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 @@ -24599,9 +47982,6 @@ TEXT ·mulAvxTwo_9x1(SB), NOSPLIT, $0-88 VPBROADCASTB X1, Y1 mulAvxTwo_9x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX @@ -24612,8 +47992,7 @@ mulAvxTwo_9x1_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPXOR Y2, Y3, Y0 // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 @@ -24735,52 +48114,50 @@ mulAvxTwo_9x1_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 22 YMM used + // Destination kept in GP registers + // Full registers estimated 42 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ 
mulAvxTwo_9x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX MOVQ $0x0000000f, R14 MOVQ R14, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 mulAvxTwo_9x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24793,15 +48170,13 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24820,9 +48195,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24841,9 +48216,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24862,9 +48237,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24883,9 +48258,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24904,9 +48279,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24925,9 +48300,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (R11), Y6 - VMOVDQU 32(R11), Y5 - ADDQ $0x40, R11 + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, 
Y6, Y6 @@ -24946,9 +48321,9 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -24967,19 +48342,441 @@ mulAvxTwo_9x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 + DECQ AX JNZ mulAvxTwo_9x1_64_loop VZEROUPPER mulAvxTwo_9x1_64_end: RET +// func mulAvxTwo_9x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 22 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_9x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, 
Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y4 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1Xor_loop + VZEROUPPER + +mulAvxTwo_9x1Xor_end: + RET + +// func mulAvxTwo_9x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 42 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R13 + MOVQ start+72(FP), R14 + + // Add start offset to output + ADDQ R14, R13 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R14 + MOVQ R14, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R13), Y0 + VMOVDQU 32(R13), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + 
VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R13) + VMOVDQU Y1, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x1_64Xor_end: + RET + // func mulAvxTwo_9x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 @@ -25025,10 +48822,6 @@ TEXT ·mulAvxTwo_9x2(SB), NOSPLIT, $0-88 VPBROADCASTB X2, Y2 mulAvxTwo_9x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -25039,14 +48832,12 @@ mulAvxTwo_9x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -25218,54 +49009,52 @@ mulAvxTwo_9x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x2_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 43 YMM used + // Destination kept in GP registers + // Full registers estimated 81 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ 
$0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX - MOVQ $0x0000000f, R14 - MOVQ R14, X4 + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 mulAvxTwo_9x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25278,25 +49067,21 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25325,9 +49110,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25356,9 +49141,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25387,9 +49172,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25418,9 +49203,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25449,9 +49234,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + 
VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25480,9 +49265,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU (R11), Y9 - VMOVDQU 32(R11), Y11 - ADDQ $0x40, R11 + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25511,9 +49296,9 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -25542,22 +49327,600 @@ mulAvxTwo_9x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) - MOVQ 24(R12), R15 - VMOVDQU Y2, (R15)(R13*1) - VMOVDQU Y3, 32(R15)(R13*1) + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 + DECQ AX JNZ mulAvxTwo_9x2_64_loop VZEROUPPER mulAvxTwo_9x2_64_end: RET +// func mulAvxTwo_9x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 43 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_9x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R13), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, 
Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y5 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2Xor_loop + VZEROUPPER + +mulAvxTwo_9x2Xor_end: + RET + +// func mulAvxTwo_9x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x2_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R13 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + ADDQ R15, R13 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_9x2_64Xor_loop: + 
// Load 2 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R13), Y2 + VMOVDQU 32(R13), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + 
VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R13) + VMOVDQU Y3, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x2_64Xor_end: + RET + // func mulAvxTwo_9x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 @@ -25605,11 +49968,6 @@ TEXT ·mulAvxTwo_9x3(SB), NOSPLIT, $8-88 VPBROADCASTB X3, Y3 mulAvxTwo_9x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -25620,20 +49978,17 @@ mulAvxTwo_9x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 
VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 @@ -25853,58 +50208,56 @@ mulAvxTwo_9x3_end: // func mulAvxTwo_9x3_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_9x3_64(SB), $0-88 +TEXT ·mulAvxTwo_9x3_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 62 YMM used + // Destination kept in GP registers + // Full registers estimated 118 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_9x3_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), AX - MOVQ out_base+48(FP), R12 - MOVQ out_base+48(FP), R12 - MOVQ start+72(FP), R13 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R13, DX - ADDQ R13, BX - ADDQ R13, SI - ADDQ R13, DI - ADDQ R13, R8 - ADDQ R13, R9 - ADDQ R13, R10 - ADDQ R13, R11 - ADDQ R13, AX - MOVQ $0x0000000f, R14 - MOVQ R14, X6 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R14 - SHRQ $0x06, R14 mulAvxTwo_9x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 64 bytes from input 0 to 3 outputs - VMOVDQU (DX), Y11 - VMOVDQU 32(DX), Y13 - ADDQ $0x40, DX + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -25917,35 +50270,29 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y0, Y0 - VPXOR Y9, Y1, Y1 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y2, Y2 - VPXOR Y9, Y3, Y3 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y9, Y10, Y9 - VPXOR Y7, Y4, Y4 - VPXOR Y9, Y5, Y5 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 // Load and process 64 bytes from input 1 to 3 outputs - VMOVDQU (BX), Y11 - VMOVDQU 32(BX), Y13 - ADDQ $0x40, BX + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -25984,9 +50331,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 2 to 3 outputs - VMOVDQU (SI), Y11 - VMOVDQU 32(SI), Y13 - ADDQ $0x40, SI + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26025,9 +50372,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 3 to 3 outputs - VMOVDQU (DI), Y11 - 
VMOVDQU 32(DI), Y13 - ADDQ $0x40, DI + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26066,9 +50413,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 4 to 3 outputs - VMOVDQU (R8), Y11 - VMOVDQU 32(R8), Y13 - ADDQ $0x40, R8 + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26107,9 +50454,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 5 to 3 outputs - VMOVDQU (R9), Y11 - VMOVDQU 32(R9), Y13 - ADDQ $0x40, R9 + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26148,9 +50495,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 6 to 3 outputs - VMOVDQU (R10), Y11 - VMOVDQU 32(R10), Y13 - ADDQ $0x40, R10 + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26189,9 +50536,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 7 to 3 outputs - VMOVDQU (R11), Y11 - VMOVDQU 32(R11), Y13 - ADDQ $0x40, R11 + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26230,9 +50577,9 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Load and process 64 bytes from input 8 to 3 outputs - VMOVDQU (AX), Y11 - VMOVDQU 32(AX), Y13 - ADDQ $0x40, AX + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX VPSRLQ $0x04, Y11, Y12 VPSRLQ $0x04, Y13, Y14 VPAND Y6, Y11, Y11 @@ -26271,25 +50618,759 @@ mulAvxTwo_9x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R12), R15 - VMOVDQU Y0, (R15)(R13*1) - VMOVDQU Y1, 32(R15)(R13*1) - MOVQ 24(R12), R15 - VMOVDQU Y2, (R15)(R13*1) - VMOVDQU Y3, 32(R15)(R13*1) - MOVQ 48(R12), R15 - VMOVDQU Y4, (R15)(R13*1) - VMOVDQU Y5, 32(R15)(R13*1) + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x40, R13 - DECQ R14 + DECQ AX JNZ mulAvxTwo_9x3_64_loop VZEROUPPER mulAvxTwo_9x3_64_end: RET +// func mulAvxTwo_9x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x3Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 62 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + +mulAvxTwo_9x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + 
VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R15), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 
bytes from input 7 to 3 outputs + VMOVDQU (R12), Y6 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x3Xor_loop + VZEROUPPER + +mulAvxTwo_9x3Xor_end: + RET + +// func mulAvxTwo_9x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 118 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_9x3_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 + + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + 
VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + 
VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // 
Prepare for next loop + DECQ AX + JNZ mulAvxTwo_9x3_64Xor_loop + VZEROUPPER + +mulAvxTwo_9x3_64Xor_end: + RET + // func mulAvxTwo_9x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 @@ -26341,12 +51422,6 @@ TEXT ·mulAvxTwo_9x4(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_9x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -26357,26 +51432,22 @@ mulAvxTwo_9x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 @@ -26644,6 +51715,358 @@ mulAvxTwo_9x4_loop: mulAvxTwo_9x4_end: RET +// func mulAvxTwo_9x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 81 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x4Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), AX + MOVQ out_base+48(FP), R12 + MOVQ (R12), R13 + MOVQ 24(R12), R14 + MOVQ 48(R12), R15 + MOVQ 72(R12), R12 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R13 + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R12 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_9x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU (R13), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU (R15), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU (R12), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + 
VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ 
$0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (AX), Y7 + ADDQ $0x20, AX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + VMOVDQU Y0, (R13) + ADDQ $0x20, R13 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + VMOVDQU Y2, (R15) + ADDQ $0x20, R15 + VMOVDQU Y3, (R12) + ADDQ $0x20, R12 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_9x4Xor_loop + VZEROUPPER + +mulAvxTwo_9x4Xor_end: + RET + // func mulAvxTwo_9x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 @@ -26683,13 +52106,6 @@ TEXT ·mulAvxTwo_9x5(SB), NOSPLIT, $0-88 VPBROADCASTB X5, Y5 mulAvxTwo_9x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -26700,32 +52116,27 @@ mulAvxTwo_9x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -27044,6 +52455,409 @@ mulAvxTwo_9x5_loop: mulAvxTwo_9x5_end: RET +// func mulAvxTwo_9x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x5Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 100 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + 
ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_9x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ 
$0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, 
Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x5Xor_loop + VZEROUPPER + +mulAvxTwo_9x5Xor_end: + RET + // func mulAvxTwo_9x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 @@ -27083,14 +52897,6 @@ TEXT ·mulAvxTwo_9x6(SB), NOSPLIT, $0-88 VPBROADCASTB X6, Y6 mulAvxTwo_9x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -27101,38 +52907,32 @@ mulAvxTwo_9x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -27501,6 +53301,467 @@ mulAvxTwo_9x6_loop: mulAvxTwo_9x6_end: RET +// func mulAvxTwo_9x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x6Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 119 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_9x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU 
(R15)(R14*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, 
Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load 
and process 32 bytes from input 8 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x6Xor_loop + VZEROUPPER + +mulAvxTwo_9x6Xor_end: + RET + // func mulAvxTwo_9x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 @@ -27540,15 +53801,6 @@ TEXT ·mulAvxTwo_9x7(SB), NOSPLIT, $0-88 VPBROADCASTB X7, Y7 mulAvxTwo_9x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -27559,44 +53811,37 @@ mulAvxTwo_9x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -28015,6 +54260,525 @@ mulAvxTwo_9x7_loop: mulAvxTwo_9x7_end: RET +// func mulAvxTwo_9x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x7Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 138 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ 
out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_9x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 
1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND 
Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x7Xor_loop + VZEROUPPER + +mulAvxTwo_9x7Xor_end: + RET + // func mulAvxTwo_9x8(matrix []byte, in 
[][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 @@ -28054,16 +54818,6 @@ TEXT ·mulAvxTwo_9x8(SB), NOSPLIT, $0-88 VPBROADCASTB X8, Y8 mulAvxTwo_9x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -28074,50 +54828,42 @@ mulAvxTwo_9x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -28586,6 +55332,583 @@ mulAvxTwo_9x8_loop: mulAvxTwo_9x8_end: RET +// func mulAvxTwo_9x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x8Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 157 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_9x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU 
(R15)(R14*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + 
VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + 
VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU 
Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x8Xor_loop + VZEROUPPER + +mulAvxTwo_9x8Xor_end: + RET + // func mulAvxTwo_9x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 @@ -28625,17 +55948,6 @@ TEXT ·mulAvxTwo_9x9(SB), NOSPLIT, $0-88 VPBROADCASTB X9, Y9 mulAvxTwo_9x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -28646,56 +55958,47 @@ mulAvxTwo_9x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -29214,6 +56517,641 @@ mulAvxTwo_9x9_loop: mulAvxTwo_9x9_end: RET +// func mulAvxTwo_9x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x9Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 176 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_9x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 
+ MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + 
VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes 
from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + 
VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x9Xor_loop + VZEROUPPER + +mulAvxTwo_9x9Xor_end: + RET + // func mulAvxTwo_9x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 @@ -29253,18 +57191,6 @@ TEXT ·mulAvxTwo_9x10(SB), NOSPLIT, $0-88 VPBROADCASTB X10, Y10 mulAvxTwo_9x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -29275,62 +57201,52 @@ mulAvxTwo_9x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y0, Y0 + VPXOR Y11, Y12, Y0 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y1, Y1 + VPXOR Y11, Y12, Y1 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y2, Y2 + VPXOR Y11, Y12, Y2 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y3, Y3 + VPXOR Y11, Y12, Y3 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y4, Y4 + VPXOR Y11, Y12, Y4 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y5, Y5 + VPXOR Y11, Y12, Y5 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y6, Y6 + VPXOR Y11, Y12, Y6 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y7, Y7 + VPXOR Y11, Y12, Y7 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y8, Y8 + VPXOR Y11, Y12, Y8 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - VPXOR Y11, Y12, Y11 - VPXOR Y11, Y9, Y9 + VPXOR Y11, Y12, Y9 // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 @@ -29899,6 +57815,699 @@ mulAvxTwo_9x10_loop: mulAvxTwo_9x10_end: RET +// func mulAvxTwo_9x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_9x10Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 195 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_9x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), DX + MOVQ out_base+48(FP), R13 + MOVQ start+72(FP), R14 + + // Add start offset to input + ADDQ R14, BX + ADDQ R14, SI + ADDQ R14, DI + ADDQ R14, R8 + ADDQ R14, R9 + ADDQ R14, R10 + ADDQ R14, R11 + ADDQ R14, R12 + ADDQ R14, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_9x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R13), R15 + VMOVDQU (R15)(R14*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R13), R15 + VMOVDQU (R15)(R14*1), Y1 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + MOVQ 48(R13), R15 + VMOVDQU (R15)(R14*1), Y2 + VMOVDQU 128(CX), Y11 + VMOVDQU 160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + MOVQ 72(R13), R15 + VMOVDQU (R15)(R14*1), Y3 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + MOVQ 96(R13), R15 + VMOVDQU (R15)(R14*1), Y4 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + MOVQ 120(R13), R15 + VMOVDQU (R15)(R14*1), Y5 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + MOVQ 144(R13), R15 + VMOVDQU (R15)(R14*1), Y6 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, 
Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + MOVQ 168(R13), R15 + VMOVDQU (R15)(R14*1), Y7 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + MOVQ 192(R13), R15 + VMOVDQU (R15)(R14*1), Y8 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + MOVQ 216(R13), R15 + VMOVDQU (R15)(R14*1), Y9 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR 
Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + 
VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), 
Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R13), R15 + VMOVDQU Y0, (R15)(R14*1) + MOVQ 24(R13), R15 + VMOVDQU Y1, (R15)(R14*1) + MOVQ 48(R13), R15 + VMOVDQU Y2, (R15)(R14*1) + MOVQ 72(R13), R15 + VMOVDQU Y3, (R15)(R14*1) + MOVQ 96(R13), R15 + VMOVDQU Y4, (R15)(R14*1) + MOVQ 120(R13), R15 + VMOVDQU Y5, (R15)(R14*1) + MOVQ 144(R13), R15 + VMOVDQU Y6, (R15)(R14*1) + MOVQ 168(R13), R15 + VMOVDQU Y7, (R15)(R14*1) + MOVQ 192(R13), R15 + VMOVDQU Y8, (R15)(R14*1) + MOVQ 216(R13), R15 + VMOVDQU Y9, (R15)(R14*1) + + // Prepare for next loop + ADDQ $0x20, R14 + DECQ AX + JNZ mulAvxTwo_9x10Xor_loop + VZEROUPPER + +mulAvxTwo_9x10Xor_end: + RET + // func mulAvxTwo_10x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 @@ -29944,9 +58553,6 @@ TEXT ·mulAvxTwo_10x1(SB), NOSPLIT, $0-88 VPBROADCASTB X1, Y1 mulAvxTwo_10x1_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - // Load and process 32 bytes from input 0 to 1 outputs VMOVDQU (BX), Y4 ADDQ $0x20, BX @@ -29957,8 +58563,7 @@ mulAvxTwo_10x1_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 - VPXOR Y2, Y3, Y2 - VPXOR Y2, Y0, Y0 + VPXOR Y2, Y3, Y0 // Load and process 32 bytes from input 1 to 1 
outputs VMOVDQU (SI), Y4 @@ -30091,56 +58696,54 @@ mulAvxTwo_10x1_end: // func mulAvxTwo_10x1_64(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 -TEXT ·mulAvxTwo_10x1_64(SB), $8-88 +TEXT ·mulAvxTwo_10x1_64(SB), $0-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 24 YMM used + // Destination kept in GP registers + // Full registers estimated 46 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x1_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX MOVQ $0x0000000f, R15 MOVQ R15, X2 VPBROADCASTB X2, Y2 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 mulAvxTwo_10x1_64_loop: - // Clear 1 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 64 bytes from input 0 to 1 outputs - VMOVDQU (DX), Y6 - VMOVDQU 32(DX), Y5 - ADDQ $0x40, DX + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30153,15 +58756,13 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y5, Y6, Y5 - VPXOR Y3, Y0, Y0 - VPXOR Y5, Y1, Y1 + VPXOR Y3, Y4, Y0 + VPXOR Y5, Y6, Y1 // Load and process 64 bytes from input 1 to 1 outputs - VMOVDQU (BX), Y6 - VMOVDQU 32(BX), Y5 - ADDQ $0x40, BX + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30180,9 +58781,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 2 to 1 outputs - VMOVDQU (SI), Y6 - VMOVDQU 32(SI), Y5 - ADDQ $0x40, SI + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30201,9 +58802,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 3 to 1 outputs - VMOVDQU (DI), Y6 - VMOVDQU 32(DI), Y5 - ADDQ $0x40, DI + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30222,9 +58823,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 4 to 1 outputs - VMOVDQU (R8), Y6 - VMOVDQU 32(R8), Y5 - ADDQ $0x40, R8 + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30243,9 +58844,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 5 to 1 outputs - VMOVDQU (R9), Y6 - VMOVDQU 32(R9), Y5 - ADDQ $0x40, R9 + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND 
Y2, Y6, Y6 @@ -30264,9 +58865,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 6 to 1 outputs - VMOVDQU (R10), Y6 - VMOVDQU 32(R10), Y5 - ADDQ $0x40, R10 + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30285,9 +58886,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 7 to 1 outputs - VMOVDQU (R11), Y6 - VMOVDQU 32(R11), Y5 - ADDQ $0x40, R11 + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30306,9 +58907,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 8 to 1 outputs - VMOVDQU (R12), Y6 - VMOVDQU 32(R12), Y5 - ADDQ $0x40, R12 + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30327,9 +58928,9 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Load and process 64 bytes from input 9 to 1 outputs - VMOVDQU (AX), Y6 - VMOVDQU 32(AX), Y5 - ADDQ $0x40, AX + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX VPSRLQ $0x04, Y6, Y7 VPSRLQ $0x04, Y5, Y8 VPAND Y2, Y6, Y6 @@ -30348,19 +58949,479 @@ mulAvxTwo_10x1_64_loop: VPXOR Y5, Y1, Y1 // Store 1 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 + DECQ AX JNZ mulAvxTwo_10x1_64_loop VZEROUPPER mulAvxTwo_10x1_64_end: RET +// func mulAvxTwo_10x1Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x1Xor(SB), NOSPLIT, $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 24 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X1 + VPBROADCASTB X1, Y1 + +mulAvxTwo_10x1Xor_loop: + // Load and process 32 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y4 + ADDQ $0x20, BX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y2 + VMOVDQU 32(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 1 to 1 outputs + VMOVDQU (SI), Y4 + ADDQ $0x20, SI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 64(CX), Y2 + VMOVDQU 96(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y4 + ADDQ $0x20, DI + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 128(CX), Y2 + VMOVDQU 160(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y4 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + 
VMOVDQU 192(CX), Y2 + VMOVDQU 224(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y4 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 256(CX), Y2 + VMOVDQU 288(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y4 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 320(CX), Y2 + VMOVDQU 352(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y4 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 384(CX), Y2 + VMOVDQU 416(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y4 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 448(CX), Y2 + VMOVDQU 480(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y4 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 512(CX), Y2 + VMOVDQU 544(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Load and process 32 bytes from input 9 to 1 outputs + VMOVDQU (DX), Y4 + ADDQ $0x20, DX + VPSRLQ $0x04, Y4, Y5 + VPAND Y1, Y4, Y4 + VPAND Y1, Y5, Y5 + VMOVDQU 576(CX), Y2 + VMOVDQU 608(CX), Y3 + VPSHUFB Y4, Y2, Y2 + VPSHUFB Y5, Y3, Y3 + VPXOR Y2, Y3, Y2 + VPXOR Y2, Y0, Y0 + + // Store 1 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x1Xor_loop + VZEROUPPER + +mulAvxTwo_10x1Xor_end: + RET + +// func mulAvxTwo_10x1_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x1_64Xor(SB), $0-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 46 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x1_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R14 + MOVQ start+72(FP), R15 + + // Add start offset to output + ADDQ R15, R14 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, R15 + MOVQ R15, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_10x1_64Xor_loop: + // Load 1 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + + // Load and process 64 bytes from input 0 to 1 outputs + VMOVDQU (BX), Y6 + VMOVDQU 32(BX), Y5 + ADDQ $0x40, BX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from 
input 1 to 1 outputs + VMOVDQU (SI), Y6 + VMOVDQU 32(SI), Y5 + ADDQ $0x40, SI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 2 to 1 outputs + VMOVDQU (DI), Y6 + VMOVDQU 32(DI), Y5 + ADDQ $0x40, DI + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 3 to 1 outputs + VMOVDQU (R8), Y6 + VMOVDQU 32(R8), Y5 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 4 to 1 outputs + VMOVDQU (R9), Y6 + VMOVDQU 32(R9), Y5 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 5 to 1 outputs + VMOVDQU (R10), Y6 + VMOVDQU 32(R10), Y5 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 6 to 1 outputs + VMOVDQU (R11), Y6 + VMOVDQU 32(R11), Y5 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 7 to 1 outputs + VMOVDQU (R12), Y6 + VMOVDQU 32(R12), Y5 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 8 to 1 outputs + VMOVDQU (R13), Y6 + VMOVDQU 32(R13), Y5 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Load and process 64 bytes from input 9 
to 1 outputs + VMOVDQU (DX), Y6 + VMOVDQU 32(DX), Y5 + ADDQ $0x40, DX + VPSRLQ $0x04, Y6, Y7 + VPSRLQ $0x04, Y5, Y8 + VPAND Y2, Y6, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y7, Y7 + VPAND Y2, Y8, Y8 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y5 + VPSHUFB Y6, Y3, Y3 + VPSHUFB Y8, Y4, Y6 + VPSHUFB Y7, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y5, Y6, Y5 + VPXOR Y3, Y0, Y0 + VPXOR Y5, Y1, Y1 + + // Store 1 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x1_64Xor_loop + VZEROUPPER + +mulAvxTwo_10x1_64Xor_end: + RET + // func mulAvxTwo_10x2(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 @@ -30408,10 +59469,6 @@ TEXT ·mulAvxTwo_10x2(SB), NOSPLIT, $8-88 VPBROADCASTB X2, Y2 mulAvxTwo_10x2_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - // Load and process 32 bytes from input 0 to 2 outputs VMOVDQU (BX), Y5 ADDQ $0x20, BX @@ -30422,14 +59479,12 @@ mulAvxTwo_10x2_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y0, Y0 + VPXOR Y3, Y4, Y0 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - VPXOR Y3, Y4, Y3 - VPXOR Y3, Y1, Y1 + VPXOR Y3, Y4, Y1 // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 @@ -30620,56 +59675,54 @@ mulAvxTwo_10x2_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x2_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full registers estimated 47 YMM used + // Destination kept in GP registers + // Full registers estimated 89 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX TESTQ AX, AX JZ mulAvxTwo_10x2_64_end - MOVQ in_base+24(FP), AX - MOVQ (AX), DX - MOVQ 24(AX), BX - MOVQ 48(AX), SI - MOVQ 72(AX), DI - MOVQ 96(AX), R8 - MOVQ 120(AX), R9 - MOVQ 144(AX), R10 - MOVQ 168(AX), R11 - MOVQ 192(AX), R12 - MOVQ 216(AX), AX - MOVQ out_base+48(FP), R13 - MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX - MOVQ $0x0000000f, R15 - MOVQ R15, X4 + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 VPBROADCASTB X4, Y4 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 mulAvxTwo_10x2_64_loop: - // Clear 2 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 64 bytes from input 0 to 2 outputs - VMOVDQU (DX), Y9 - VMOVDQU 32(DX), Y11 - ADDQ $0x40, DX + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30682,25 +59735,21 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y0, Y0 - VPXOR Y7, Y1, Y1 + VPXOR Y5, Y6, Y0 + VPXOR Y7, Y8, Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 
VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y7, Y8, Y7 - VPXOR Y5, Y2, Y2 - VPXOR Y7, Y3, Y3 + VPXOR Y5, Y6, Y2 + VPXOR Y7, Y8, Y3 // Load and process 64 bytes from input 1 to 2 outputs - VMOVDQU (BX), Y9 - VMOVDQU 32(BX), Y11 - ADDQ $0x40, BX + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30729,9 +59778,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 2 to 2 outputs - VMOVDQU (SI), Y9 - VMOVDQU 32(SI), Y11 - ADDQ $0x40, SI + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30760,9 +59809,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 3 to 2 outputs - VMOVDQU (DI), Y9 - VMOVDQU 32(DI), Y11 - ADDQ $0x40, DI + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30791,9 +59840,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 4 to 2 outputs - VMOVDQU (R8), Y9 - VMOVDQU 32(R8), Y11 - ADDQ $0x40, R8 + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30822,9 +59871,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 5 to 2 outputs - VMOVDQU (R9), Y9 - VMOVDQU 32(R9), Y11 - ADDQ $0x40, R9 + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30853,9 +59902,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 6 to 2 outputs - VMOVDQU (R10), Y9 - VMOVDQU 32(R10), Y11 - ADDQ $0x40, R10 + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30884,9 +59933,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 7 to 2 outputs - VMOVDQU (R11), Y9 - VMOVDQU 32(R11), Y11 - ADDQ $0x40, R11 + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30915,9 +59964,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 8 to 2 outputs - VMOVDQU (R12), Y9 - VMOVDQU 32(R12), Y11 - ADDQ $0x40, R12 + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30946,9 +59995,9 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Load and process 64 bytes from input 9 to 2 outputs - VMOVDQU (AX), Y9 - VMOVDQU 32(AX), Y11 - ADDQ $0x40, AX + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX VPSRLQ $0x04, Y9, Y10 VPSRLQ $0x04, Y11, Y12 VPAND Y4, Y9, Y9 @@ -30977,22 +60026,654 @@ mulAvxTwo_10x2_64_loop: VPXOR Y7, Y3, Y3 // Store 2 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) - MOVQ 24(R13), BP - VMOVDQU Y2, (BP)(R14*1) - VMOVDQU Y3, 32(BP)(R14*1) + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 + DECQ AX JNZ mulAvxTwo_10x2_64_loop VZEROUPPER mulAvxTwo_10x2_64_end: RET +// func mulAvxTwo_10x2Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x2Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers 
estimated 47 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x2Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X2 + VPBROADCASTB X2, Y2 + +mulAvxTwo_10x2Xor_loop: + // Load and process 32 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y5 + ADDQ $0x20, BX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU (R15), Y0 + VMOVDQU (CX), Y3 + VMOVDQU 32(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU (R14), Y1 + VMOVDQU 64(CX), Y3 + VMOVDQU 96(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y5 + ADDQ $0x20, SI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 128(CX), Y3 + VMOVDQU 160(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 192(CX), Y3 + VMOVDQU 224(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y5 + ADDQ $0x20, DI + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 256(CX), Y3 + VMOVDQU 288(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 320(CX), Y3 + VMOVDQU 352(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y5 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 384(CX), Y3 + VMOVDQU 416(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 448(CX), Y3 + VMOVDQU 480(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y5 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 512(CX), Y3 + VMOVDQU 544(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 576(CX), Y3 + VMOVDQU 608(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y5 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 640(CX), Y3 + VMOVDQU 672(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 704(CX), Y3 + VMOVDQU 736(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y5 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 768(CX), Y3 + VMOVDQU 800(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 832(CX), Y3 + VMOVDQU 864(CX), Y4 + VPSHUFB 
Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y5 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 896(CX), Y3 + VMOVDQU 928(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 960(CX), Y3 + VMOVDQU 992(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y5 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1024(CX), Y3 + VMOVDQU 1056(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1088(CX), Y3 + VMOVDQU 1120(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Load and process 32 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y5 + ADDQ $0x20, DX + VPSRLQ $0x04, Y5, Y6 + VPAND Y2, Y5, Y5 + VPAND Y2, Y6, Y6 + VMOVDQU 1152(CX), Y3 + VMOVDQU 1184(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y0, Y0 + VMOVDQU 1216(CX), Y3 + VMOVDQU 1248(CX), Y4 + VPSHUFB Y5, Y3, Y3 + VPSHUFB Y6, Y4, Y4 + VPXOR Y3, Y4, Y3 + VPXOR Y3, Y1, Y1 + + // Store 2 outputs + VMOVDQU Y0, (R15) + ADDQ $0x20, R15 + VMOVDQU Y1, (R14) + ADDQ $0x20, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x2Xor_loop + VZEROUPPER + +mulAvxTwo_10x2Xor_end: + RET + +// func mulAvxTwo_10x2_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x2_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x2_64Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ out_base+48(FP), R14 + MOVQ (R14), R15 + MOVQ 24(R14), R14 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R15 + ADDQ BP, R14 + + // Add start offset to input + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, R13 + ADDQ BP, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x2_64Xor_loop: + // Load 2 outputs + VMOVDQU (R15), Y0 + VMOVDQU 32(R15), Y1 + VMOVDQU (R14), Y2 + VMOVDQU 32(R14), Y3 + + // Load and process 64 bytes from input 0 to 2 outputs + VMOVDQU (BX), Y9 + VMOVDQU 32(BX), Y11 + ADDQ $0x40, BX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 1 to 2 outputs + VMOVDQU (SI), Y9 + VMOVDQU 32(SI), Y11 + ADDQ $0x40, SI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, 
Y12, Y12 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 2 to 2 outputs + VMOVDQU (DI), Y9 + VMOVDQU 32(DI), Y11 + ADDQ $0x40, DI + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 3 to 2 outputs + VMOVDQU (R8), Y9 + VMOVDQU 32(R8), Y11 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 4 to 2 outputs + VMOVDQU (R9), Y9 + VMOVDQU 32(R9), Y11 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 5 to 2 outputs + VMOVDQU (R10), Y9 + VMOVDQU 32(R10), Y11 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 6 to 2 outputs + VMOVDQU (R11), Y9 + VMOVDQU 32(R11), Y11 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 832(CX), Y5 
+ VMOVDQU 864(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 7 to 2 outputs + VMOVDQU (R12), Y9 + VMOVDQU 32(R12), Y11 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 8 to 2 outputs + VMOVDQU (R13), Y9 + VMOVDQU 32(R13), Y11 + ADDQ $0x40, R13 + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Load and process 64 bytes from input 9 to 2 outputs + VMOVDQU (DX), Y9 + VMOVDQU 32(DX), Y11 + ADDQ $0x40, DX + VPSRLQ $0x04, Y9, Y10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y4, Y9, Y9 + VPAND Y4, Y11, Y11 + VPAND Y4, Y10, Y10 + VPAND Y4, Y12, Y12 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y0, Y0 + VPXOR Y7, Y1, Y1 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y11, Y5, Y7 + VPSHUFB Y9, Y5, Y5 + VPSHUFB Y12, Y6, Y8 + VPSHUFB Y10, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y7, Y8, Y7 + VPXOR Y5, Y2, Y2 + VPXOR Y7, Y3, Y3 + + // Store 2 outputs + VMOVDQU Y0, (R15) + VMOVDQU Y1, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y2, (R14) + VMOVDQU Y3, 32(R14) + ADDQ $0x40, R14 + + // Prepare for next loop + DECQ AX + JNZ mulAvxTwo_10x2_64Xor_loop + VZEROUPPER + +mulAvxTwo_10x2_64Xor_end: + RET + // func mulAvxTwo_10x3(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 @@ -31044,11 +60725,6 @@ TEXT ·mulAvxTwo_10x3(SB), NOSPLIT, $8-88 SHRQ $0x05, BP mulAvxTwo_10x3_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - // Load and process 32 bytes from input 0 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -31059,20 +60735,17 @@ mulAvxTwo_10x3_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y0, Y0 + VPXOR Y4, Y5, Y0 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y1, Y1 + VPXOR Y4, Y5, Y1 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - VPXOR Y4, Y5, Y4 - VPXOR Y4, Y2, Y2 + VPXOR Y4, Y5, Y2 // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y6 @@ -31319,8 +60992,8 @@ mulAvxTwo_10x3_end: // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x3_64(SB), $8-88 // Loading no tables to registers - // Destination kept on stack - // Full 
registers estimated 68 YMM used + // Destination kept in GP registers + // Full registers estimated 130 YMM used MOVQ n+80(FP), AX MOVQ matrix_base+0(FP), CX SHRQ $0x06, AX @@ -31339,33 +61012,840 @@ TEXT ·mulAvxTwo_10x3_64(SB), $8-88 MOVQ 216(AX), AX MOVQ out_base+48(FP), R13 MOVQ out_base+48(FP), R13 - MOVQ start+72(FP), R14 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 // Add start offset to input - ADDQ R14, DX - ADDQ R14, BX - ADDQ R14, SI - ADDQ R14, DI - ADDQ R14, R8 - ADDQ R14, R9 - ADDQ R14, R10 - ADDQ R14, R11 - ADDQ R14, R12 - ADDQ R14, AX - MOVQ $0x0000000f, R15 - MOVQ R15, X6 + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 VPBROADCASTB X6, Y6 - MOVQ n+80(FP), R15 - SHRQ $0x06, R15 + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP mulAvxTwo_10x3_64_loop: - // Clear 3 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 + // Load and process 64 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y11 + VMOVDQU 32(DX), Y13 + ADDQ $0x40, DX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y0 + VPXOR Y9, Y10, Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y2 + VPXOR Y9, Y10, Y3 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y4 + VPXOR Y9, Y10, Y5 + + // Load and process 64 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y11 + VMOVDQU 32(BX), Y13 + ADDQ $0x40, BX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y11 + VMOVDQU 32(SI), Y13 + ADDQ $0x40, SI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB 
Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y11 + VMOVDQU 32(DI), Y13 + ADDQ $0x40, DI + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y11 + VMOVDQU 32(R8), Y13 + ADDQ $0x40, R8 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y11 + VMOVDQU 32(R9), Y13 + ADDQ $0x40, R9 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y11 + VMOVDQU 32(R10), Y13 + ADDQ $0x40, R10 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y13, Y7, Y9 + 
VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y11 + VMOVDQU 32(R11), Y13 + ADDQ $0x40, R11 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y11 + VMOVDQU 32(R12), Y13 + ADDQ $0x40, R12 + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Load and process 64 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y11 + VMOVDQU 32(AX), Y13 + ADDQ $0x40, AX + VPSRLQ $0x04, Y11, Y12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y6, Y11, Y11 + VPAND Y6, Y13, Y13 + VPAND Y6, Y12, Y12 + VPAND Y6, Y14, Y14 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y0, Y0 + VPXOR Y9, Y1, Y1 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y2, Y2 + VPXOR Y9, Y3, Y3 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y13, Y7, Y9 + VPSHUFB Y11, Y7, Y7 + VPSHUFB Y14, Y8, Y10 + VPSHUFB Y12, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y9, Y10, Y9 + VPXOR Y7, Y4, Y4 + VPXOR Y9, Y5, Y5 + + // Store 3 outputs + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_10x3_64_loop + VZEROUPPER + +mulAvxTwo_10x3_64_end: + RET + +// func mulAvxTwo_10x3Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x3Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 68 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x3Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), 
DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X3 + VPBROADCASTB X3, Y3 + MOVQ n+80(FP), BP + SHRQ $0x05, BP + +mulAvxTwo_10x3Xor_loop: + // Load and process 32 bytes from input 0 to 3 outputs + VMOVDQU (DX), Y6 + ADDQ $0x20, DX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU (R14), Y0 + VMOVDQU (CX), Y4 + VMOVDQU 32(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU (R15), Y1 + VMOVDQU 64(CX), Y4 + VMOVDQU 96(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU (R13), Y2 + VMOVDQU 128(CX), Y4 + VMOVDQU 160(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 1 to 3 outputs + VMOVDQU (BX), Y6 + ADDQ $0x20, BX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 192(CX), Y4 + VMOVDQU 224(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 256(CX), Y4 + VMOVDQU 288(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 320(CX), Y4 + VMOVDQU 352(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 2 to 3 outputs + VMOVDQU (SI), Y6 + ADDQ $0x20, SI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 384(CX), Y4 + VMOVDQU 416(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 448(CX), Y4 + VMOVDQU 480(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 512(CX), Y4 + VMOVDQU 544(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 3 to 3 outputs + VMOVDQU (DI), Y6 + ADDQ $0x20, DI + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 576(CX), Y4 + VMOVDQU 608(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 640(CX), Y4 + VMOVDQU 672(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 704(CX), Y4 + VMOVDQU 736(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 4 to 3 outputs + VMOVDQU (R8), Y6 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 768(CX), Y4 + VMOVDQU 800(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 832(CX), Y4 + VMOVDQU 864(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 896(CX), Y4 + VMOVDQU 928(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 5 to 3 outputs + VMOVDQU (R9), Y6 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 960(CX), Y4 + VMOVDQU 992(CX), Y5 + 
VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1024(CX), Y4 + VMOVDQU 1056(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1088(CX), Y4 + VMOVDQU 1120(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 6 to 3 outputs + VMOVDQU (R10), Y6 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1152(CX), Y4 + VMOVDQU 1184(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1216(CX), Y4 + VMOVDQU 1248(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1280(CX), Y4 + VMOVDQU 1312(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 7 to 3 outputs + VMOVDQU (R11), Y6 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1344(CX), Y4 + VMOVDQU 1376(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1408(CX), Y4 + VMOVDQU 1440(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1472(CX), Y4 + VMOVDQU 1504(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 8 to 3 outputs + VMOVDQU (R12), Y6 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1536(CX), Y4 + VMOVDQU 1568(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1600(CX), Y4 + VMOVDQU 1632(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1664(CX), Y4 + VMOVDQU 1696(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Load and process 32 bytes from input 9 to 3 outputs + VMOVDQU (AX), Y6 + ADDQ $0x20, AX + VPSRLQ $0x04, Y6, Y7 + VPAND Y3, Y6, Y6 + VPAND Y3, Y7, Y7 + VMOVDQU 1728(CX), Y4 + VMOVDQU 1760(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y0, Y0 + VMOVDQU 1792(CX), Y4 + VMOVDQU 1824(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y1, Y1 + VMOVDQU 1856(CX), Y4 + VMOVDQU 1888(CX), Y5 + VPSHUFB Y6, Y4, Y4 + VPSHUFB Y7, Y5, Y5 + VPXOR Y4, Y5, Y4 + VPXOR Y4, Y2, Y2 + + // Store 3 outputs + VMOVDQU Y0, (R14) + ADDQ $0x20, R14 + VMOVDQU Y1, (R15) + ADDQ $0x20, R15 + VMOVDQU Y2, (R13) + ADDQ $0x20, R13 + + // Prepare for next loop + DECQ BP + JNZ mulAvxTwo_10x3Xor_loop + VZEROUPPER + +mulAvxTwo_10x3Xor_end: + RET + +// func mulAvxTwo_10x3_64Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x3_64Xor(SB), $8-88 + // Loading no tables to registers + // Destination kept in GP registers + // Full registers estimated 130 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x06, AX + TESTQ AX, AX + JZ mulAvxTwo_10x3_64Xor_end + MOVQ in_base+24(FP), AX + MOVQ (AX), DX + MOVQ 24(AX), BX + MOVQ 48(AX), SI + MOVQ 72(AX), DI + MOVQ 96(AX), R8 + MOVQ 120(AX), R9 + MOVQ 144(AX), R10 + MOVQ 168(AX), R11 + MOVQ 192(AX), R12 + MOVQ 216(AX), AX + MOVQ out_base+48(FP), R13 + MOVQ out_base+48(FP), R13 + MOVQ (R13), R14 + MOVQ 24(R13), R15 + MOVQ 48(R13), R13 + MOVQ start+72(FP), BP + + // Add start offset to output + ADDQ BP, R14 + ADDQ BP, R15 + ADDQ BP, R13 + + // Add 
start offset to input + ADDQ BP, DX + ADDQ BP, BX + ADDQ BP, SI + ADDQ BP, DI + ADDQ BP, R8 + ADDQ BP, R9 + ADDQ BP, R10 + ADDQ BP, R11 + ADDQ BP, R12 + ADDQ BP, AX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + + // Reload length to save a register + MOVQ n+80(FP), BP + SHRQ $0x06, BP + +mulAvxTwo_10x3_64Xor_loop: + // Load 3 outputs + VMOVDQU (R14), Y0 + VMOVDQU 32(R14), Y1 + VMOVDQU (R15), Y2 + VMOVDQU 32(R15), Y3 + VMOVDQU (R13), Y4 + VMOVDQU 32(R13), Y5 // Load and process 64 bytes from input 0 to 3 outputs VMOVDQU (DX), Y11 @@ -31778,23 +62258,22 @@ mulAvxTwo_10x3_64_loop: VPXOR Y9, Y5, Y5 // Store 3 outputs - MOVQ (R13), BP - VMOVDQU Y0, (BP)(R14*1) - VMOVDQU Y1, 32(BP)(R14*1) - MOVQ 24(R13), BP - VMOVDQU Y2, (BP)(R14*1) - VMOVDQU Y3, 32(BP)(R14*1) - MOVQ 48(R13), BP - VMOVDQU Y4, (BP)(R14*1) - VMOVDQU Y5, 32(BP)(R14*1) + VMOVDQU Y0, (R14) + VMOVDQU Y1, 32(R14) + ADDQ $0x40, R14 + VMOVDQU Y2, (R15) + VMOVDQU Y3, 32(R15) + ADDQ $0x40, R15 + VMOVDQU Y4, (R13) + VMOVDQU Y5, 32(R13) + ADDQ $0x40, R13 // Prepare for next loop - ADDQ $0x40, R14 - DECQ R15 - JNZ mulAvxTwo_10x3_64_loop + DECQ BP + JNZ mulAvxTwo_10x3_64Xor_loop VZEROUPPER -mulAvxTwo_10x3_64_end: +mulAvxTwo_10x3_64Xor_end: RET // func mulAvxTwo_10x4(matrix []byte, in [][]byte, out [][]byte, start int, n int) @@ -31838,12 +62317,6 @@ TEXT ·mulAvxTwo_10x4(SB), NOSPLIT, $8-88 VPBROADCASTB X4, Y4 mulAvxTwo_10x4_loop: - // Clear 4 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - // Load and process 32 bytes from input 0 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -31854,26 +62327,22 @@ mulAvxTwo_10x4_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y0, Y0 + VPXOR Y5, Y6, Y0 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y1, Y1 + VPXOR Y5, Y6, Y1 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y2, Y2 + VPXOR Y5, Y6, Y2 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - VPXOR Y5, Y6, Y5 - VPXOR Y5, Y3, Y3 + VPXOR Y5, Y6, Y3 // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 @@ -32173,6 +62642,384 @@ mulAvxTwo_10x4_loop: mulAvxTwo_10x4_end: RET +// func mulAvxTwo_10x4Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x4Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 89 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x4Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X4 + VPBROADCASTB X4, Y4 + +mulAvxTwo_10x4Xor_loop: + // Load and process 32 bytes from input 0 to 4 outputs + VMOVDQU (BX), Y7 + ADDQ $0x20, BX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y5 + VMOVDQU 32(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + MOVQ 
24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y5 + VMOVDQU 96(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y5 + VMOVDQU 160(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y5 + VMOVDQU 224(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 1 to 4 outputs + VMOVDQU (SI), Y7 + ADDQ $0x20, SI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 256(CX), Y5 + VMOVDQU 288(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 320(CX), Y5 + VMOVDQU 352(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 384(CX), Y5 + VMOVDQU 416(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 448(CX), Y5 + VMOVDQU 480(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 2 to 4 outputs + VMOVDQU (DI), Y7 + ADDQ $0x20, DI + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 512(CX), Y5 + VMOVDQU 544(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 576(CX), Y5 + VMOVDQU 608(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 640(CX), Y5 + VMOVDQU 672(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 704(CX), Y5 + VMOVDQU 736(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 3 to 4 outputs + VMOVDQU (R8), Y7 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 768(CX), Y5 + VMOVDQU 800(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 832(CX), Y5 + VMOVDQU 864(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 896(CX), Y5 + VMOVDQU 928(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 960(CX), Y5 + VMOVDQU 992(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 4 to 4 outputs + VMOVDQU (R9), Y7 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1024(CX), Y5 + VMOVDQU 1056(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1088(CX), Y5 + VMOVDQU 1120(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1152(CX), Y5 + VMOVDQU 1184(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1216(CX), Y5 + VMOVDQU 1248(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 5 to 4 outputs + VMOVDQU (R10), Y7 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1280(CX), Y5 + VMOVDQU 1312(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1344(CX), Y5 + VMOVDQU 1376(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1408(CX), Y5 + 
VMOVDQU 1440(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1472(CX), Y5 + VMOVDQU 1504(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 6 to 4 outputs + VMOVDQU (R11), Y7 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1536(CX), Y5 + VMOVDQU 1568(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1600(CX), Y5 + VMOVDQU 1632(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1664(CX), Y5 + VMOVDQU 1696(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1728(CX), Y5 + VMOVDQU 1760(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 7 to 4 outputs + VMOVDQU (R12), Y7 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 1792(CX), Y5 + VMOVDQU 1824(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 1856(CX), Y5 + VMOVDQU 1888(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 1920(CX), Y5 + VMOVDQU 1952(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 1984(CX), Y5 + VMOVDQU 2016(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 8 to 4 outputs + VMOVDQU (R13), Y7 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2048(CX), Y5 + VMOVDQU 2080(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2112(CX), Y5 + VMOVDQU 2144(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2176(CX), Y5 + VMOVDQU 2208(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2240(CX), Y5 + VMOVDQU 2272(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Load and process 32 bytes from input 9 to 4 outputs + VMOVDQU (DX), Y7 + ADDQ $0x20, DX + VPSRLQ $0x04, Y7, Y8 + VPAND Y4, Y7, Y7 + VPAND Y4, Y8, Y8 + VMOVDQU 2304(CX), Y5 + VMOVDQU 2336(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y0, Y0 + VMOVDQU 2368(CX), Y5 + VMOVDQU 2400(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y1, Y1 + VMOVDQU 2432(CX), Y5 + VMOVDQU 2464(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y2, Y2 + VMOVDQU 2496(CX), Y5 + VMOVDQU 2528(CX), Y6 + VPSHUFB Y7, Y5, Y5 + VPSHUFB Y8, Y6, Y6 + VPXOR Y5, Y6, Y5 + VPXOR Y5, Y3, Y3 + + // Store 4 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x4Xor_loop + VZEROUPPER + +mulAvxTwo_10x4Xor_end: + RET + // func mulAvxTwo_10x5(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 @@ -32214,13 +63061,6 @@ TEXT ·mulAvxTwo_10x5(SB), NOSPLIT, $8-88 VPBROADCASTB X5, Y5 mulAvxTwo_10x5_loop: - // Clear 5 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR 
Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - // Load and process 32 bytes from input 0 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -32231,32 +63071,27 @@ mulAvxTwo_10x5_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y0, Y0 + VPXOR Y6, Y7, Y0 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y1, Y1 + VPXOR Y6, Y7, Y1 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y2, Y2 + VPXOR Y6, Y7, Y2 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y3, Y3 + VPXOR Y6, Y7, Y3 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - VPXOR Y6, Y7, Y6 - VPXOR Y6, Y4, Y4 + VPXOR Y6, Y7, Y4 // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 @@ -32612,6 +63447,448 @@ mulAvxTwo_10x5_loop: mulAvxTwo_10x5_end: RET +// func mulAvxTwo_10x5Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x5Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 110 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x5Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X5 + VPBROADCASTB X5, Y5 + +mulAvxTwo_10x5Xor_loop: + // Load and process 32 bytes from input 0 to 5 outputs + VMOVDQU (BX), Y8 + ADDQ $0x20, BX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y6 + VMOVDQU 32(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y6 + VMOVDQU 96(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y6 + VMOVDQU 160(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y6 + VMOVDQU 224(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y6 + VMOVDQU 288(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 1 to 5 outputs + VMOVDQU (SI), Y8 + ADDQ $0x20, SI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 320(CX), Y6 + VMOVDQU 352(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 384(CX), Y6 + VMOVDQU 416(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 448(CX), Y6 + VMOVDQU 480(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 512(CX), Y6 + VMOVDQU 544(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 576(CX), Y6 + VMOVDQU 
608(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 2 to 5 outputs + VMOVDQU (DI), Y8 + ADDQ $0x20, DI + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 640(CX), Y6 + VMOVDQU 672(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 704(CX), Y6 + VMOVDQU 736(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 768(CX), Y6 + VMOVDQU 800(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 832(CX), Y6 + VMOVDQU 864(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 896(CX), Y6 + VMOVDQU 928(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 3 to 5 outputs + VMOVDQU (R8), Y8 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 960(CX), Y6 + VMOVDQU 992(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1024(CX), Y6 + VMOVDQU 1056(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1088(CX), Y6 + VMOVDQU 1120(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1152(CX), Y6 + VMOVDQU 1184(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1216(CX), Y6 + VMOVDQU 1248(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 4 to 5 outputs + VMOVDQU (R9), Y8 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1280(CX), Y6 + VMOVDQU 1312(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1344(CX), Y6 + VMOVDQU 1376(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1408(CX), Y6 + VMOVDQU 1440(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1472(CX), Y6 + VMOVDQU 1504(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1536(CX), Y6 + VMOVDQU 1568(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 5 to 5 outputs + VMOVDQU (R10), Y8 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1600(CX), Y6 + VMOVDQU 1632(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1664(CX), Y6 + VMOVDQU 1696(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 1728(CX), Y6 + VMOVDQU 1760(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 1792(CX), Y6 + VMOVDQU 1824(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 1856(CX), Y6 + VMOVDQU 1888(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 6 to 5 outputs + VMOVDQU (R11), Y8 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 1920(CX), Y6 + VMOVDQU 1952(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 1984(CX), Y6 + VMOVDQU 2016(CX), 
Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2048(CX), Y6 + VMOVDQU 2080(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2112(CX), Y6 + VMOVDQU 2144(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2176(CX), Y6 + VMOVDQU 2208(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 7 to 5 outputs + VMOVDQU (R12), Y8 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2240(CX), Y6 + VMOVDQU 2272(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2304(CX), Y6 + VMOVDQU 2336(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2368(CX), Y6 + VMOVDQU 2400(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2432(CX), Y6 + VMOVDQU 2464(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2496(CX), Y6 + VMOVDQU 2528(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 8 to 5 outputs + VMOVDQU (R13), Y8 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2560(CX), Y6 + VMOVDQU 2592(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2624(CX), Y6 + VMOVDQU 2656(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 2688(CX), Y6 + VMOVDQU 2720(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 2752(CX), Y6 + VMOVDQU 2784(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 2816(CX), Y6 + VMOVDQU 2848(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Load and process 32 bytes from input 9 to 5 outputs + VMOVDQU (DX), Y8 + ADDQ $0x20, DX + VPSRLQ $0x04, Y8, Y9 + VPAND Y5, Y8, Y8 + VPAND Y5, Y9, Y9 + VMOVDQU 2880(CX), Y6 + VMOVDQU 2912(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y0, Y0 + VMOVDQU 2944(CX), Y6 + VMOVDQU 2976(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y1, Y1 + VMOVDQU 3008(CX), Y6 + VMOVDQU 3040(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y2, Y2 + VMOVDQU 3072(CX), Y6 + VMOVDQU 3104(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y3, Y3 + VMOVDQU 3136(CX), Y6 + VMOVDQU 3168(CX), Y7 + VPSHUFB Y8, Y6, Y6 + VPSHUFB Y9, Y7, Y7 + VPXOR Y6, Y7, Y6 + VPXOR Y6, Y4, Y4 + + // Store 5 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x5Xor_loop + VZEROUPPER + +mulAvxTwo_10x5Xor_end: + RET + // func mulAvxTwo_10x6(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 @@ -32653,14 +63930,6 @@ TEXT ·mulAvxTwo_10x6(SB), NOSPLIT, $8-88 VPBROADCASTB X6, Y6 mulAvxTwo_10x6_loop: - // Clear 6 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, 
Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - // Load and process 32 bytes from input 0 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -32671,38 +63940,32 @@ mulAvxTwo_10x6_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y0, Y0 + VPXOR Y7, Y8, Y0 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y1, Y1 + VPXOR Y7, Y8, Y1 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y2, Y2 + VPXOR Y7, Y8, Y2 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y3, Y3 + VPXOR Y7, Y8, Y3 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y4, Y4 + VPXOR Y7, Y8, Y4 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - VPXOR Y7, Y8, Y7 - VPXOR Y7, Y5, Y5 + VPXOR Y7, Y8, Y5 // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 @@ -33114,6 +64377,512 @@ mulAvxTwo_10x6_loop: mulAvxTwo_10x6_end: RET +// func mulAvxTwo_10x6Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x6Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 131 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x6Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X6 + VPBROADCASTB X6, Y6 + +mulAvxTwo_10x6Xor_loop: + // Load and process 32 bytes from input 0 to 6 outputs + VMOVDQU (BX), Y9 + ADDQ $0x20, BX + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y7 + VMOVDQU 32(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y7 + VMOVDQU 96(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y7 + VMOVDQU 160(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y7 + VMOVDQU 224(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y7 + VMOVDQU 288(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y7 + VMOVDQU 352(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 1 to 6 outputs + VMOVDQU (SI), Y9 + ADDQ $0x20, SI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 384(CX), Y7 + VMOVDQU 416(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 448(CX), Y7 + VMOVDQU 480(CX), Y8 + VPSHUFB Y9, Y7, Y7 
+ VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 512(CX), Y7 + VMOVDQU 544(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 576(CX), Y7 + VMOVDQU 608(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 640(CX), Y7 + VMOVDQU 672(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 704(CX), Y7 + VMOVDQU 736(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 2 to 6 outputs + VMOVDQU (DI), Y9 + ADDQ $0x20, DI + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 768(CX), Y7 + VMOVDQU 800(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 832(CX), Y7 + VMOVDQU 864(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 896(CX), Y7 + VMOVDQU 928(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 960(CX), Y7 + VMOVDQU 992(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1024(CX), Y7 + VMOVDQU 1056(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1088(CX), Y7 + VMOVDQU 1120(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 3 to 6 outputs + VMOVDQU (R8), Y9 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1152(CX), Y7 + VMOVDQU 1184(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1216(CX), Y7 + VMOVDQU 1248(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1280(CX), Y7 + VMOVDQU 1312(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1344(CX), Y7 + VMOVDQU 1376(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1408(CX), Y7 + VMOVDQU 1440(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1472(CX), Y7 + VMOVDQU 1504(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 4 to 6 outputs + VMOVDQU (R9), Y9 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1536(CX), Y7 + VMOVDQU 1568(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1600(CX), Y7 + VMOVDQU 1632(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 1664(CX), Y7 + VMOVDQU 1696(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 1728(CX), Y7 + VMOVDQU 1760(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 1792(CX), Y7 + VMOVDQU 1824(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 1856(CX), Y7 + VMOVDQU 1888(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 5 to 6 outputs + VMOVDQU (R10), Y9 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 1920(CX), Y7 + VMOVDQU 1952(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB 
Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 1984(CX), Y7 + VMOVDQU 2016(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2048(CX), Y7 + VMOVDQU 2080(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2112(CX), Y7 + VMOVDQU 2144(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2176(CX), Y7 + VMOVDQU 2208(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2240(CX), Y7 + VMOVDQU 2272(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 6 to 6 outputs + VMOVDQU (R11), Y9 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2304(CX), Y7 + VMOVDQU 2336(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2368(CX), Y7 + VMOVDQU 2400(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2432(CX), Y7 + VMOVDQU 2464(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2496(CX), Y7 + VMOVDQU 2528(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2560(CX), Y7 + VMOVDQU 2592(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 2624(CX), Y7 + VMOVDQU 2656(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 7 to 6 outputs + VMOVDQU (R12), Y9 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 2688(CX), Y7 + VMOVDQU 2720(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 2752(CX), Y7 + VMOVDQU 2784(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 2816(CX), Y7 + VMOVDQU 2848(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 2880(CX), Y7 + VMOVDQU 2912(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 2944(CX), Y7 + VMOVDQU 2976(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3008(CX), Y7 + VMOVDQU 3040(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 8 to 6 outputs + VMOVDQU (R13), Y9 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3072(CX), Y7 + VMOVDQU 3104(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3136(CX), Y7 + VMOVDQU 3168(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3200(CX), Y7 + VMOVDQU 3232(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3264(CX), Y7 + VMOVDQU 3296(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3328(CX), Y7 + VMOVDQU 3360(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3392(CX), Y7 + VMOVDQU 3424(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Load and process 32 bytes from input 9 to 6 outputs + VMOVDQU (DX), Y9 + ADDQ $0x20, DX + 
VPSRLQ $0x04, Y9, Y10 + VPAND Y6, Y9, Y9 + VPAND Y6, Y10, Y10 + VMOVDQU 3456(CX), Y7 + VMOVDQU 3488(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y0, Y0 + VMOVDQU 3520(CX), Y7 + VMOVDQU 3552(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y1, Y1 + VMOVDQU 3584(CX), Y7 + VMOVDQU 3616(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y2, Y2 + VMOVDQU 3648(CX), Y7 + VMOVDQU 3680(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y3, Y3 + VMOVDQU 3712(CX), Y7 + VMOVDQU 3744(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y4, Y4 + VMOVDQU 3776(CX), Y7 + VMOVDQU 3808(CX), Y8 + VPSHUFB Y9, Y7, Y7 + VPSHUFB Y10, Y8, Y8 + VPXOR Y7, Y8, Y7 + VPXOR Y7, Y5, Y5 + + // Store 6 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x6Xor_loop + VZEROUPPER + +mulAvxTwo_10x6Xor_end: + RET + // func mulAvxTwo_10x7(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 @@ -33155,15 +64924,6 @@ TEXT ·mulAvxTwo_10x7(SB), NOSPLIT, $8-88 VPBROADCASTB X7, Y7 mulAvxTwo_10x7_loop: - // Clear 7 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - // Load and process 32 bytes from input 0 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -33174,44 +64934,37 @@ mulAvxTwo_10x7_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y0, Y0 + VPXOR Y8, Y9, Y0 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y1, Y1 + VPXOR Y8, Y9, Y1 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y2, Y2 + VPXOR Y8, Y9, Y2 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y3, Y3 + VPXOR Y8, Y9, Y3 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y4, Y4 + VPXOR Y8, Y9, Y4 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y5, Y5 + VPXOR Y8, Y9, Y5 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - VPXOR Y8, Y9, Y8 - VPXOR Y8, Y6, Y6 + VPXOR Y8, Y9, Y6 // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 @@ -33679,6 +65432,576 @@ mulAvxTwo_10x7_loop: mulAvxTwo_10x7_end: RET +// func mulAvxTwo_10x7Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x7Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 152 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x7Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to 
input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X7 + VPBROADCASTB X7, Y7 + +mulAvxTwo_10x7Xor_loop: + // Load and process 32 bytes from input 0 to 7 outputs + VMOVDQU (BX), Y10 + ADDQ $0x20, BX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y8 + VMOVDQU 32(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y8 + VMOVDQU 96(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y8 + VMOVDQU 160(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y8 + VMOVDQU 224(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y8 + VMOVDQU 288(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y8 + VMOVDQU 352(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y8 + VMOVDQU 416(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 1 to 7 outputs + VMOVDQU (SI), Y10 + ADDQ $0x20, SI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 448(CX), Y8 + VMOVDQU 480(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 512(CX), Y8 + VMOVDQU 544(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 576(CX), Y8 + VMOVDQU 608(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 640(CX), Y8 + VMOVDQU 672(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 704(CX), Y8 + VMOVDQU 736(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 768(CX), Y8 + VMOVDQU 800(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 832(CX), Y8 + VMOVDQU 864(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 2 to 7 outputs + VMOVDQU (DI), Y10 + ADDQ $0x20, DI + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 896(CX), Y8 + VMOVDQU 928(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 960(CX), Y8 + VMOVDQU 992(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1024(CX), Y8 + VMOVDQU 1056(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1088(CX), Y8 + VMOVDQU 1120(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1152(CX), Y8 + VMOVDQU 1184(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1216(CX), Y8 + VMOVDQU 1248(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + 
VPXOR Y8, Y5, Y5 + VMOVDQU 1280(CX), Y8 + VMOVDQU 1312(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 3 to 7 outputs + VMOVDQU (R8), Y10 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1344(CX), Y8 + VMOVDQU 1376(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1408(CX), Y8 + VMOVDQU 1440(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1472(CX), Y8 + VMOVDQU 1504(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1536(CX), Y8 + VMOVDQU 1568(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 1600(CX), Y8 + VMOVDQU 1632(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 1664(CX), Y8 + VMOVDQU 1696(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 1728(CX), Y8 + VMOVDQU 1760(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 4 to 7 outputs + VMOVDQU (R9), Y10 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 1792(CX), Y8 + VMOVDQU 1824(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 1856(CX), Y8 + VMOVDQU 1888(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 1920(CX), Y8 + VMOVDQU 1952(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 1984(CX), Y8 + VMOVDQU 2016(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2048(CX), Y8 + VMOVDQU 2080(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2112(CX), Y8 + VMOVDQU 2144(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2176(CX), Y8 + VMOVDQU 2208(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 5 to 7 outputs + VMOVDQU (R10), Y10 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2240(CX), Y8 + VMOVDQU 2272(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2304(CX), Y8 + VMOVDQU 2336(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2368(CX), Y8 + VMOVDQU 2400(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2432(CX), Y8 + VMOVDQU 2464(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2496(CX), Y8 + VMOVDQU 2528(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 2560(CX), Y8 + VMOVDQU 2592(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 2624(CX), Y8 + VMOVDQU 2656(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 6 to 7 outputs + VMOVDQU (R11), Y10 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 2688(CX), Y8 + VMOVDQU 2720(CX), Y9 + VPSHUFB Y10, Y8, 
Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 2752(CX), Y8 + VMOVDQU 2784(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 2816(CX), Y8 + VMOVDQU 2848(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 2880(CX), Y8 + VMOVDQU 2912(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 2944(CX), Y8 + VMOVDQU 2976(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3008(CX), Y8 + VMOVDQU 3040(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3072(CX), Y8 + VMOVDQU 3104(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 7 to 7 outputs + VMOVDQU (R12), Y10 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3136(CX), Y8 + VMOVDQU 3168(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3200(CX), Y8 + VMOVDQU 3232(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3264(CX), Y8 + VMOVDQU 3296(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3328(CX), Y8 + VMOVDQU 3360(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3392(CX), Y8 + VMOVDQU 3424(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3456(CX), Y8 + VMOVDQU 3488(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3520(CX), Y8 + VMOVDQU 3552(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 8 to 7 outputs + VMOVDQU (R13), Y10 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 3584(CX), Y8 + VMOVDQU 3616(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 3648(CX), Y8 + VMOVDQU 3680(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 3712(CX), Y8 + VMOVDQU 3744(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 3776(CX), Y8 + VMOVDQU 3808(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 3840(CX), Y8 + VMOVDQU 3872(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 3904(CX), Y8 + VMOVDQU 3936(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 3968(CX), Y8 + VMOVDQU 4000(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Load and process 32 bytes from input 9 to 7 outputs + VMOVDQU (DX), Y10 + ADDQ $0x20, DX + VPSRLQ $0x04, Y10, Y11 + VPAND Y7, Y10, Y10 + VPAND Y7, Y11, Y11 + VMOVDQU 4032(CX), Y8 + VMOVDQU 4064(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y0, Y0 + VMOVDQU 4096(CX), Y8 + VMOVDQU 4128(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y1, Y1 + VMOVDQU 4160(CX), Y8 + VMOVDQU 4192(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y2, Y2 + VMOVDQU 4224(CX), Y8 + VMOVDQU 4256(CX), Y9 + 
VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y3, Y3 + VMOVDQU 4288(CX), Y8 + VMOVDQU 4320(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y4, Y4 + VMOVDQU 4352(CX), Y8 + VMOVDQU 4384(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y5, Y5 + VMOVDQU 4416(CX), Y8 + VMOVDQU 4448(CX), Y9 + VPSHUFB Y10, Y8, Y8 + VPSHUFB Y11, Y9, Y9 + VPXOR Y8, Y9, Y8 + VPXOR Y8, Y6, Y6 + + // Store 7 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x7Xor_loop + VZEROUPPER + +mulAvxTwo_10x7Xor_end: + RET + // func mulAvxTwo_10x8(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 @@ -33720,16 +66043,6 @@ TEXT ·mulAvxTwo_10x8(SB), NOSPLIT, $8-88 VPBROADCASTB X8, Y8 mulAvxTwo_10x8_loop: - // Clear 8 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - // Load and process 32 bytes from input 0 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -33740,50 +66053,42 @@ mulAvxTwo_10x8_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y0, Y0 + VPXOR Y9, Y10, Y0 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y1, Y1 + VPXOR Y9, Y10, Y1 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y2, Y2 + VPXOR Y9, Y10, Y2 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y3, Y3 + VPXOR Y9, Y10, Y3 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y4, Y4 + VPXOR Y9, Y10, Y4 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y5, Y5 + VPXOR Y9, Y10, Y5 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y6, Y6 + VPXOR Y9, Y10, Y6 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - VPXOR Y9, Y10, Y9 - VPXOR Y9, Y7, Y7 + VPXOR Y9, Y10, Y7 // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 @@ -34307,6 +66612,640 @@ mulAvxTwo_10x8_loop: mulAvxTwo_10x8_end: RET +// func mulAvxTwo_10x8Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x8Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 173 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x8Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ 
R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X8 + VPBROADCASTB X8, Y8 + +mulAvxTwo_10x8Xor_loop: + // Load and process 32 bytes from input 0 to 8 outputs + VMOVDQU (BX), Y11 + ADDQ $0x20, BX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y9 + VMOVDQU 32(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y9 + VMOVDQU 96(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y9 + VMOVDQU 160(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y9 + VMOVDQU 224(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y9 + VMOVDQU 288(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y9 + VMOVDQU 352(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y9 + VMOVDQU 416(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y9 + VMOVDQU 480(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 1 to 8 outputs + VMOVDQU (SI), Y11 + ADDQ $0x20, SI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 512(CX), Y9 + VMOVDQU 544(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 576(CX), Y9 + VMOVDQU 608(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 640(CX), Y9 + VMOVDQU 672(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 704(CX), Y9 + VMOVDQU 736(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 768(CX), Y9 + VMOVDQU 800(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 832(CX), Y9 + VMOVDQU 864(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 896(CX), Y9 + VMOVDQU 928(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 960(CX), Y9 + VMOVDQU 992(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 2 to 8 outputs + VMOVDQU (DI), Y11 + ADDQ $0x20, DI + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1024(CX), Y9 + VMOVDQU 1056(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1088(CX), Y9 + VMOVDQU 1120(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1152(CX), Y9 + VMOVDQU 1184(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1216(CX), Y9 + VMOVDQU 1248(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, 
Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1280(CX), Y9 + VMOVDQU 1312(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1344(CX), Y9 + VMOVDQU 1376(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1408(CX), Y9 + VMOVDQU 1440(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1472(CX), Y9 + VMOVDQU 1504(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 3 to 8 outputs + VMOVDQU (R8), Y11 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 1536(CX), Y9 + VMOVDQU 1568(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 1600(CX), Y9 + VMOVDQU 1632(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 1664(CX), Y9 + VMOVDQU 1696(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 1728(CX), Y9 + VMOVDQU 1760(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 1792(CX), Y9 + VMOVDQU 1824(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 1856(CX), Y9 + VMOVDQU 1888(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 1920(CX), Y9 + VMOVDQU 1952(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 1984(CX), Y9 + VMOVDQU 2016(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 4 to 8 outputs + VMOVDQU (R9), Y11 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2048(CX), Y9 + VMOVDQU 2080(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2112(CX), Y9 + VMOVDQU 2144(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2176(CX), Y9 + VMOVDQU 2208(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 2240(CX), Y9 + VMOVDQU 2272(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2304(CX), Y9 + VMOVDQU 2336(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2368(CX), Y9 + VMOVDQU 2400(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2432(CX), Y9 + VMOVDQU 2464(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 2496(CX), Y9 + VMOVDQU 2528(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 5 to 8 outputs + VMOVDQU (R10), Y11 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 2560(CX), Y9 + VMOVDQU 2592(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 2624(CX), Y9 + VMOVDQU 2656(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 2688(CX), Y9 + VMOVDQU 2720(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + 
VMOVDQU 2752(CX), Y9 + VMOVDQU 2784(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 2816(CX), Y9 + VMOVDQU 2848(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 2880(CX), Y9 + VMOVDQU 2912(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 2944(CX), Y9 + VMOVDQU 2976(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3008(CX), Y9 + VMOVDQU 3040(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 6 to 8 outputs + VMOVDQU (R11), Y11 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3072(CX), Y9 + VMOVDQU 3104(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3136(CX), Y9 + VMOVDQU 3168(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3200(CX), Y9 + VMOVDQU 3232(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3264(CX), Y9 + VMOVDQU 3296(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3328(CX), Y9 + VMOVDQU 3360(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3392(CX), Y9 + VMOVDQU 3424(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3456(CX), Y9 + VMOVDQU 3488(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 3520(CX), Y9 + VMOVDQU 3552(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 7 to 8 outputs + VMOVDQU (R12), Y11 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 3584(CX), Y9 + VMOVDQU 3616(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 3648(CX), Y9 + VMOVDQU 3680(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 3712(CX), Y9 + VMOVDQU 3744(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 3776(CX), Y9 + VMOVDQU 3808(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 3840(CX), Y9 + VMOVDQU 3872(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 3904(CX), Y9 + VMOVDQU 3936(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 3968(CX), Y9 + VMOVDQU 4000(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4032(CX), Y9 + VMOVDQU 4064(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 8 to 8 outputs + VMOVDQU (R13), Y11 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4096(CX), Y9 + VMOVDQU 4128(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4160(CX), Y9 + VMOVDQU 4192(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4224(CX), Y9 + 
VMOVDQU 4256(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4288(CX), Y9 + VMOVDQU 4320(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4352(CX), Y9 + VMOVDQU 4384(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4416(CX), Y9 + VMOVDQU 4448(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4480(CX), Y9 + VMOVDQU 4512(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 4544(CX), Y9 + VMOVDQU 4576(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Load and process 32 bytes from input 9 to 8 outputs + VMOVDQU (DX), Y11 + ADDQ $0x20, DX + VPSRLQ $0x04, Y11, Y12 + VPAND Y8, Y11, Y11 + VPAND Y8, Y12, Y12 + VMOVDQU 4608(CX), Y9 + VMOVDQU 4640(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y0, Y0 + VMOVDQU 4672(CX), Y9 + VMOVDQU 4704(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y1, Y1 + VMOVDQU 4736(CX), Y9 + VMOVDQU 4768(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y2, Y2 + VMOVDQU 4800(CX), Y9 + VMOVDQU 4832(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y3, Y3 + VMOVDQU 4864(CX), Y9 + VMOVDQU 4896(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y4, Y4 + VMOVDQU 4928(CX), Y9 + VMOVDQU 4960(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y5, Y5 + VMOVDQU 4992(CX), Y9 + VMOVDQU 5024(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y6, Y6 + VMOVDQU 5056(CX), Y9 + VMOVDQU 5088(CX), Y10 + VPSHUFB Y11, Y9, Y9 + VPSHUFB Y12, Y10, Y10 + VPXOR Y9, Y10, Y9 + VPXOR Y9, Y7, Y7 + + // Store 8 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x8Xor_loop + VZEROUPPER + +mulAvxTwo_10x8Xor_end: + RET + // func mulAvxTwo_10x9(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 @@ -34348,17 +67287,6 @@ TEXT ·mulAvxTwo_10x9(SB), NOSPLIT, $8-88 VPBROADCASTB X9, Y9 mulAvxTwo_10x9_loop: - // Clear 9 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - // Load and process 32 bytes from input 0 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -34369,56 +67297,47 @@ mulAvxTwo_10x9_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y0, Y0 + VPXOR Y10, Y11, Y0 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y1, Y1 + VPXOR Y10, Y11, Y1 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y2, Y2 + VPXOR Y10, Y11, Y2 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), 
Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y3, Y3 + VPXOR Y10, Y11, Y3 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y4, Y4 + VPXOR Y10, Y11, Y4 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y5, Y5 + VPXOR Y10, Y11, Y5 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y6, Y6 + VPXOR Y10, Y11, Y6 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y7, Y7 + VPXOR Y10, Y11, Y7 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - VPXOR Y10, Y11, Y10 - VPXOR Y10, Y8, Y8 + VPXOR Y10, Y11, Y8 // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 @@ -34998,6 +67917,704 @@ mulAvxTwo_10x9_loop: mulAvxTwo_10x9_end: RET +// func mulAvxTwo_10x9Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x9Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 194 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x9Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X9 + VPBROADCASTB X9, Y9 + +mulAvxTwo_10x9Xor_loop: + // Load and process 32 bytes from input 0 to 9 outputs + VMOVDQU (BX), Y12 + ADDQ $0x20, BX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y10 + VMOVDQU 32(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 + VMOVDQU 64(CX), Y10 + VMOVDQU 96(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 + VMOVDQU 128(CX), Y10 + VMOVDQU 160(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 + VMOVDQU 192(CX), Y10 + VMOVDQU 224(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 + VMOVDQU 256(CX), Y10 + VMOVDQU 288(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 + VMOVDQU 320(CX), Y10 + VMOVDQU 352(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 + VMOVDQU 384(CX), Y10 + VMOVDQU 416(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 + VMOVDQU 448(CX), Y10 + VMOVDQU 480(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + MOVQ 192(R14), BP + VMOVDQU 
(BP)(R15*1), Y8 + VMOVDQU 512(CX), Y10 + VMOVDQU 544(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 1 to 9 outputs + VMOVDQU (SI), Y12 + ADDQ $0x20, SI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 576(CX), Y10 + VMOVDQU 608(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 640(CX), Y10 + VMOVDQU 672(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 704(CX), Y10 + VMOVDQU 736(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 768(CX), Y10 + VMOVDQU 800(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 832(CX), Y10 + VMOVDQU 864(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 896(CX), Y10 + VMOVDQU 928(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 960(CX), Y10 + VMOVDQU 992(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1024(CX), Y10 + VMOVDQU 1056(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1088(CX), Y10 + VMOVDQU 1120(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 2 to 9 outputs + VMOVDQU (DI), Y12 + ADDQ $0x20, DI + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1152(CX), Y10 + VMOVDQU 1184(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1216(CX), Y10 + VMOVDQU 1248(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1280(CX), Y10 + VMOVDQU 1312(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1344(CX), Y10 + VMOVDQU 1376(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1408(CX), Y10 + VMOVDQU 1440(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 1472(CX), Y10 + VMOVDQU 1504(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 1536(CX), Y10 + VMOVDQU 1568(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 1600(CX), Y10 + VMOVDQU 1632(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 1664(CX), Y10 + VMOVDQU 1696(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 3 to 9 outputs + VMOVDQU (R8), Y12 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 1728(CX), Y10 + VMOVDQU 1760(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 1792(CX), Y10 + VMOVDQU 1824(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 1856(CX), Y10 + VMOVDQU 1888(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 1920(CX), Y10 + 
VMOVDQU 1952(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 1984(CX), Y10 + VMOVDQU 2016(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2048(CX), Y10 + VMOVDQU 2080(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2112(CX), Y10 + VMOVDQU 2144(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2176(CX), Y10 + VMOVDQU 2208(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2240(CX), Y10 + VMOVDQU 2272(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 4 to 9 outputs + VMOVDQU (R9), Y12 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2304(CX), Y10 + VMOVDQU 2336(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2368(CX), Y10 + VMOVDQU 2400(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 2432(CX), Y10 + VMOVDQU 2464(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 2496(CX), Y10 + VMOVDQU 2528(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 2560(CX), Y10 + VMOVDQU 2592(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 2624(CX), Y10 + VMOVDQU 2656(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 2688(CX), Y10 + VMOVDQU 2720(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 2752(CX), Y10 + VMOVDQU 2784(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 2816(CX), Y10 + VMOVDQU 2848(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 5 to 9 outputs + VMOVDQU (R10), Y12 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 2880(CX), Y10 + VMOVDQU 2912(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 2944(CX), Y10 + VMOVDQU 2976(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3008(CX), Y10 + VMOVDQU 3040(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3072(CX), Y10 + VMOVDQU 3104(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3136(CX), Y10 + VMOVDQU 3168(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3200(CX), Y10 + VMOVDQU 3232(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3264(CX), Y10 + VMOVDQU 3296(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3328(CX), Y10 + VMOVDQU 3360(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3392(CX), Y10 + VMOVDQU 3424(CX), Y11 + VPSHUFB Y12, Y10, Y10 + 
VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 6 to 9 outputs + VMOVDQU (R11), Y12 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 3456(CX), Y10 + VMOVDQU 3488(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 3520(CX), Y10 + VMOVDQU 3552(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 3584(CX), Y10 + VMOVDQU 3616(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 3648(CX), Y10 + VMOVDQU 3680(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 3712(CX), Y10 + VMOVDQU 3744(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 3776(CX), Y10 + VMOVDQU 3808(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 3840(CX), Y10 + VMOVDQU 3872(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 3904(CX), Y10 + VMOVDQU 3936(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 3968(CX), Y10 + VMOVDQU 4000(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 7 to 9 outputs + VMOVDQU (R12), Y12 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4032(CX), Y10 + VMOVDQU 4064(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4096(CX), Y10 + VMOVDQU 4128(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4160(CX), Y10 + VMOVDQU 4192(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4224(CX), Y10 + VMOVDQU 4256(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4288(CX), Y10 + VMOVDQU 4320(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4352(CX), Y10 + VMOVDQU 4384(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4416(CX), Y10 + VMOVDQU 4448(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 4480(CX), Y10 + VMOVDQU 4512(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 4544(CX), Y10 + VMOVDQU 4576(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 8 to 9 outputs + VMOVDQU (R13), Y12 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 4608(CX), Y10 + VMOVDQU 4640(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 4672(CX), Y10 + VMOVDQU 4704(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 4736(CX), Y10 + VMOVDQU 4768(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 4800(CX), Y10 + VMOVDQU 4832(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + 
VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 4864(CX), Y10 + VMOVDQU 4896(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 4928(CX), Y10 + VMOVDQU 4960(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 4992(CX), Y10 + VMOVDQU 5024(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5056(CX), Y10 + VMOVDQU 5088(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5120(CX), Y10 + VMOVDQU 5152(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Load and process 32 bytes from input 9 to 9 outputs + VMOVDQU (DX), Y12 + ADDQ $0x20, DX + VPSRLQ $0x04, Y12, Y13 + VPAND Y9, Y12, Y12 + VPAND Y9, Y13, Y13 + VMOVDQU 5184(CX), Y10 + VMOVDQU 5216(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y0, Y0 + VMOVDQU 5248(CX), Y10 + VMOVDQU 5280(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y1, Y1 + VMOVDQU 5312(CX), Y10 + VMOVDQU 5344(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y2, Y2 + VMOVDQU 5376(CX), Y10 + VMOVDQU 5408(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y3, Y3 + VMOVDQU 5440(CX), Y10 + VMOVDQU 5472(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y4, Y4 + VMOVDQU 5504(CX), Y10 + VMOVDQU 5536(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y5, Y5 + VMOVDQU 5568(CX), Y10 + VMOVDQU 5600(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y6, Y6 + VMOVDQU 5632(CX), Y10 + VMOVDQU 5664(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y7, Y7 + VMOVDQU 5696(CX), Y10 + VMOVDQU 5728(CX), Y11 + VPSHUFB Y12, Y10, Y10 + VPSHUFB Y13, Y11, Y11 + VPXOR Y10, Y11, Y10 + VPXOR Y10, Y8, Y8 + + // Store 9 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x9Xor_loop + VZEROUPPER + +mulAvxTwo_10x9Xor_end: + RET + // func mulAvxTwo_10x10(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 @@ -35039,18 +68656,6 @@ TEXT ·mulAvxTwo_10x10(SB), NOSPLIT, $8-88 VPBROADCASTB X10, Y10 mulAvxTwo_10x10_loop: - // Clear 10 outputs - VPXOR Y0, Y0, Y0 - VPXOR Y1, Y1, Y1 - VPXOR Y2, Y2, Y2 - VPXOR Y3, Y3, Y3 - VPXOR Y4, Y4, Y4 - VPXOR Y5, Y5, Y5 - VPXOR Y6, Y6, Y6 - VPXOR Y7, Y7, Y7 - VPXOR Y8, Y8, Y8 - VPXOR Y9, Y9, Y9 - // Load and process 32 bytes from input 0 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -35061,56 +68666,808 @@ mulAvxTwo_10x10_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y0 + VMOVDQU 64(CX), Y11 + VMOVDQU 96(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y1 + VMOVDQU 128(CX), Y11 + VMOVDQU 
160(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y2 + VMOVDQU 192(CX), Y11 + VMOVDQU 224(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y3 + VMOVDQU 256(CX), Y11 + VMOVDQU 288(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y4 + VMOVDQU 320(CX), Y11 + VMOVDQU 352(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y5 + VMOVDQU 384(CX), Y11 + VMOVDQU 416(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y6 + VMOVDQU 448(CX), Y11 + VMOVDQU 480(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y7 + VMOVDQU 512(CX), Y11 + VMOVDQU 544(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y8 + VMOVDQU 576(CX), Y11 + VMOVDQU 608(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y9 + + // Load and process 32 bytes from input 1 to 10 outputs + VMOVDQU (SI), Y13 + ADDQ $0x20, SI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 640(CX), Y11 + VMOVDQU 672(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y0, Y0 + VMOVDQU 704(CX), Y11 + VMOVDQU 736(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 768(CX), Y11 + VMOVDQU 800(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 832(CX), Y11 + VMOVDQU 864(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 896(CX), Y11 + VMOVDQU 928(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 960(CX), Y11 + VMOVDQU 992(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1024(CX), Y11 + VMOVDQU 1056(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 1088(CX), Y11 + VMOVDQU 1120(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1152(CX), Y11 + VMOVDQU 1184(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1216(CX), Y11 + VMOVDQU 1248(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 2 to 10 outputs + VMOVDQU (DI), Y13 + ADDQ $0x20, DI + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1280(CX), Y11 + VMOVDQU 1312(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1344(CX), Y11 + VMOVDQU 1376(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 1408(CX), Y11 + VMOVDQU 1440(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 1472(CX), Y11 + VMOVDQU 1504(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 1536(CX), Y11 + VMOVDQU 1568(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 1600(CX), Y11 + VMOVDQU 1632(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 1664(CX), Y11 + VMOVDQU 1696(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + 
VMOVDQU 1728(CX), Y11 + VMOVDQU 1760(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 1792(CX), Y11 + VMOVDQU 1824(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 1856(CX), Y11 + VMOVDQU 1888(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 3 to 10 outputs + VMOVDQU (R8), Y13 + ADDQ $0x20, R8 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 1920(CX), Y11 + VMOVDQU 1952(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 1984(CX), Y11 + VMOVDQU 2016(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2048(CX), Y11 + VMOVDQU 2080(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2112(CX), Y11 + VMOVDQU 2144(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2176(CX), Y11 + VMOVDQU 2208(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2240(CX), Y11 + VMOVDQU 2272(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2304(CX), Y11 + VMOVDQU 2336(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 2368(CX), Y11 + VMOVDQU 2400(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 2432(CX), Y11 + VMOVDQU 2464(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 2496(CX), Y11 + VMOVDQU 2528(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 4 to 10 outputs + VMOVDQU (R9), Y13 + ADDQ $0x20, R9 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 2560(CX), Y11 + VMOVDQU 2592(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 2624(CX), Y11 + VMOVDQU 2656(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 2688(CX), Y11 + VMOVDQU 2720(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 2752(CX), Y11 + VMOVDQU 2784(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 2816(CX), Y11 + VMOVDQU 2848(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 2880(CX), Y11 + VMOVDQU 2912(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 2944(CX), Y11 + VMOVDQU 2976(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3008(CX), Y11 + VMOVDQU 3040(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3072(CX), Y11 + VMOVDQU 3104(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3136(CX), Y11 + VMOVDQU 3168(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 5 to 
10 outputs + VMOVDQU (R10), Y13 + ADDQ $0x20, R10 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3200(CX), Y11 + VMOVDQU 3232(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3264(CX), Y11 + VMOVDQU 3296(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3328(CX), Y11 + VMOVDQU 3360(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 3392(CX), Y11 + VMOVDQU 3424(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 3456(CX), Y11 + VMOVDQU 3488(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 3520(CX), Y11 + VMOVDQU 3552(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 3584(CX), Y11 + VMOVDQU 3616(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 3648(CX), Y11 + VMOVDQU 3680(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 3712(CX), Y11 + VMOVDQU 3744(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 3776(CX), Y11 + VMOVDQU 3808(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 6 to 10 outputs + VMOVDQU (R11), Y13 + ADDQ $0x20, R11 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 3840(CX), Y11 + VMOVDQU 3872(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 3904(CX), Y11 + VMOVDQU 3936(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 3968(CX), Y11 + VMOVDQU 4000(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4032(CX), Y11 + VMOVDQU 4064(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4096(CX), Y11 + VMOVDQU 4128(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4160(CX), Y11 + VMOVDQU 4192(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4224(CX), Y11 + VMOVDQU 4256(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4288(CX), Y11 + VMOVDQU 4320(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4352(CX), Y11 + VMOVDQU 4384(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 4416(CX), Y11 + VMOVDQU 4448(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 7 to 10 outputs + VMOVDQU (R12), Y13 + ADDQ $0x20, R12 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 4480(CX), Y11 + VMOVDQU 4512(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 4544(CX), Y11 + VMOVDQU 4576(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 4608(CX), Y11 + VMOVDQU 4640(CX), Y12 + VPSHUFB Y13, 
Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 4672(CX), Y11 + VMOVDQU 4704(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 4736(CX), Y11 + VMOVDQU 4768(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 4800(CX), Y11 + VMOVDQU 4832(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 4864(CX), Y11 + VMOVDQU 4896(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 4928(CX), Y11 + VMOVDQU 4960(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 4992(CX), Y11 + VMOVDQU 5024(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5056(CX), Y11 + VMOVDQU 5088(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 8 to 10 outputs + VMOVDQU (R13), Y13 + ADDQ $0x20, R13 + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5120(CX), Y11 + VMOVDQU 5152(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5184(CX), Y11 + VMOVDQU 5216(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5248(CX), Y11 + VMOVDQU 5280(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5312(CX), Y11 + VMOVDQU 5344(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 5376(CX), Y11 + VMOVDQU 5408(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 5440(CX), Y11 + VMOVDQU 5472(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 5504(CX), Y11 + VMOVDQU 5536(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 5568(CX), Y11 + VMOVDQU 5600(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 5632(CX), Y11 + VMOVDQU 5664(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 5696(CX), Y11 + VMOVDQU 5728(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Load and process 32 bytes from input 9 to 10 outputs + VMOVDQU (DX), Y13 + ADDQ $0x20, DX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + VMOVDQU 5760(CX), Y11 + VMOVDQU 5792(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + VMOVDQU 5824(CX), Y11 + VMOVDQU 5856(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y1, Y1 + VMOVDQU 5888(CX), Y11 + VMOVDQU 5920(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y2, Y2 + VMOVDQU 5952(CX), Y11 + VMOVDQU 5984(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y3, Y3 + VMOVDQU 6016(CX), Y11 + VMOVDQU 6048(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y4, Y4 + VMOVDQU 6080(CX), Y11 + VMOVDQU 6112(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, 
Y12, Y11 + VPXOR Y11, Y5, Y5 + VMOVDQU 6144(CX), Y11 + VMOVDQU 6176(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y6, Y6 + VMOVDQU 6208(CX), Y11 + VMOVDQU 6240(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y7, Y7 + VMOVDQU 6272(CX), Y11 + VMOVDQU 6304(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y8, Y8 + VMOVDQU 6336(CX), Y11 + VMOVDQU 6368(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y9, Y9 + + // Store 10 outputs + MOVQ (R14), BP + VMOVDQU Y0, (BP)(R15*1) + MOVQ 24(R14), BP + VMOVDQU Y1, (BP)(R15*1) + MOVQ 48(R14), BP + VMOVDQU Y2, (BP)(R15*1) + MOVQ 72(R14), BP + VMOVDQU Y3, (BP)(R15*1) + MOVQ 96(R14), BP + VMOVDQU Y4, (BP)(R15*1) + MOVQ 120(R14), BP + VMOVDQU Y5, (BP)(R15*1) + MOVQ 144(R14), BP + VMOVDQU Y6, (BP)(R15*1) + MOVQ 168(R14), BP + VMOVDQU Y7, (BP)(R15*1) + MOVQ 192(R14), BP + VMOVDQU Y8, (BP)(R15*1) + MOVQ 216(R14), BP + VMOVDQU Y9, (BP)(R15*1) + + // Prepare for next loop + ADDQ $0x20, R15 + DECQ AX + JNZ mulAvxTwo_10x10_loop + VZEROUPPER + +mulAvxTwo_10x10_end: + RET + +// func mulAvxTwo_10x10Xor(matrix []byte, in [][]byte, out [][]byte, start int, n int) +// Requires: AVX, AVX2, SSE2 +TEXT ·mulAvxTwo_10x10Xor(SB), NOSPLIT, $8-88 + // Loading no tables to registers + // Destination kept on stack + // Full registers estimated 215 YMM used + MOVQ n+80(FP), AX + MOVQ matrix_base+0(FP), CX + SHRQ $0x05, AX + TESTQ AX, AX + JZ mulAvxTwo_10x10Xor_end + MOVQ in_base+24(FP), DX + MOVQ (DX), BX + MOVQ 24(DX), SI + MOVQ 48(DX), DI + MOVQ 72(DX), R8 + MOVQ 96(DX), R9 + MOVQ 120(DX), R10 + MOVQ 144(DX), R11 + MOVQ 168(DX), R12 + MOVQ 192(DX), R13 + MOVQ 216(DX), DX + MOVQ out_base+48(FP), R14 + MOVQ start+72(FP), R15 + + // Add start offset to input + ADDQ R15, BX + ADDQ R15, SI + ADDQ R15, DI + ADDQ R15, R8 + ADDQ R15, R9 + ADDQ R15, R10 + ADDQ R15, R11 + ADDQ R15, R12 + ADDQ R15, R13 + ADDQ R15, DX + MOVQ $0x0000000f, BP + MOVQ BP, X10 + VPBROADCASTB X10, Y10 + +mulAvxTwo_10x10Xor_loop: + // Load and process 32 bytes from input 0 to 10 outputs + VMOVDQU (BX), Y13 + ADDQ $0x20, BX + VPSRLQ $0x04, Y13, Y14 + VPAND Y10, Y13, Y13 + VPAND Y10, Y14, Y14 + MOVQ (R14), BP + VMOVDQU (BP)(R15*1), Y0 + VMOVDQU (CX), Y11 + VMOVDQU 32(CX), Y12 + VPSHUFB Y13, Y11, Y11 + VPSHUFB Y14, Y12, Y12 + VPXOR Y11, Y12, Y11 + VPXOR Y11, Y0, Y0 + MOVQ 24(R14), BP + VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y1, Y1 + MOVQ 48(R14), BP + VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y2, Y2 + MOVQ 72(R14), BP + VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y3, Y3 + MOVQ 96(R14), BP + VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y4, Y4 + MOVQ 120(R14), BP + VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y5, Y5 + MOVQ 144(R14), BP + VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y6, Y6 + MOVQ 168(R14), BP + VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 
VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y7, Y7 + MOVQ 192(R14), BP + VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 VPXOR Y11, Y12, Y11 VPXOR Y11, Y8, Y8 + MOVQ 216(R14), BP + VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 @@ -35746,8 +70103,8 @@ mulAvxTwo_10x10_loop: // Prepare for next loop ADDQ $0x20, R15 DECQ AX - JNZ mulAvxTwo_10x10_loop + JNZ mulAvxTwo_10x10Xor_loop VZEROUPPER -mulAvxTwo_10x10_end: +mulAvxTwo_10x10Xor_end: RET diff --git a/galois_gen_none.go b/galois_gen_none.go index f11b6ee..303d6a9 100644 --- a/galois_gen_none.go +++ b/galois_gen_none.go @@ -3,10 +3,16 @@ package reedsolomon -const maxAvx2Inputs = 0 -const maxAvx2Outputs = 0 +const maxAvx2Inputs = 1 +const maxAvx2Outputs = 1 +const minAvx2Size = 1 +const avxSizeMask = 0 const avx2CodeGen = false func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { panic("avx2 codegen not available") } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + panic("avx2 codegen not available") +} diff --git a/galois_gen_switch_amd64.go b/galois_gen_switch_amd64.go index a6a8410..3078114 100644 --- a/galois_gen_switch_amd64.go +++ b/galois_gen_switch_amd64.go @@ -5,29 +5,31 @@ package reedsolomon -import "fmt" +import ( + "fmt" +) -const avx2CodeGen = true -const maxAvx2Inputs = 10 -const maxAvx2Outputs = 10 +const ( + avx2CodeGen = true + maxAvx2Inputs = 10 + maxAvx2Outputs = 10 + minAvx2Size = 64 + avxSizeMask = maxInt - (minAvx2Size - 1) +) func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { - n := stop - start - n = (n >> 5) << 5 + n := (stop - start) & avxSizeMask switch len(in) { case 1: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_1x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_1x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_1x3_64(matrix, in, out, start, n) return n case 4: @@ -55,15 +57,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 2: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_2x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_2x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_2x3_64(matrix, in, out, start, n) return n case 4: @@ -91,15 +90,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 3: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_3x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_3x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_3x3_64(matrix, in, out, start, n) return n case 4: @@ -127,15 +123,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 4: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_4x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_4x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_4x3_64(matrix, in, out, start, n) return n case 4: @@ -163,15 +156,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 5: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_5x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_5x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_5x3_64(matrix, in, out, start, n) 
return n case 4: @@ -199,15 +189,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 6: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_6x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_6x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_6x3_64(matrix, in, out, start, n) return n case 4: @@ -235,15 +222,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 7: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_7x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_7x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_7x3_64(matrix, in, out, start, n) return n case 4: @@ -271,15 +255,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 8: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_8x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_8x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_8x3_64(matrix, in, out, start, n) return n case 4: @@ -307,15 +288,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 9: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_9x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_9x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_9x3_64(matrix, in, out, start, n) return n case 4: @@ -343,15 +321,12 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { case 10: switch len(out) { case 1: - n = (n >> 6) << 6 mulAvxTwo_10x1_64(matrix, in, out, start, n) return n case 2: - n = (n >> 6) << 6 mulAvxTwo_10x2_64(matrix, in, out, start, n) return n case 3: - n = (n >> 6) << 6 mulAvxTwo_10x3_64(matrix, in, out, start, n) return n case 4: @@ -379,3 +354,341 @@ func galMulSlicesAvx2(matrix []byte, in, out [][]byte, start, stop int) int { } panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) } + +func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { + n := (stop - start) & avxSizeMask + + switch len(in) { + case 1: + switch len(out) { + case 1: + mulAvxTwo_1x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_1x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_1x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_1x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_1x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_1x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_1x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_1x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_1x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_1x10Xor(matrix, in, out, start, n) + return n + } + case 2: + switch len(out) { + case 1: + mulAvxTwo_2x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_2x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_2x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_2x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_2x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_2x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_2x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_2x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_2x9Xor(matrix, in, out, 
start, n) + return n + case 10: + mulAvxTwo_2x10Xor(matrix, in, out, start, n) + return n + } + case 3: + switch len(out) { + case 1: + mulAvxTwo_3x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_3x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_3x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_3x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_3x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_3x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_3x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_3x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_3x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_3x10Xor(matrix, in, out, start, n) + return n + } + case 4: + switch len(out) { + case 1: + mulAvxTwo_4x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_4x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_4x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_4x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_4x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_4x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_4x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_4x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_4x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_4x10Xor(matrix, in, out, start, n) + return n + } + case 5: + switch len(out) { + case 1: + mulAvxTwo_5x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_5x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_5x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_5x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_5x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_5x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_5x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_5x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_5x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_5x10Xor(matrix, in, out, start, n) + return n + } + case 6: + switch len(out) { + case 1: + mulAvxTwo_6x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_6x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_6x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_6x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_6x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_6x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_6x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_6x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_6x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_6x10Xor(matrix, in, out, start, n) + return n + } + case 7: + switch len(out) { + case 1: + mulAvxTwo_7x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_7x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_7x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_7x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_7x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_7x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_7x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_7x8Xor(matrix, in, out, start, n) + return n + case 9: + 
mulAvxTwo_7x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_7x10Xor(matrix, in, out, start, n) + return n + } + case 8: + switch len(out) { + case 1: + mulAvxTwo_8x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_8x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_8x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_8x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_8x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_8x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_8x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_8x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_8x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_8x10Xor(matrix, in, out, start, n) + return n + } + case 9: + switch len(out) { + case 1: + mulAvxTwo_9x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_9x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_9x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_9x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_9x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_9x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_9x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_9x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_9x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_9x10Xor(matrix, in, out, start, n) + return n + } + case 10: + switch len(out) { + case 1: + mulAvxTwo_10x1_64Xor(matrix, in, out, start, n) + return n + case 2: + mulAvxTwo_10x2_64Xor(matrix, in, out, start, n) + return n + case 3: + mulAvxTwo_10x3_64Xor(matrix, in, out, start, n) + return n + case 4: + mulAvxTwo_10x4Xor(matrix, in, out, start, n) + return n + case 5: + mulAvxTwo_10x5Xor(matrix, in, out, start, n) + return n + case 6: + mulAvxTwo_10x6Xor(matrix, in, out, start, n) + return n + case 7: + mulAvxTwo_10x7Xor(matrix, in, out, start, n) + return n + case 8: + mulAvxTwo_10x8Xor(matrix, in, out, start, n) + return n + case 9: + mulAvxTwo_10x9Xor(matrix, in, out, start, n) + return n + case 10: + mulAvxTwo_10x10Xor(matrix, in, out, start, n) + return n + } + } + panic(fmt.Sprintf("unhandled size: %dx%d", len(in), len(out))) +} diff --git a/galois_notamd64.go b/galois_notamd64.go index e9472f7..e67905b 100644 --- a/galois_notamd64.go +++ b/galois_notamd64.go @@ -5,10 +5,10 @@ package reedsolomon -func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512(matrixRows, inputs, outputs [][]byte, byteCount int) { panic("codeSomeShardsAvx512 should not be called if built without asm") } -func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsAvx512P(matrixRows, inputs, outputs [][]byte, byteCount int) { panic("codeSomeShardsAvx512P should not be called if built without asm") } diff --git a/reedsolomon.go b/reedsolomon.go index 3b1a6a9..8382e56 100644 --- a/reedsolomon.go +++ b/reedsolomon.go @@ -112,6 +112,9 @@ const ( avx2CodeGenMinSize = 64 avx2CodeGenMinShards = 3 avx2CodeGenMaxGoroutines = 8 + + intSize = 32 << (^uint(0) >> 63) // 32 or 64 + maxInt = 1<<(intSize-1) - 1 ) // reedSolomon contains a matrix for a specific @@ -291,6 +294,24 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // 
Calculate what we want per round r.o.perRound = cpuid.CPU.Cache.L2 + + divide := parityShards + 1 + if avx2CodeGen && r.o.useAVX2 && (dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs) { + // Base on L1 cache if we have many inputs. + r.o.perRound = cpuid.CPU.Cache.L1D + divide = 0 + if dataShards > maxAvx2Inputs { + divide += maxAvx2Inputs + } else { + divide += dataShards + } + if parityShards > maxAvx2Inputs { + divide += maxAvx2Outputs + } else { + divide += parityShards + } + } + if r.o.perRound <= 0 { // Set to 128K if undetectable. r.o.perRound = 128 << 10 @@ -300,8 +321,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // If multiple threads per core, make sure they don't contend for cache. r.o.perRound /= cpuid.CPU.ThreadsPerCore } + // 1 input + parity must fit in cache, and we add one more to be safer. - r.o.perRound = r.o.perRound / (1 + parityShards) + r.o.perRound = r.o.perRound / divide // Align to 64 bytes. r.o.perRound = ((r.o.perRound + 63) / 64) * 64 @@ -319,10 +341,6 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } } - if r.o.perRound < r.o.minSplitSize { - r.o.perRound = r.o.minSplitSize - } - if r.o.shardSize > 0 { p := runtime.GOMAXPROCS(0) if p == 1 || r.o.shardSize <= r.o.minSplitSize*2 { @@ -347,7 +365,7 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { // Generated AVX2 does not need data to stay in L1 cache between runs. // We will be purely limited by RAM speed. - if r.canAVX2C(avx2CodeGenMinSize, r.DataShards, r.ParityShards) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { + if r.canAVX2C(avx2CodeGenMinSize, maxAvx2Inputs, maxAvx2Outputs) && r.o.maxGoroutines > avx2CodeGenMaxGoroutines { r.o.maxGoroutines = avx2CodeGenMaxGoroutines } @@ -366,8 +384,9 @@ func New(dataShards, parityShards int, opts ...Option) (Encoder, error) { } if avx2CodeGen && r.o.useAVX2 { + sz := r.DataShards * r.ParityShards * 2 * 32 r.mPool.New = func() interface{} { - return make([]byte, r.Shards*2*32) + return make([]byte, sz) } } return &r, err @@ -398,7 +417,7 @@ func (r *reedSolomon) Encode(shards [][]byte) error { output := shards[r.DataShards:] // Do the coding. - r.codeSomeShards(r.parity, shards[0:r.DataShards], output, r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[0:r.DataShards], output[:r.ParityShards], len(shards[0])) return nil } @@ -558,7 +577,7 @@ func (r *reedSolomon) Verify(shards [][]byte) (bool, error) { toCheck := shards[r.DataShards:] // Do the checking. - return r.checkSomeShards(r.parity, shards[0:r.DataShards], toCheck, r.ParityShards, len(shards[0])), nil + return r.checkSomeShards(r.parity, shards[:r.DataShards], toCheck[:r.ParityShards], len(shards[0])), nil } func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { @@ -576,19 +595,19 @@ func (r *reedSolomon) canAVX2C(byteCount int, inputs, outputs int) bool { // The number of outputs computed, and the // number of matrix rows used, is determined by // outputCount, which is the number of outputs to compute. 
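
Aside: a standalone sketch (not taken from this patch) of the per-round sizing added to New() above. perRoundSketch, the cacheSize parameter and the 256 KiB figure are illustrative only; the real code reads the L1/L2 sizes via cpuid and stores the result in r.o.perRound.

    package main

    import "fmt"

    const (
        maxAvx2Inputs  = 10
        maxAvx2Outputs = 10
    )

    // perRoundSketch mirrors the cache-budget arithmetic: by default one input
    // plus all parity shards must fit in cache; with the tiled AVX2 path only
    // one tile of inputs and outputs has to stay cache-resident.
    func perRoundSketch(cacheSize, dataShards, parityShards int) int {
        divide := parityShards + 1
        if dataShards > maxAvx2Inputs || parityShards > maxAvx2Outputs {
            divide = 0
            if dataShards > maxAvx2Inputs {
                divide += maxAvx2Inputs
            } else {
                divide += dataShards
            }
            if parityShards > maxAvx2Outputs {
                divide += maxAvx2Outputs
            } else {
                divide += parityShards
            }
        }
        perRound := cacheSize / divide
        return ((perRound + 63) / 64) * 64 // align to 64 bytes, as above
    }

    func main() {
        fmt.Println(perRoundSketch(256<<10, 50, 20)) // 50+20 shards, 256 KiB budget
    }
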
-func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, byteCount int) { if len(outputs) == 0 { return } switch { case r.o.useAVX512 && r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize && len(inputs) >= 4 && len(outputs) >= 2: - r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShardsAvx512P(matrixRows, inputs, outputs, byteCount) return case r.o.useAVX512 && len(inputs) >= 4 && len(outputs) >= 2: - r.codeSomeShardsAvx512(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShardsAvx512(matrixRows, inputs, outputs, byteCount) return - case r.o.maxGoroutines > 1 && byteCount > r.o.minSplitSize: - r.codeSomeShardsP(matrixRows, inputs, outputs, outputCount, byteCount) + case byteCount > r.o.minSplitSize: + r.codeSomeShardsP(matrixRows, inputs, outputs, byteCount) return } @@ -598,16 +617,49 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu end = len(inputs[0]) } if r.canAVX2C(byteCount, len(inputs), len(outputs)) { - m := genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + m := genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte)) start += galMulSlicesAvx2(m, inputs, outputs, 0, byteCount) r.mPool.Put(m) end = len(inputs[0]) + } else if len(inputs)+len(outputs) > avx2CodeGenMinShards && r.canAVX2C(byteCount, maxAvx2Inputs, maxAvx2Outputs) { + end = len(inputs[0]) + inIdx := 0 + m := r.mPool.Get().([]byte) + defer r.mPool.Put(m) + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + m = genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), m) + if inIdx == 0 { + galMulSlicesAvx2(m, inPer, outPer, 0, byteCount) + } else { + galMulSlicesAvx2Xor(m, inPer, outPer, 0, byteCount) + } + start = byteCount & avxSizeMask + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + if start >= end { + return + } } - for start < len(inputs[0]) { - for c := 0; c < r.DataShards; c++ { + for c := 0; c < len(inputs); c++ { in := inputs[c][start:end] - for iRow := 0; iRow < outputCount; iRow++ { + for iRow := 0; iRow < len(outputs); iRow++ { if c == 0 { galMulSlice(matrixRows[iRow][c], in, outputs[iRow][start:end], &r.o) } else { @@ -625,15 +677,21 @@ func (r *reedSolomon) codeSomeShards(matrixRows, inputs, outputs [][]byte, outpu // Perform the same as codeSomeShards, but split the workload into // several goroutines. 
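
Aside: a scalar, standalone sketch of the input/output tiling used in the codeSomeShards hunk above. codeTiled and mulGF are illustrative names and mulGF is only a placeholder for the real GF(2^8) table multiply; the point is the overwrite-then-accumulate pattern. The real code gets the same effect by calling galMulSlicesAvx2 for the first input block and galMulSlicesAvx2Xor for later blocks, since XOR is addition in GF(2^8).

    package main

    import "fmt"

    const (
        maxAvx2Inputs  = 10
        maxAvx2Outputs = 10
    )

    // mulGF is a placeholder for the lookup-table GF(2^8) multiply.
    func mulGF(c, v byte) byte { _ = c; return v }

    // codeTiled processes at most maxAvx2Inputs x maxAvx2Outputs shards at a
    // time, so one tile of inputs and outputs can stay in cache.
    func codeTiled(matrix [][]byte, in, out [][]byte) {
        for inIdx := 0; inIdx < len(in); inIdx += maxAvx2Inputs {
            inEnd := inIdx + maxAvx2Inputs
            if inEnd > len(in) {
                inEnd = len(in)
            }
            for outIdx := 0; outIdx < len(out); outIdx += maxAvx2Outputs {
                outEnd := outIdx + maxAvx2Outputs
                if outEnd > len(out) {
                    outEnd = len(out)
                }
                for o := outIdx; o < outEnd; o++ {
                    for i := inIdx; i < inEnd; i++ {
                        c := matrix[o][i]
                        for p, v := range in[i] {
                            if i == 0 {
                                out[o][p] = mulGF(c, v) // very first input: overwrite
                            } else {
                                out[o][p] ^= mulGF(c, v) // later inputs: XOR in
                            }
                        }
                    }
                }
            }
        }
    }

    func main() {
        in := [][]byte{{1, 2, 3, 4}, {5, 6, 7, 8}}
        out := [][]byte{make([]byte, 4)}
        matrix := [][]byte{{1, 1}}
        codeTiled(matrix, in, out)
        fmt.Println(out[0])
    }
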
-func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outputCount, byteCount int) { +func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, byteCount int) { var wg sync.WaitGroup gor := r.o.maxGoroutines var avx2Matrix []byte useAvx2 := r.canAVX2C(byteCount, len(inputs), len(outputs)) if useAvx2 { - avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), len(outputs), r.mPool.Get().([]byte)) + avx2Matrix = genAvx2Matrix(matrixRows, len(inputs), 0, len(outputs), r.mPool.Get().([]byte)) defer r.mPool.Put(avx2Matrix) + } else if byteCount < 10<<20 && len(inputs)+len(outputs) > avx2CodeGenMinShards && + r.canAVX2C(byteCount/4, maxAvx2Inputs, maxAvx2Outputs) { + // It appears there is a switchover point at around 10MB where + // Regular processing is faster... + r.codeSomeShardsAVXP(matrixRows, inputs, outputs, byteCount) + return } do := byteCount / gor @@ -641,6 +699,40 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp do = r.o.minSplitSize } + exec := func(start, stop int) { + if useAvx2 && stop-start >= 64 { + start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + } + + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + for c := 0; c < len(inputs); c++ { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + } + wg.Done() + } + if gor <= 1 { + wg.Add(1) + exec(0, byteCount) + return + } + // Make sizes divisible by 64 do = (do + 63) & (^63) start := 0 @@ -650,34 +742,162 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp } wg.Add(1) - go func(start, stop int) { - if useAvx2 && stop-start >= 64 { - start += galMulSlicesAvx2(avx2Matrix, inputs, outputs, start, stop) + go exec(start, start+do) + start += do + } + wg.Wait() +} + +// Perform the same as codeSomeShards, but split the workload into +// several goroutines. +func (r *reedSolomon) codeSomeShardsAVXP(matrixRows, inputs, outputs [][]byte, byteCount int) { + var wg sync.WaitGroup + gor := r.o.maxGoroutines + + type state struct { + input [][]byte + output [][]byte + m []byte + first bool + } + // Make a plan... + plan := make([]state, 0, ((len(inputs)+maxAvx2Inputs-1)/maxAvx2Inputs)*((len(outputs)+maxAvx2Outputs-1)/maxAvx2Outputs)) + + tmp := r.mPool.Get().([]byte) + defer func(b []byte) { + r.mPool.Put(b) + }(tmp) + + // Flips between input first to output first. + // We put the smallest data load in the inner loop. 
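
Aside: a minimal standalone sketch of the goroutine split used by codeSomeShardsP and codeSomeShardsAVXP above. Each worker gets a chunk rounded up to a multiple of 64 bytes, and the single-goroutine case runs the same closure inline. splitWork and process are illustrative names, not part of the library.

    package main

    import (
        "fmt"
        "sync"
    )

    func splitWork(byteCount, goroutines, minSplit int, process func(start, stop int)) {
        do := byteCount / goroutines
        if do < minSplit {
            do = minSplit
        }
        if goroutines <= 1 {
            process(0, byteCount)
            return
        }
        do = (do + 63) &^ 63 // make chunk sizes divisible by 64
        var wg sync.WaitGroup
        for start := 0; start < byteCount; start += do {
            stop := start + do
            if stop > byteCount {
                stop = byteCount
            }
            wg.Add(1)
            go func(start, stop int) {
                defer wg.Done()
                process(start, stop)
            }(start, stop)
        }
        wg.Wait()
    }

    func main() {
        splitWork(1000, 4, 64, func(start, stop int) {
            fmt.Println("range", start, stop)
        })
    }
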
+ if len(inputs) > len(outputs) { + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0, + }) + outIdx += len(outPer) + outs = outs[len(outPer):] + } + inIdx += len(inPer) + ins = ins[len(inPer):] + } + } else { + outs := outputs + outIdx := 0 + for len(outs) > 0 { + outPer := outs + if len(outPer) > maxAvx2Outputs { + outPer = outPer[:maxAvx2Outputs] } - lstart, lstop := start, start+r.o.perRound + inIdx := 0 + ins := inputs + for len(ins) > 0 { + inPer := ins + if len(inPer) > maxAvx2Inputs { + inPer = inPer[:maxAvx2Inputs] + } + // Generate local matrix + m := genAvx2Matrix(matrixRows[outIdx:], len(inPer), inIdx, len(outPer), tmp) + tmp = tmp[len(m):] + //fmt.Println("bytes:", len(inPer)*r.o.perRound, "out:", len(outPer)*r.o.perRound) + plan = append(plan, state{ + input: inPer, + output: outPer, + m: m, + first: inIdx == 0, + }) + inIdx += len(inPer) + ins = ins[len(inPer):] + } + outIdx += len(outPer) + outs = outs[len(outPer):] + } + } + + do := byteCount / gor + if do < r.o.minSplitSize { + do = r.o.minSplitSize + } + + exec := func(start, stop int) { + lstart, lstop := start, start+r.o.perRound + if lstop > stop { + lstop = stop + } + for lstart < stop { + if lstop-lstart >= minAvx2Size { + // Execute plan... + for _, p := range plan { + if p.first { + galMulSlicesAvx2(p.m, p.input, p.output, lstart, lstop) + } else { + galMulSlicesAvx2Xor(p.m, p.input, p.output, lstart, lstop) + } + } + lstart += (lstop - lstart) & avxSizeMask + if lstart == lstop { + lstop += r.o.perRound + if lstop > stop { + lstop = stop + } + continue + } + } + + for c := range inputs { + in := inputs[c][lstart:lstop] + for iRow := 0; iRow < len(outputs); iRow++ { + if c == 0 { + galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } else { + galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) + } + } + } + lstart = lstop + lstop += r.o.perRound if lstop > stop { lstop = stop } - for lstart < stop { - for c := 0; c < r.DataShards; c++ { - in := inputs[c][lstart:lstop] - for iRow := 0; iRow < outputCount; iRow++ { - if c == 0 { - galMulSlice(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) - } else { - galMulSliceXor(matrixRows[iRow][c], in, outputs[iRow][lstart:lstop], &r.o) - } - } - } - lstart = lstop - lstop += r.o.perRound - if lstop > stop { - lstop = stop - } - } - wg.Done() - }(start, start+do) + } + wg.Done() + } + if gor == 1 { + wg.Add(1) + exec(0, byteCount) + return + } + + // Make sizes divisible by 64 + do = (do + 63) & (^63) + start := 0 + for start < byteCount { + if start+do > byteCount { + do = byteCount - start + } + + wg.Add(1) + go exec(start, start+do) start += do } wg.Wait() @@ -686,7 +906,7 @@ func (r *reedSolomon) codeSomeShardsP(matrixRows, inputs, outputs [][]byte, outp // checkSomeShards is mostly the same as codeSomeShards, // except this will check values and return // as soon as a difference is found. 
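
Aside: avxSizeMask (defined by the generator earlier in this diff) rounds a non-negative length down to a multiple of minAvx2Size; whatever is left over falls through to the generic byte-wise loop, which is what the `lstart += (lstop - lstart) & avxSizeMask` step above relies on. A tiny self-contained illustration; the maxInt definition here is an equivalent form of the intSize/maxInt constants added to reedsolomon.go.

    package main

    import "fmt"

    const (
        minAvx2Size = 64
        maxInt      = int(^uint(0) >> 1)
        avxSizeMask = maxInt - (minAvx2Size - 1) // clears the low 6 bits of a length
    )

    func main() {
        start, stop := 0, 1000
        n := (stop - start) & avxSizeMask // 960 bytes for the AVX2 kernels
        fmt.Println(n, (stop-start)-n)    // 40 bytes left for the generic loop
    }
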
-func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outputCount, byteCount int) bool { +func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, byteCount int) bool { if len(toCheck) == 0 { return true } @@ -695,7 +915,7 @@ func (r *reedSolomon) checkSomeShards(matrixRows, inputs, toCheck [][]byte, outp for i := range outputs { outputs[i] = make([]byte, byteCount) } - r.codeSomeShards(matrixRows, inputs, outputs, outputCount, byteCount) + r.codeSomeShards(matrixRows, inputs, outputs, byteCount) for i, calc := range outputs { if !bytes.Equal(calc, toCheck[i]) { @@ -902,7 +1122,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { outputCount++ } } - r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], outputCount, shardSize) + r.codeSomeShards(matrixRows, subShards, outputs[:outputCount], shardSize) if dataOnly { // Exit out early if we are only interested in the data shards @@ -928,7 +1148,7 @@ func (r *reedSolomon) reconstruct(shards [][]byte, dataOnly bool) error { outputCount++ } } - r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], outputCount, shardSize) + r.codeSomeShards(matrixRows, shards[:r.DataShards], outputs[:outputCount], shardSize) return nil } diff --git a/reedsolomon_test.go b/reedsolomon_test.go index ce5d176..342cf6b 100644 --- a/reedsolomon_test.go +++ b/reedsolomon_test.go @@ -191,7 +191,7 @@ func TestEncoding(t *testing.T) { // note that par1 matric will fail on some combinations. var testSizes = [][2]int{ {1, 0}, {3, 0}, {5, 0}, {8, 0}, {10, 0}, {12, 0}, {14, 0}, {41, 0}, {49, 0}, - {1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}} + {1, 1}, {1, 2}, {3, 3}, {3, 1}, {5, 3}, {8, 4}, {10, 30}, {12, 10}, {14, 7}, {41, 17}, {49, 1}, {5, 20}} var testDataSizes = []int{10, 100, 1000, 10001, 100003, 1000055} var testDataSizesShort = []int{10, 10001, 100003} @@ -893,6 +893,7 @@ func benchmarkEncode(b *testing.B, dataShards, parityShards, shardSize int) { b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() + b.ReportAllocs() for i := 0; i < b.N; i++ { err = r.Encode(shards) if err != nil { @@ -937,7 +938,7 @@ func BenchmarkEncode10x4x1M(b *testing.B) { benchmarkEncode(b, 10, 4, 1024*1024) } -// Benchmark 50 data shards and 20 parity shards with 1MB each. +// Benchmark 50 data shards and 20 parity shards with 1M each. 
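
Aside: the verification strategy in checkSomeShards above is simply "re-encode into scratch buffers and compare". A reduced standalone version of that idea; verifySketch and recompute are illustrative stand-ins for Verify and codeSomeShards, and the XOR parity in main is only a toy example.

    package main

    import (
        "bytes"
        "fmt"
    )

    // verifySketch re-derives each parity shard into a scratch buffer and
    // compares it with the stored one.
    func verifySketch(data, parity [][]byte, recompute func(data, out [][]byte)) bool {
        if len(parity) == 0 {
            return true
        }
        scratch := make([][]byte, len(parity))
        for i := range scratch {
            scratch[i] = make([]byte, len(parity[i]))
        }
        recompute(data, scratch)
        for i := range parity {
            if !bytes.Equal(scratch[i], parity[i]) {
                return false
            }
        }
        return true
    }

    func main() {
        data := [][]byte{{1, 2}, {3, 4}}
        parity := [][]byte{{2, 6}} // XOR of the two data shards
        xorParity := func(data, out [][]byte) {
            for p := range out[0] {
                out[0][p] = data[0][p] ^ data[1][p]
            }
        }
        fmt.Println(verifySketch(data, parity, xorParity)) // true
    }
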
func BenchmarkEncode50x20x1M(b *testing.B) { benchmarkEncode(b, 50, 20, 1024*1024) } @@ -989,6 +990,7 @@ func benchmarkVerify(b *testing.B, dataShards, parityShards, shardSize int) { b.SetBytes(int64(shardSize * (dataShards + parityShards))) b.ResetTimer() + b.ReportAllocs() for i := 0; i < b.N; i++ { _, err = r.Verify(shards) if err != nil { @@ -1003,7 +1005,7 @@ func BenchmarkVerify10x2x10000(b *testing.B) { } // Benchmark 50 data slices with 5 parity slices holding 100000 bytes each -func BenchmarkVerify50x5x50000(b *testing.B) { +func BenchmarkVerify50x5x100000(b *testing.B) { benchmarkVerify(b, 50, 5, 100000) } @@ -1359,11 +1361,11 @@ func TestCodeSomeShards(t *testing.T) { shards, _ := enc.Split(data) old := runtime.GOMAXPROCS(1) - r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) // hopefully more than 1 CPU runtime.GOMAXPROCS(runtime.NumCPU()) - r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:], r.ParityShards, len(shards[0])) + r.codeSomeShards(r.parity, shards[:r.DataShards], shards[r.DataShards:r.DataShards+r.ParityShards], len(shards[0])) // reset MAXPROCS, otherwise testing complains runtime.GOMAXPROCS(old) @@ -1642,7 +1644,9 @@ func benchmarkParallel(b *testing.B, dataShards, parityShards, shardSize int) { c := runtime.GOMAXPROCS(0) // Note that concurrency also affects total data size and will make caches less effective. - b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB") + if testing.Verbose() { + b.Log("Total data:", (c*dataShards*shardSize)>>20, "MiB", "parity:", (c*parityShards*shardSize)>>20, "MiB") + } // Create independent shards shardsCh := make(chan [][]byte, c) for i := 0; i < c; i++ {
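
Aside: for orientation, the benchmarks and tests touched above exercise the package's public API roughly as below. The module path is assumed to be github.com/klauspost/reedsolomon, error handling is abbreviated, and the shard counts and data size are arbitrary.

    package main

    import (
        "fmt"

        "github.com/klauspost/reedsolomon"
    )

    func main() {
        enc, err := reedsolomon.New(10, 4) // 10 data shards, 4 parity shards
        if err != nil {
            panic(err)
        }
        data := make([]byte, 1<<20)
        shards, err := enc.Split(data) // 14 shards: data first, then parity
        if err != nil {
            panic(err)
        }
        if err := enc.Encode(shards); err != nil { // fills the 4 parity shards
            panic(err)
        }
        ok, err := enc.Verify(shards) // recomputes parity and compares
        fmt.Println(ok, err)
    }
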