From 2f19c81be4b54c72ebb6b67c47663e012f4a9c12 Mon Sep 17 00:00:00 2001 From: Klaus Post Date: Thu, 24 Mar 2022 05:25:40 -0700 Subject: [PATCH] Reduce generated code (#185) * Reduce generated code Use a define (with hacks) --- .github/workflows/go.yml | 1 + _gen/cleanup.go | 14 +- _gen/gen.go | 20 +- galois_gen_amd64.go | 2 + galois_gen_amd64.s | 65114 ++++--------------------------------- 5 files changed, 6743 insertions(+), 58408 deletions(-) diff --git a/.github/workflows/go.yml b/.github/workflows/go.yml index 5451542..7a95a6b 100644 --- a/.github/workflows/go.yml +++ b/.github/workflows/go.yml @@ -87,4 +87,5 @@ jobs: run: go test -no-avx512 -no-avx2 -no-ssse3 -short -race . - name: Test Microarch v4 + shell: bash {0} run: go run testlevel.go 4;if [ $? -eq 0 ]; then GOAMD64=v4 go test -no-avx512 ./...; else true; fi diff --git a/_gen/cleanup.go b/_gen/cleanup.go index bcd7964..6dd2a30 100644 --- a/_gen/cleanup.go +++ b/_gen/cleanup.go @@ -21,7 +21,19 @@ func main() { if err != nil { log.Fatalln(err) } - data = bytes.Replace(data, []byte("\t// #"), []byte("#"), -1) + data = bytes.ReplaceAll(data, []byte("\t// #"), []byte("#")) + data = bytes.ReplaceAll(data, []byte("\t// @"), []byte("")) + data = bytes.ReplaceAll(data, []byte("VPTERNLOGQ"), []byte("XOR3WAY(")) + split := bytes.Split(data, []byte("\n")) + // Add closing ')' + want := []byte("\tXOR3WAY(") + for i, b := range split { + if bytes.Contains(b, want) { + b = []byte(string(b) + ")") + split[i] = b + } + } + data = bytes.Join(split, []byte("\n")) data, err = asmfmt.Format(bytes.NewBuffer(data)) if err != nil { log.Fatalln(err) diff --git a/_gen/gen.go b/_gen/gen.go index 40f1227..50b8043 100644 --- a/_gen/gen.go +++ b/_gen/gen.go @@ -38,6 +38,17 @@ func main() { Constraint(buildtags.Not("nogen").ToConstraint()) Constraint(buildtags.Term("gc").ToConstraint()) + TEXT("_dummy_", 0, "func()") + Comment("#ifdef GOAMD64_v4") + Comment("#define XOR3WAY(ignore, a, b, dst)\\") + Comment("@\tVPTERNLOGD $0x96, a, b, dst") + Comment("#else") + Comment("#define XOR3WAY(ignore, a, b, dst)\\") + Comment("@\tVPXOR a, dst, dst\\") + Comment("@\tVPXOR b, dst, dst") + Comment("#endif") + RET() + const perLoopBits = 6 const perLoop = 1 << perLoopBits @@ -123,13 +134,8 @@ func galMulSlicesAvx2Xor(matrix []byte, in, out [][]byte, start, stop int) int { // VPXOR3way will 3-way xor a and b and dst. func VPXOR3way(a, b, dst reg.VecVirtual) { - Comment("#ifdef GOAMD64_v4") - // AVX512F and AVX512VL required - VPTERNLOGD(U8(0x96), a, b, dst) - Comment("#else") - VPXOR(a, dst, dst) // dst = a^dst - VPXOR(b, dst, dst) // dst = (a^dst)^b - Comment("#endif") + // VPTERNLOGQ is replaced by XOR3WAY - we just use an equivalent operation + VPTERNLOGQ(U8(0), a, b, dst) } func genMulAvx2(name string, inputs int, outputs int, xor bool) { diff --git a/galois_gen_amd64.go b/galois_gen_amd64.go index 817c7ea..72c4ca4 100644 --- a/galois_gen_amd64.go +++ b/galois_gen_amd64.go @@ -5,6 +5,8 @@ package reedsolomon +func _dummy_() + // mulAvxTwo_1x1 takes 1 inputs and produces 1 outputs. // The output is initialized to 0. //go:noescape diff --git a/galois_gen_amd64.s b/galois_gen_amd64.s index 5d24082..97ead9c 100644 --- a/galois_gen_amd64.s +++ b/galois_gen_amd64.s @@ -5,6 +5,20 @@ #include "textflag.h" +// func _dummy_() +TEXT ·_dummy_(SB), $0 +#ifdef GOAMD64_v4 +#define XOR3WAY(ignore, a, b, dst) \ + VPTERNLOGD $0x96, a, b, dst + +#else +#define XOR3WAY(ignore, a, b, dst) \ + VPXOR a, dst, dst \ + VPXOR b, dst, dst + +#endif + RET + // func mulAvxTwo_1x1(matrix []byte, in [][]byte, out [][]byte, start int, n int) // Requires: AVX, AVX2, SSE2 TEXT ·mulAvxTwo_1x1(SB), NOSPLIT, $0-88 @@ -154,15 +168,8 @@ mulAvxTwo_1x1Xor_loop: VMOVDQU (DX), Y2 VPSHUFB Y4, Y0, Y4 VPSHUFB Y5, Y1, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 1 outputs VMOVDQU Y2, (DX) ADDQ $0x20, DX @@ -223,23 +230,9 @@ mulAvxTwo_1x1_64Xor_loop: VPSHUFB Y7, Y0, Y7 VPSHUFB Y6, Y1, Y6 VPSHUFB Y8, Y1, Y8 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 1 outputs VMOVDQU Y2, (DX) VMOVDQU Y3, 32(DX) @@ -428,27 +421,12 @@ mulAvxTwo_1x2Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y9, Y0, Y7 VPSHUFB Y10, Y1, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (DX), Y5 VPSHUFB Y9, Y2, Y7 VPSHUFB Y10, Y3, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 2 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -515,46 +493,17 @@ mulAvxTwo_1x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) @@ -772,39 +721,16 @@ mulAvxTwo_1x3Xor_loop: VMOVDQU (BX), Y6 VPSHUFB Y12, Y0, Y10 VPSHUFB Y13, Y1, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (SI), Y7 VPSHUFB Y12, Y2, Y10 VPSHUFB Y13, Y3, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (DX), Y8 VPSHUFB Y12, Y4, Y10 VPSHUFB Y13, Y5, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 3 outputs VMOVDQU Y6, (BX) ADDQ $0x20, BX @@ -877,69 +803,25 @@ mulAvxTwo_1x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (SI) VMOVDQU Y1, 32(SI) @@ -1081,57 +963,26 @@ mulAvxTwo_1x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (BX), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1283,71 +1134,32 @@ mulAvxTwo_1x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (BX), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1512,85 +1324,38 @@ mulAvxTwo_1x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (BX), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -1768,99 +1533,44 @@ mulAvxTwo_1x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (BX), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2051,113 +1761,50 @@ mulAvxTwo_1x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (BX), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2361,127 +2008,56 @@ mulAvxTwo_1x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (BX), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2698,141 +2274,62 @@ mulAvxTwo_1x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R11), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R12), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R13), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R14), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (BX), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs VMOVDQU Y0, (SI) ADDQ $0x20, SI @@ -2914,15 +2411,8 @@ mulAvxTwo_2x1_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 1 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -3000,23 +2490,9 @@ mulAvxTwo_2x1_64_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) @@ -3072,15 +2548,8 @@ mulAvxTwo_2x1Xor_loop: VMOVDQU (BX), Y4 VPSHUFB Y6, Y0, Y6 VPSHUFB Y7, Y1, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (CX), Y6 ADDQ $0x20, CX @@ -3089,15 +2558,8 @@ mulAvxTwo_2x1Xor_loop: VPAND Y5, Y7, Y7 VPSHUFB Y6, Y2, Y6 VPSHUFB Y7, Y3, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 1 outputs VMOVDQU Y4, (BX) ADDQ $0x20, BX @@ -3162,23 +2624,9 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y0, Y9 VPSHUFB Y8, Y1, Y8 VPSHUFB Y10, Y1, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (CX), Y7 VMOVDQU 32(CX), Y9 @@ -3193,23 +2641,9 @@ mulAvxTwo_2x1_64Xor_loop: VPSHUFB Y9, Y2, Y9 VPSHUFB Y8, Y3, Y8 VPSHUFB Y10, Y3, Y10 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 1 outputs VMOVDQU Y4, (BX) VMOVDQU Y5, 32(BX) @@ -3283,26 +2717,11 @@ mulAvxTwo_2x2_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 2 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI @@ -3392,46 +2811,17 @@ mulAvxTwo_2x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -3496,27 +2886,12 @@ mulAvxTwo_2x2Xor_loop: VMOVDQU (SI), Y8 VPSHUFB Y13, Y0, Y11 VPSHUFB Y14, Y1, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (BX), Y9 VPSHUFB Y13, Y2, Y11 VPSHUFB Y14, Y3, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (CX), Y13 ADDQ $0x20, CX @@ -3525,26 +2900,11 @@ mulAvxTwo_2x2Xor_loop: VPAND Y10, Y14, Y14 VPSHUFB Y13, Y4, Y11 VPSHUFB Y14, Y5, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VPSHUFB Y13, Y6, Y11 VPSHUFB Y14, Y7, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 2 outputs VMOVDQU Y8, (SI) ADDQ $0x20, SI @@ -3613,46 +2973,17 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -3669,46 +3000,17 @@ mulAvxTwo_2x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -3790,41 +3092,18 @@ mulAvxTwo_2x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -3926,69 +3205,25 @@ mulAvxTwo_2x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -4052,43 +3287,20 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (SI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -4099,41 +3311,18 @@ mulAvxTwo_2x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4208,69 +3397,25 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -4287,69 +3432,25 @@ mulAvxTwo_2x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -4441,54 +3542,23 @@ mulAvxTwo_2x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4553,57 +3623,26 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (SI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -4614,54 +3653,23 @@ mulAvxTwo_2x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4759,67 +3767,28 @@ mulAvxTwo_2x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -4888,71 +3857,32 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (SI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -4963,67 +3893,28 @@ mulAvxTwo_2x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5130,80 +4021,33 @@ mulAvxTwo_2x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5276,85 +4120,38 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (SI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -5365,80 +4162,33 @@ mulAvxTwo_2x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5554,93 +4304,38 @@ mulAvxTwo_2x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -5717,99 +4412,44 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (SI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -5820,93 +4460,38 @@ mulAvxTwo_2x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -6031,106 +4616,43 @@ mulAvxTwo_2x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -6211,113 +4733,50 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (SI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -6328,106 +4787,43 @@ mulAvxTwo_2x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -6561,119 +4957,48 @@ mulAvxTwo_2x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -6758,127 +5083,56 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (SI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -6889,119 +5143,48 @@ mulAvxTwo_2x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -7144,132 +5327,53 @@ mulAvxTwo_2x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -7358,141 +5462,62 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -7503,132 +5528,53 @@ mulAvxTwo_2x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -7714,15 +5660,8 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX @@ -7731,15 +5670,8 @@ mulAvxTwo_3x1_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 1 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI @@ -7819,23 +5751,9 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -7852,23 +5770,9 @@ mulAvxTwo_3x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -7928,15 +5832,8 @@ mulAvxTwo_3x1Xor_loop: VMOVDQU (SI), Y6 VPSHUFB Y8, Y0, Y8 VPSHUFB Y9, Y1, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -7945,15 +5842,8 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y2, Y8 VPSHUFB Y9, Y3, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (CX), Y8 ADDQ $0x20, CX @@ -7962,15 +5852,8 @@ mulAvxTwo_3x1Xor_loop: VPAND Y7, Y9, Y9 VPSHUFB Y8, Y4, Y8 VPSHUFB Y9, Y5, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 1 outputs VMOVDQU Y6, (SI) ADDQ $0x20, SI @@ -8035,23 +5918,9 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -8068,23 +5937,9 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -8101,23 +5956,9 @@ mulAvxTwo_3x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (DI) VMOVDQU Y1, 32(DI) @@ -8191,28 +6032,13 @@ mulAvxTwo_3x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -8223,28 +6049,13 @@ mulAvxTwo_3x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8336,46 +6147,17 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -8392,46 +6174,17 @@ mulAvxTwo_3x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -8492,29 +6245,14 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (DI), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -8525,28 +6263,13 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -8557,28 +6280,13 @@ mulAvxTwo_3x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -8649,46 +6357,17 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -8705,46 +6384,17 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -8761,46 +6411,17 @@ mulAvxTwo_3x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -8884,41 +6505,18 @@ mulAvxTwo_3x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -8929,41 +6527,18 @@ mulAvxTwo_3x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9067,69 +6642,25 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -9146,69 +6677,25 @@ mulAvxTwo_3x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -9274,43 +6761,20 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (DI), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -9321,41 +6785,18 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -9366,41 +6807,18 @@ mulAvxTwo_3x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9477,69 +6895,25 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -9556,69 +6930,25 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -9635,69 +6965,25 @@ mulAvxTwo_3x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -9791,54 +7077,23 @@ mulAvxTwo_3x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -9849,54 +7104,23 @@ mulAvxTwo_3x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -9963,57 +7187,26 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (DI), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -10024,54 +7217,23 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -10082,54 +7244,23 @@ mulAvxTwo_3x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -10229,67 +7360,28 @@ mulAvxTwo_3x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -10300,67 +7392,28 @@ mulAvxTwo_3x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -10431,71 +7484,32 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (DI), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -10506,67 +7520,28 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -10577,67 +7552,28 @@ mulAvxTwo_3x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -10746,80 +7682,33 @@ mulAvxTwo_3x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -10830,80 +7719,33 @@ mulAvxTwo_3x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -10978,85 +7820,38 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (DI), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -11067,80 +7862,33 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -11151,80 +7899,33 @@ mulAvxTwo_3x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -11342,93 +8043,38 @@ mulAvxTwo_3x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -11439,93 +8085,38 @@ mulAvxTwo_3x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -11604,99 +8195,44 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (DI), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -11707,93 +8243,38 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -11804,93 +8285,38 @@ mulAvxTwo_3x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -12017,106 +8443,43 @@ mulAvxTwo_3x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -12127,106 +8490,43 @@ mulAvxTwo_3x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -12309,113 +8609,50 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (DI), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -12426,106 +8663,43 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -12536,106 +8710,43 @@ mulAvxTwo_3x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -12771,119 +8882,48 @@ mulAvxTwo_3x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -12894,119 +8934,48 @@ mulAvxTwo_3x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -13093,127 +9062,56 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -13224,119 +9122,48 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -13347,119 +9174,48 @@ mulAvxTwo_3x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -13606,132 +9362,53 @@ mulAvxTwo_3x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX @@ -13742,132 +9419,53 @@ mulAvxTwo_3x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -13960,141 +9558,62 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU (R12), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU (R13), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU (R14), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU (R15), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU (SI), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (BX), Y13 ADDQ $0x20, BX @@ -14105,132 +9624,53 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (AX), Y13 ADDQ $0x20, AX @@ -14241,132 +9681,53 @@ mulAvxTwo_3x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs VMOVDQU Y0, (DI) ADDQ $0x20, DI @@ -14456,15 +9817,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -14473,15 +9827,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX @@ -14490,15 +9837,8 @@ mulAvxTwo_4x1_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 1 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI @@ -14580,23 +9920,9 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -14613,23 +9939,9 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -14646,23 +9958,9 @@ mulAvxTwo_4x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -14726,15 +10024,8 @@ mulAvxTwo_4x1Xor_loop: VMOVDQU (DI), Y8 VPSHUFB Y10, Y0, Y10 VPSHUFB Y11, Y1, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -14743,15 +10034,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y2, Y10 VPSHUFB Y11, Y3, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -14760,15 +10044,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y4, Y10 VPSHUFB Y11, Y5, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (CX), Y10 ADDQ $0x20, CX @@ -14777,15 +10054,8 @@ mulAvxTwo_4x1Xor_loop: VPAND Y9, Y11, Y11 VPSHUFB Y10, Y6, Y10 VPSHUFB Y11, Y7, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 1 outputs VMOVDQU Y8, (DI) ADDQ $0x20, DI @@ -14852,23 +10122,9 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -14885,23 +10141,9 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -14918,23 +10160,9 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -14951,23 +10179,9 @@ mulAvxTwo_4x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R8) VMOVDQU Y1, 32(R8) @@ -15043,28 +10257,13 @@ mulAvxTwo_4x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -15075,28 +10274,13 @@ mulAvxTwo_4x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -15107,28 +10291,13 @@ mulAvxTwo_4x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -15222,46 +10391,17 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -15278,46 +10418,17 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -15334,46 +10445,17 @@ mulAvxTwo_4x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -15436,29 +10518,14 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R8), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -15469,28 +10536,13 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -15501,28 +10553,13 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -15533,28 +10570,13 @@ mulAvxTwo_4x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -15627,46 +10649,17 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -15683,46 +10676,17 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -15739,46 +10703,17 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -15795,46 +10730,17 @@ mulAvxTwo_4x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -15920,41 +10826,18 @@ mulAvxTwo_4x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -15965,41 +10848,18 @@ mulAvxTwo_4x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -16010,41 +10870,18 @@ mulAvxTwo_4x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -16150,69 +10987,25 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -16229,69 +11022,25 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -16308,69 +11057,25 @@ mulAvxTwo_4x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -16438,43 +11143,20 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R8), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -16485,41 +11167,18 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -16530,41 +11189,18 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -16575,41 +11211,18 @@ mulAvxTwo_4x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -16688,69 +11301,25 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -16767,69 +11336,25 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -16846,69 +11371,25 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -16925,69 +11406,25 @@ mulAvxTwo_4x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -17083,54 +11520,23 @@ mulAvxTwo_4x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -17141,54 +11547,23 @@ mulAvxTwo_4x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -17199,54 +11574,23 @@ mulAvxTwo_4x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -17315,57 +11659,26 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R8), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -17376,54 +11689,23 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -17434,54 +11716,23 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -17492,54 +11743,23 @@ mulAvxTwo_4x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -17641,67 +11861,28 @@ mulAvxTwo_4x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -17712,67 +11893,28 @@ mulAvxTwo_4x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -17783,67 +11925,28 @@ mulAvxTwo_4x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -17916,71 +12019,32 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R8), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -17991,67 +12055,28 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -18062,67 +12087,28 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -18133,67 +12119,28 @@ mulAvxTwo_4x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -18304,80 +12251,33 @@ mulAvxTwo_4x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -18388,80 +12288,33 @@ mulAvxTwo_4x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -18472,80 +12325,33 @@ mulAvxTwo_4x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -18622,85 +12428,38 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R8), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -18711,80 +12470,33 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -18795,80 +12507,33 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -18879,80 +12544,33 @@ mulAvxTwo_4x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -19072,93 +12690,38 @@ mulAvxTwo_4x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -19169,93 +12732,38 @@ mulAvxTwo_4x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -19266,93 +12774,38 @@ mulAvxTwo_4x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -19433,99 +12886,44 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (R8), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -19536,93 +12934,38 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -19633,93 +12976,38 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -19730,93 +13018,38 @@ mulAvxTwo_4x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -19945,106 +13178,43 @@ mulAvxTwo_4x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -20055,106 +13225,43 @@ mulAvxTwo_4x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -20165,106 +13272,43 @@ mulAvxTwo_4x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -20349,113 +13393,50 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -20466,106 +13447,43 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -20576,106 +13494,43 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -20686,106 +13541,43 @@ mulAvxTwo_4x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -20925,119 +13717,48 @@ mulAvxTwo_4x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -21048,119 +13769,48 @@ mulAvxTwo_4x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX @@ -21171,119 +13821,48 @@ mulAvxTwo_4x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -21374,127 +13953,56 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU (R12), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU (R13), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU (R14), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU (R15), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU (DI), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -21505,119 +14013,48 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -21628,119 +14065,48 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (AX), Y12 ADDQ $0x20, AX @@ -21751,119 +14117,48 @@ mulAvxTwo_4x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs VMOVDQU Y0, (R8) ADDQ $0x20, R8 @@ -21988,132 +14283,53 @@ mulAvxTwo_4x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -22124,132 +14340,53 @@ mulAvxTwo_4x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -22260,132 +14397,53 @@ mulAvxTwo_4x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) @@ -22458,150 +14516,71 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R8), R10 VMOVDQU (R10)(R9*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R8), R10 VMOVDQU (R10)(R9*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R8), R10 VMOVDQU (R10)(R9*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R8), R10 VMOVDQU (R10)(R9*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R8), R10 VMOVDQU (R10)(R9*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R8), R10 VMOVDQU (R10)(R9*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R8), R10 VMOVDQU (R10)(R9*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R8), R10 VMOVDQU (R10)(R9*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R8), R10 VMOVDQU (R10)(R9*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -22612,132 +14591,53 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -22748,132 +14648,53 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -22884,132 +14705,53 @@ mulAvxTwo_4x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R8), R10 VMOVDQU Y0, (R10)(R9*1) @@ -23104,15 +14846,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -23121,15 +14856,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -23138,15 +14866,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX @@ -23155,15 +14876,8 @@ mulAvxTwo_5x1_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Store 1 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 @@ -23247,23 +14961,9 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -23280,23 +14980,9 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -23313,23 +14999,9 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -23346,23 +15018,9 @@ mulAvxTwo_5x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -23430,15 +15088,8 @@ mulAvxTwo_5x1Xor_loop: VMOVDQU (R8), Y10 VPSHUFB Y12, Y0, Y12 VPSHUFB Y13, Y1, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y12 ADDQ $0x20, BX @@ -23447,15 +15098,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y2, Y12 VPSHUFB Y13, Y3, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -23464,15 +15108,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y4, Y12 VPSHUFB Y13, Y5, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -23481,15 +15118,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y6, Y12 VPSHUFB Y13, Y7, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (CX), Y12 ADDQ $0x20, CX @@ -23498,15 +15128,8 @@ mulAvxTwo_5x1Xor_loop: VPAND Y11, Y13, Y13 VPSHUFB Y12, Y8, Y12 VPSHUFB Y13, Y9, Y13 + XOR3WAY( $0x00, Y12, Y13, Y10) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y12, Y13, Y10 - -#else - VPXOR Y12, Y10, Y10 - VPXOR Y13, Y10, Y10 - -#endif // Store 1 outputs VMOVDQU Y10, (R8) ADDQ $0x20, R8 @@ -23575,23 +15198,9 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -23608,23 +15217,9 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -23641,23 +15236,9 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -23674,23 +15255,9 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -23707,23 +15274,9 @@ mulAvxTwo_5x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R9) VMOVDQU Y1, 32(R9) @@ -23801,28 +15354,13 @@ mulAvxTwo_5x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -23833,28 +15371,13 @@ mulAvxTwo_5x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -23865,28 +15388,13 @@ mulAvxTwo_5x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -23897,28 +15405,13 @@ mulAvxTwo_5x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -24014,46 +15507,17 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -24070,46 +15534,17 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -24126,46 +15561,17 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -24182,46 +15588,17 @@ mulAvxTwo_5x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -24286,29 +15663,14 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R9), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -24319,28 +15681,13 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -24351,28 +15698,13 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -24383,28 +15715,13 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -24415,28 +15732,13 @@ mulAvxTwo_5x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -24511,46 +15813,17 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -24567,46 +15840,17 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -24623,46 +15867,17 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -24679,46 +15894,17 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -24735,46 +15921,17 @@ mulAvxTwo_5x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -24862,41 +16019,18 @@ mulAvxTwo_5x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -24907,41 +16041,18 @@ mulAvxTwo_5x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -24952,41 +16063,18 @@ mulAvxTwo_5x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -24997,41 +16085,18 @@ mulAvxTwo_5x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -25139,69 +16204,25 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -25218,69 +16239,25 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -25297,69 +16274,25 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -25376,69 +16309,25 @@ mulAvxTwo_5x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -25508,43 +16397,20 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R9), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -25555,41 +16421,18 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -25600,41 +16443,18 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -25645,41 +16465,18 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -25690,41 +16487,18 @@ mulAvxTwo_5x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -25805,69 +16579,25 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -25884,69 +16614,25 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -25963,69 +16649,25 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -26042,69 +16684,25 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -26121,69 +16719,25 @@ mulAvxTwo_5x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -26281,54 +16835,23 @@ mulAvxTwo_5x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -26339,54 +16862,23 @@ mulAvxTwo_5x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -26397,54 +16889,23 @@ mulAvxTwo_5x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -26455,54 +16916,23 @@ mulAvxTwo_5x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -26573,57 +17003,26 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R9), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -26634,54 +17033,23 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -26692,54 +17060,23 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -26750,54 +17087,23 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -26808,54 +17114,23 @@ mulAvxTwo_5x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -26959,67 +17234,28 @@ mulAvxTwo_5x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -27030,67 +17266,28 @@ mulAvxTwo_5x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -27101,67 +17298,28 @@ mulAvxTwo_5x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -27172,67 +17330,28 @@ mulAvxTwo_5x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -27307,71 +17426,32 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R9), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -27382,67 +17462,28 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -27453,67 +17494,28 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -27524,67 +17526,28 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -27595,67 +17558,28 @@ mulAvxTwo_5x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -27768,80 +17692,33 @@ mulAvxTwo_5x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -27852,80 +17729,33 @@ mulAvxTwo_5x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -27936,80 +17766,33 @@ mulAvxTwo_5x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -28020,80 +17803,33 @@ mulAvxTwo_5x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -28172,85 +17908,38 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R9), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -28261,80 +17950,33 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -28345,80 +17987,33 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -28429,80 +18024,33 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -28513,80 +18061,33 @@ mulAvxTwo_5x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -28708,93 +18209,38 @@ mulAvxTwo_5x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -28805,93 +18251,38 @@ mulAvxTwo_5x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -28902,93 +18293,38 @@ mulAvxTwo_5x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -28999,93 +18335,38 @@ mulAvxTwo_5x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -29168,99 +18449,44 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -29271,93 +18497,38 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -29368,93 +18539,38 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -29465,93 +18581,38 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -29562,93 +18623,38 @@ mulAvxTwo_5x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -29781,106 +18787,43 @@ mulAvxTwo_5x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -29891,106 +18834,43 @@ mulAvxTwo_5x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -30001,106 +18881,43 @@ mulAvxTwo_5x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 ADDQ $0x20, AX @@ -30111,106 +18928,43 @@ mulAvxTwo_5x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -30299,113 +19053,50 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU (R13), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU (R14), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU (R15), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU (R8), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (BX), Y11 ADDQ $0x20, BX @@ -30416,106 +19107,43 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -30526,106 +19154,43 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -30636,106 +19201,43 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (AX), Y11 ADDQ $0x20, AX @@ -30746,106 +19248,43 @@ mulAvxTwo_5x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs VMOVDQU Y0, (R9) ADDQ $0x20, R9 @@ -30965,119 +19404,48 @@ mulAvxTwo_5x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -31088,119 +19456,48 @@ mulAvxTwo_5x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -31211,119 +19508,48 @@ mulAvxTwo_5x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -31334,119 +19560,48 @@ mulAvxTwo_5x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -31519,135 +19674,64 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -31658,119 +19742,48 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -31781,119 +19794,48 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -31904,119 +19846,48 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -32027,119 +19898,48 @@ mulAvxTwo_5x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -32267,132 +20067,53 @@ mulAvxTwo_5x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -32403,132 +20124,53 @@ mulAvxTwo_5x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -32539,132 +20181,53 @@ mulAvxTwo_5x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -32675,132 +20238,53 @@ mulAvxTwo_5x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -32875,150 +20359,71 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R9), R11 VMOVDQU (R11)(R10*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R9), R11 VMOVDQU (R11)(R10*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R9), R11 VMOVDQU (R11)(R10*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R9), R11 VMOVDQU (R11)(R10*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R9), R11 VMOVDQU (R11)(R10*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R9), R11 VMOVDQU (R11)(R10*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R9), R11 VMOVDQU (R11)(R10*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R9), R11 VMOVDQU (R11)(R10*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R9), R11 VMOVDQU (R11)(R10*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -33029,132 +20434,53 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -33165,132 +20491,53 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -33301,132 +20548,53 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -33437,132 +20605,53 @@ mulAvxTwo_5x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R9), R11 VMOVDQU Y0, (R11)(R10*1) @@ -33661,15 +20750,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI @@ -33678,15 +20760,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI @@ -33695,15 +20770,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 @@ -33712,15 +20780,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX @@ -33729,15 +20790,8 @@ mulAvxTwo_6x1_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Store 1 outputs VMOVDQU Y12, (R9) ADDQ $0x20, R9 @@ -33823,23 +20877,9 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -33856,23 +20896,9 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -33889,23 +20915,9 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -33922,23 +20934,9 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -33955,23 +20953,9 @@ mulAvxTwo_6x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -34043,15 +21027,8 @@ mulAvxTwo_6x1Xor_loop: VMOVDQU (R9), Y12 VPSHUFB Y14, Y0, Y14 VPSHUFB Y15, Y1, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (BX), Y14 ADDQ $0x20, BX @@ -34060,15 +21037,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y2, Y14 VPSHUFB Y15, Y3, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (SI), Y14 ADDQ $0x20, SI @@ -34077,15 +21047,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y4, Y14 VPSHUFB Y15, Y5, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (DI), Y14 ADDQ $0x20, DI @@ -34094,15 +21057,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y6, Y14 VPSHUFB Y15, Y7, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R8), Y14 ADDQ $0x20, R8 @@ -34111,15 +21067,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y8, Y14 VPSHUFB Y15, Y9, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (CX), Y14 ADDQ $0x20, CX @@ -34128,15 +21077,8 @@ mulAvxTwo_6x1Xor_loop: VPAND Y13, Y15, Y15 VPSHUFB Y14, Y10, Y14 VPSHUFB Y15, Y11, Y15 + XOR3WAY( $0x00, Y14, Y15, Y12) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y14, Y15, Y12 - -#else - VPXOR Y14, Y12, Y12 - VPXOR Y15, Y12, Y12 - -#endif // Store 1 outputs VMOVDQU Y12, (R9) ADDQ $0x20, R9 @@ -34207,23 +21149,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -34240,23 +21168,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -34273,23 +21187,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -34306,23 +21206,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -34339,23 +21225,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -34372,23 +21244,9 @@ mulAvxTwo_6x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R10) VMOVDQU Y1, 32(R10) @@ -34468,28 +21326,13 @@ mulAvxTwo_6x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -34500,28 +21343,13 @@ mulAvxTwo_6x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -34532,28 +21360,13 @@ mulAvxTwo_6x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -34564,28 +21377,13 @@ mulAvxTwo_6x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -34596,28 +21394,13 @@ mulAvxTwo_6x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -34715,46 +21498,17 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -34771,46 +21525,17 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -34827,46 +21552,17 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -34883,46 +21579,17 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -34939,46 +21606,17 @@ mulAvxTwo_6x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -35045,29 +21683,14 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R10), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -35078,28 +21701,13 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -35110,28 +21718,13 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -35142,28 +21735,13 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -35174,28 +21752,13 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -35206,28 +21769,13 @@ mulAvxTwo_6x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -35304,46 +21852,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -35360,46 +21879,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -35416,46 +21906,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -35472,46 +21933,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -35528,46 +21960,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -35584,46 +21987,17 @@ mulAvxTwo_6x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -35713,41 +22087,18 @@ mulAvxTwo_6x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -35758,41 +22109,18 @@ mulAvxTwo_6x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -35803,41 +22131,18 @@ mulAvxTwo_6x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -35848,41 +22153,18 @@ mulAvxTwo_6x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -35893,41 +22175,18 @@ mulAvxTwo_6x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -36037,69 +22296,25 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -36116,69 +22331,25 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -36195,69 +22366,25 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -36274,69 +22401,25 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -36353,69 +22436,25 @@ mulAvxTwo_6x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -36487,43 +22526,20 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R10), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -36534,41 +22550,18 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -36579,41 +22572,18 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -36624,41 +22594,18 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -36669,41 +22616,18 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -36714,41 +22638,18 @@ mulAvxTwo_6x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -36831,69 +22732,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -36910,69 +22767,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -36989,69 +22802,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -37068,69 +22837,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -37147,69 +22872,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -37226,69 +22907,25 @@ mulAvxTwo_6x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -37388,54 +23025,23 @@ mulAvxTwo_6x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -37446,54 +23052,23 @@ mulAvxTwo_6x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -37504,54 +23079,23 @@ mulAvxTwo_6x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -37562,54 +23106,23 @@ mulAvxTwo_6x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -37620,54 +23133,23 @@ mulAvxTwo_6x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -37740,57 +23222,26 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R10), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -37801,54 +23252,23 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -37859,54 +23279,23 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -37917,54 +23306,23 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -37975,54 +23333,23 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -38033,54 +23360,23 @@ mulAvxTwo_6x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -38186,67 +23482,28 @@ mulAvxTwo_6x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -38257,67 +23514,28 @@ mulAvxTwo_6x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -38328,67 +23546,28 @@ mulAvxTwo_6x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -38399,67 +23578,28 @@ mulAvxTwo_6x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -38470,67 +23610,28 @@ mulAvxTwo_6x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -38607,71 +23708,32 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R10), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -38682,67 +23744,28 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -38753,67 +23776,28 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -38824,67 +23808,28 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -38895,67 +23840,28 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -38966,67 +23872,28 @@ mulAvxTwo_6x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -39141,80 +24008,33 @@ mulAvxTwo_6x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -39225,80 +24045,33 @@ mulAvxTwo_6x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -39309,80 +24082,33 @@ mulAvxTwo_6x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -39393,80 +24119,33 @@ mulAvxTwo_6x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -39477,80 +24156,33 @@ mulAvxTwo_6x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -39631,85 +24263,38 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -39720,80 +24305,33 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -39804,80 +24342,33 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -39888,80 +24379,33 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -39972,80 +24416,33 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -40056,80 +24453,33 @@ mulAvxTwo_6x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -40255,93 +24605,38 @@ mulAvxTwo_6x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -40352,93 +24647,38 @@ mulAvxTwo_6x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -40449,93 +24689,38 @@ mulAvxTwo_6x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -40546,93 +24731,38 @@ mulAvxTwo_6x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX @@ -40643,93 +24773,38 @@ mulAvxTwo_6x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -40816,99 +24891,44 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU (R13), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU (R14), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU (R15), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU (R9), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (BX), Y10 ADDQ $0x20, BX @@ -40919,93 +24939,38 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -41016,93 +24981,38 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -41113,93 +25023,38 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -41210,93 +25065,38 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (AX), Y10 ADDQ $0x20, AX @@ -41307,93 +25107,38 @@ mulAvxTwo_6x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs VMOVDQU Y0, (R10) ADDQ $0x20, R10 @@ -41508,106 +25253,43 @@ mulAvxTwo_6x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -41618,106 +25300,43 @@ mulAvxTwo_6x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -41728,106 +25347,43 @@ mulAvxTwo_6x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -41838,106 +25394,43 @@ mulAvxTwo_6x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -41948,106 +25441,43 @@ mulAvxTwo_6x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -42120,120 +25550,57 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -42244,106 +25611,43 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -42354,106 +25658,43 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -42464,106 +25705,43 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -42574,106 +25752,43 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -42684,106 +25799,43 @@ mulAvxTwo_6x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -42906,119 +25958,48 @@ mulAvxTwo_6x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -43029,119 +26010,48 @@ mulAvxTwo_6x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -43152,119 +26062,48 @@ mulAvxTwo_6x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -43275,119 +26114,48 @@ mulAvxTwo_6x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -43398,119 +26166,48 @@ mulAvxTwo_6x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -43585,135 +26282,64 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -43724,119 +26350,48 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -43847,119 +26402,48 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -43970,119 +26454,48 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -44093,119 +26506,48 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -44216,119 +26558,48 @@ mulAvxTwo_6x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -44458,132 +26729,53 @@ mulAvxTwo_6x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -44594,132 +26786,53 @@ mulAvxTwo_6x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -44730,132 +26843,53 @@ mulAvxTwo_6x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -44866,132 +26900,53 @@ mulAvxTwo_6x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -45002,132 +26957,53 @@ mulAvxTwo_6x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -45204,150 +27080,71 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R10), R12 VMOVDQU (R12)(R11*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R10), R12 VMOVDQU (R12)(R11*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R10), R12 VMOVDQU (R12)(R11*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R10), R12 VMOVDQU (R12)(R11*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R10), R12 VMOVDQU (R12)(R11*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R10), R12 VMOVDQU (R12)(R11*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R10), R12 VMOVDQU (R12)(R11*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R10), R12 VMOVDQU (R12)(R11*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R10), R12 VMOVDQU (R12)(R11*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -45358,132 +27155,53 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -45494,132 +27212,53 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -45630,132 +27269,53 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -45766,132 +27326,53 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -45902,132 +27383,53 @@ mulAvxTwo_6x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R10), R12 VMOVDQU Y0, (R12)(R11*1) @@ -46120,15 +27522,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -46139,15 +27534,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -46158,15 +27546,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -46177,15 +27558,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -46196,15 +27570,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -46215,15 +27582,8 @@ mulAvxTwo_7x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -46311,23 +27671,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -46344,23 +27690,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -46377,23 +27709,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -46410,23 +27728,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -46443,23 +27747,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -46476,23 +27766,9 @@ mulAvxTwo_7x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -46556,15 +27832,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -46575,15 +27844,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -46594,15 +27856,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -46613,15 +27868,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -46632,15 +27880,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -46651,15 +27892,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -46670,15 +27904,8 @@ mulAvxTwo_7x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -46751,23 +27978,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -46784,23 +27997,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -46817,23 +28016,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -46850,23 +28035,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -46883,23 +28054,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -46916,23 +28073,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -46949,23 +28092,9 @@ mulAvxTwo_7x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R11) VMOVDQU Y1, 32(R11) @@ -47047,28 +28176,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -47079,28 +28193,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -47111,28 +28210,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -47143,28 +28227,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -47175,28 +28244,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -47207,28 +28261,13 @@ mulAvxTwo_7x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -47328,46 +28367,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -47384,46 +28394,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -47440,46 +28421,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -47496,46 +28448,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -47552,46 +28475,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -47608,46 +28502,17 @@ mulAvxTwo_7x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -47716,29 +28581,14 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R11), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -47749,28 +28599,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -47781,28 +28616,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -47813,28 +28633,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -47845,28 +28650,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -47877,28 +28667,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -47909,28 +28684,13 @@ mulAvxTwo_7x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -48009,46 +28769,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -48065,46 +28796,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -48121,46 +28823,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -48177,46 +28850,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -48233,46 +28877,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -48289,46 +28904,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -48345,46 +28931,17 @@ mulAvxTwo_7x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -48476,41 +29033,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -48521,41 +29055,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -48566,41 +29077,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -48611,41 +29099,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -48656,41 +29121,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -48701,41 +29143,18 @@ mulAvxTwo_7x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -48847,69 +29266,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -48926,69 +29301,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -49005,69 +29336,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -49084,69 +29371,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -49163,69 +29406,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -49242,69 +29441,25 @@ mulAvxTwo_7x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -49378,43 +29533,20 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R11), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -49425,41 +29557,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -49470,41 +29579,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -49515,41 +29601,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -49560,41 +29623,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -49605,41 +29645,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -49650,41 +29667,18 @@ mulAvxTwo_7x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -49769,69 +29763,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -49848,69 +29798,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -49927,69 +29833,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -50006,69 +29868,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -50085,69 +29903,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -50164,69 +29938,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -50243,69 +29973,25 @@ mulAvxTwo_7x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -50407,54 +30093,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -50465,54 +30120,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -50523,54 +30147,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -50581,54 +30174,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -50639,54 +30201,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -50697,54 +30228,23 @@ mulAvxTwo_7x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -50819,57 +30319,26 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R11), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -50880,54 +30349,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -50938,54 +30376,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -50996,54 +30403,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -51054,54 +30430,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -51112,54 +30457,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -51170,54 +30484,23 @@ mulAvxTwo_7x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -51325,67 +30608,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -51396,67 +30640,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -51467,67 +30672,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -51538,67 +30704,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -51609,67 +30736,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -51680,67 +30768,28 @@ mulAvxTwo_7x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -51819,71 +30868,32 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -51894,67 +30904,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -51965,67 +30936,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -52036,67 +30968,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -52107,67 +31000,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -52178,67 +31032,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -52249,67 +31064,28 @@ mulAvxTwo_7x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -52428,80 +31204,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -52512,80 +31241,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -52596,80 +31278,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -52680,80 +31315,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -52764,80 +31352,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX @@ -52848,80 +31389,33 @@ mulAvxTwo_7x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -53006,85 +31500,38 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU (R14), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU (R15), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU (R10), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (BX), Y9 ADDQ $0x20, BX @@ -53095,80 +31542,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -53179,80 +31579,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -53263,80 +31616,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -53347,80 +31653,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -53431,80 +31690,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (AX), Y9 ADDQ $0x20, AX @@ -53515,80 +31727,33 @@ mulAvxTwo_7x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs VMOVDQU Y0, (R11) ADDQ $0x20, R11 @@ -53698,93 +31863,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -53795,93 +31905,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -53892,93 +31947,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -53989,93 +31989,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -54086,93 +32031,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -54183,93 +32073,38 @@ mulAvxTwo_7x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -54342,105 +32177,50 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -54451,93 +32231,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -54548,93 +32273,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -54645,93 +32315,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -54742,93 +32357,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -54839,93 +32399,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -54936,93 +32441,38 @@ mulAvxTwo_7x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -55140,106 +32590,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -55250,106 +32637,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -55360,106 +32684,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -55470,106 +32731,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -55580,106 +32778,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -55690,106 +32825,43 @@ mulAvxTwo_7x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -55864,120 +32936,57 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -55988,106 +32997,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -56098,106 +33044,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -56208,106 +33091,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -56318,106 +33138,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -56428,106 +33185,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -56538,106 +33232,43 @@ mulAvxTwo_7x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -56762,119 +33393,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -56885,119 +33445,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -57008,119 +33497,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -57131,119 +33549,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -57254,119 +33601,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -57377,119 +33653,48 @@ mulAvxTwo_7x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -57566,135 +33771,64 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -57705,119 +33839,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -57828,119 +33891,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -57951,119 +33943,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -58074,119 +33995,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -58197,119 +34047,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -58320,119 +34099,48 @@ mulAvxTwo_7x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -58564,132 +34272,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -58700,132 +34329,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -58836,132 +34386,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -58972,132 +34443,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -59108,132 +34500,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -59244,132 +34557,53 @@ mulAvxTwo_7x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -59448,150 +34682,71 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R11), R13 VMOVDQU (R13)(R12*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R11), R13 VMOVDQU (R13)(R12*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R11), R13 VMOVDQU (R13)(R12*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R11), R13 VMOVDQU (R13)(R12*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R11), R13 VMOVDQU (R13)(R12*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R11), R13 VMOVDQU (R13)(R12*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R11), R13 VMOVDQU (R13)(R12*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R11), R13 VMOVDQU (R13)(R12*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R11), R13 VMOVDQU (R13)(R12*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -59602,132 +34757,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -59738,132 +34814,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -59874,132 +34871,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -60010,132 +34928,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -60146,132 +34985,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -60282,132 +35042,53 @@ mulAvxTwo_7x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R11), R13 VMOVDQU Y0, (R13)(R12*1) @@ -60502,15 +35183,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -60521,15 +35195,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -60540,15 +35207,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -60559,15 +35219,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -60578,15 +35231,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -60597,15 +35243,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -60616,15 +35255,8 @@ mulAvxTwo_8x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -60714,23 +35346,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -60747,23 +35365,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -60780,23 +35384,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -60813,23 +35403,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -60846,23 +35422,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -60879,23 +35441,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -60912,23 +35460,9 @@ mulAvxTwo_8x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -60994,15 +35528,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -61013,15 +35540,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -61032,15 +35552,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -61051,15 +35564,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -61070,15 +35576,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -61089,15 +35588,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -61108,15 +35600,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -61127,15 +35612,8 @@ mulAvxTwo_8x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -61210,23 +35688,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -61243,23 +35707,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -61276,23 +35726,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -61309,23 +35745,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -61342,23 +35764,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -61375,23 +35783,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -61408,23 +35802,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -61441,23 +35821,9 @@ mulAvxTwo_8x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R12) VMOVDQU Y1, 32(R12) @@ -61541,28 +35907,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -61573,28 +35924,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -61605,28 +35941,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -61637,28 +35958,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -61669,28 +35975,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -61701,28 +35992,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -61733,28 +36009,13 @@ mulAvxTwo_8x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -61856,46 +36117,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -61912,46 +36144,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -61968,46 +36171,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -62024,46 +36198,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -62080,46 +36225,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -62136,46 +36252,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -62192,46 +36279,17 @@ mulAvxTwo_8x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -62302,29 +36360,14 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R12), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -62335,28 +36378,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -62367,28 +36395,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -62399,28 +36412,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -62431,28 +36429,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -62463,28 +36446,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -62495,28 +36463,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -62527,28 +36480,13 @@ mulAvxTwo_8x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -62629,46 +36567,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -62685,46 +36594,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -62741,46 +36621,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -62797,46 +36648,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -62853,46 +36675,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -62909,46 +36702,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -62965,46 +36729,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -63021,46 +36756,17 @@ mulAvxTwo_8x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -63154,41 +36860,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -63199,41 +36882,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -63244,41 +36904,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -63289,41 +36926,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -63334,41 +36948,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -63379,41 +36970,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -63424,41 +36992,18 @@ mulAvxTwo_8x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -63572,69 +37117,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -63651,69 +37152,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -63730,69 +37187,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -63809,69 +37222,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -63888,69 +37257,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -63967,69 +37292,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -64046,69 +37327,25 @@ mulAvxTwo_8x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -64184,43 +37421,20 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R12), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -64231,41 +37445,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -64276,41 +37467,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -64321,41 +37489,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -64366,41 +37511,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -64411,41 +37533,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -64456,41 +37555,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -64501,41 +37577,18 @@ mulAvxTwo_8x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -64622,69 +37675,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -64701,69 +37710,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -64780,69 +37745,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -64859,69 +37780,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -64938,69 +37815,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -65017,69 +37850,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -65096,69 +37885,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -65175,69 +37920,25 @@ mulAvxTwo_8x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -65341,54 +38042,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -65399,54 +38069,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -65457,54 +38096,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -65515,54 +38123,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -65573,54 +38150,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -65631,54 +38177,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -65689,54 +38204,23 @@ mulAvxTwo_8x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -65813,57 +38297,26 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -65874,54 +38327,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -65932,54 +38354,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -65990,54 +38381,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -66048,54 +38408,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -66106,54 +38435,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -66164,54 +38462,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -66222,54 +38489,23 @@ mulAvxTwo_8x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -66381,67 +38617,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -66452,67 +38649,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -66523,67 +38681,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -66594,67 +38713,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -66665,67 +38745,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -66736,67 +38777,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX @@ -66807,67 +38809,28 @@ mulAvxTwo_8x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -66950,71 +38913,32 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU (R14), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU (R15), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU (R11), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (BX), Y8 ADDQ $0x20, BX @@ -67025,67 +38949,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -67096,67 +38981,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -67167,67 +39013,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -67238,67 +39045,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -67309,67 +39077,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -67380,67 +39109,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (AX), Y8 ADDQ $0x20, AX @@ -67451,67 +39141,28 @@ mulAvxTwo_8x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs VMOVDQU Y0, (R12) ADDQ $0x20, R12 @@ -67616,80 +39267,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -67700,80 +39304,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -67784,80 +39341,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -67868,80 +39378,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -67952,80 +39415,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -68036,80 +39452,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -68120,80 +39489,33 @@ mulAvxTwo_8x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -68266,90 +39588,43 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -68360,80 +39635,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -68444,80 +39672,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -68528,80 +39709,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -68612,80 +39746,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -68696,80 +39783,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -68780,80 +39820,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -68864,80 +39857,33 @@ mulAvxTwo_8x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -69050,93 +39996,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -69147,93 +40038,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -69244,93 +40080,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -69341,93 +40122,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -69438,93 +40164,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -69535,93 +40206,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -69632,93 +40248,38 @@ mulAvxTwo_8x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -69793,105 +40354,50 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -69902,93 +40408,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -69999,93 +40450,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -70096,93 +40492,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -70193,93 +40534,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -70290,93 +40576,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -70387,93 +40618,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -70484,93 +40660,38 @@ mulAvxTwo_8x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -70690,106 +40811,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -70800,106 +40858,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -70910,106 +40905,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -71020,106 +40952,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -71130,106 +40999,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -71240,106 +41046,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -71350,106 +41093,43 @@ mulAvxTwo_8x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -71526,120 +41206,57 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -71650,106 +41267,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -71760,106 +41314,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -71870,106 +41361,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -71980,106 +41408,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -72090,106 +41455,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -72200,106 +41502,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -72310,106 +41549,43 @@ mulAvxTwo_8x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -72536,119 +41712,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -72659,119 +41764,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -72782,119 +41816,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -72905,119 +41868,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -73028,119 +41920,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -73151,119 +41972,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -73274,119 +42024,48 @@ mulAvxTwo_8x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -73465,135 +42144,64 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -73604,119 +42212,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -73727,119 +42264,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -73850,119 +42316,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -73973,119 +42368,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -74096,119 +42420,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -74219,119 +42472,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -74342,119 +42524,48 @@ mulAvxTwo_8x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -74588,132 +42699,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -74724,132 +42756,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -74860,132 +42813,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -74996,132 +42870,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -75132,132 +42927,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -75268,132 +42984,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -75404,132 +43041,53 @@ mulAvxTwo_8x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -75610,150 +43168,71 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R12), R14 VMOVDQU (R14)(R13*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R12), R14 VMOVDQU (R14)(R13*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R12), R14 VMOVDQU (R14)(R13*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R12), R14 VMOVDQU (R14)(R13*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R12), R14 VMOVDQU (R14)(R13*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R12), R14 VMOVDQU (R14)(R13*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R12), R14 VMOVDQU (R14)(R13*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R12), R14 VMOVDQU (R14)(R13*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R12), R14 VMOVDQU (R14)(R13*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -75764,132 +43243,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -75900,132 +43300,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -76036,132 +43357,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -76172,132 +43414,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -76308,132 +43471,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -76444,132 +43528,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -76580,132 +43585,53 @@ mulAvxTwo_8x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R12), R14 VMOVDQU Y0, (R14)(R13*1) @@ -76802,15 +43728,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -76821,15 +43740,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -76840,15 +43752,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -76859,15 +43764,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -76878,15 +43776,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -76897,15 +43788,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -76916,15 +43800,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -76935,15 +43812,8 @@ mulAvxTwo_9x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -77035,23 +43905,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -77068,23 +43924,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -77101,23 +43943,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -77134,23 +43962,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -77167,23 +43981,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -77200,23 +44000,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -77233,23 +44019,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -77266,23 +44038,9 @@ mulAvxTwo_9x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -77350,15 +44108,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -77369,15 +44120,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -77388,15 +44132,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -77407,15 +44144,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -77426,15 +44156,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -77445,15 +44168,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -77464,15 +44180,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -77483,15 +44192,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -77502,15 +44204,8 @@ mulAvxTwo_9x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -77587,23 +44282,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -77620,23 +44301,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -77653,23 +44320,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -77686,23 +44339,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -77719,23 +44358,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -77752,23 +44377,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -77785,23 +44396,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -77818,23 +44415,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -77851,23 +44434,9 @@ mulAvxTwo_9x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R13) VMOVDQU Y1, 32(R13) @@ -77953,28 +44522,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -77985,28 +44539,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -78017,28 +44556,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -78049,28 +44573,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -78081,28 +44590,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -78113,28 +44607,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -78145,28 +44624,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -78177,28 +44641,13 @@ mulAvxTwo_9x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -78302,46 +44751,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -78358,46 +44778,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -78414,46 +44805,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -78470,46 +44832,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -78526,46 +44859,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -78582,46 +44886,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -78638,46 +44913,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -78694,46 +44940,17 @@ mulAvxTwo_9x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -78806,29 +45023,14 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R13), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -78839,28 +45041,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -78871,28 +45058,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -78903,28 +45075,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -78935,28 +45092,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -78967,28 +45109,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -78999,28 +45126,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -79031,28 +45143,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -79063,28 +45160,13 @@ mulAvxTwo_9x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -79167,46 +45249,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -79223,46 +45276,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -79279,46 +45303,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -79335,46 +45330,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -79391,46 +45357,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -79447,46 +45384,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -79503,46 +45411,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -79559,46 +45438,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -79615,46 +45465,17 @@ mulAvxTwo_9x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -79750,41 +45571,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -79795,41 +45593,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -79840,41 +45615,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -79885,41 +45637,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -79930,41 +45659,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -79975,41 +45681,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -80020,41 +45703,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -80065,41 +45725,18 @@ mulAvxTwo_9x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -80215,69 +45852,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -80294,69 +45887,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -80373,69 +45922,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -80452,69 +45957,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -80531,69 +45992,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -80610,69 +46027,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -80689,69 +46062,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -80768,69 +46097,25 @@ mulAvxTwo_9x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -80908,43 +46193,20 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -80955,41 +46217,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -81000,41 +46239,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -81045,41 +46261,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -81090,41 +46283,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -81135,41 +46305,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -81180,41 +46327,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -81225,41 +46349,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (DX), Y6 ADDQ $0x20, DX @@ -81270,41 +46371,18 @@ mulAvxTwo_9x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -81393,69 +46471,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -81472,69 +46506,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -81551,69 +46541,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -81630,69 +46576,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -81709,69 +46611,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -81788,69 +46646,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -81867,69 +46681,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -81946,69 +46716,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (DX), Y11 VMOVDQU 32(DX), Y13 @@ -82025,69 +46751,25 @@ mulAvxTwo_9x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -82195,54 +46877,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -82253,54 +46904,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -82311,54 +46931,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -82369,54 +46958,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -82427,54 +46985,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -82485,54 +47012,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -82543,54 +47039,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX @@ -82601,54 +47066,23 @@ mulAvxTwo_9x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -82729,57 +47163,26 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU (R15), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU (R12), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (BX), Y7 ADDQ $0x20, BX @@ -82790,54 +47193,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -82848,54 +47220,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -82906,54 +47247,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -82964,54 +47274,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -83022,54 +47301,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -83080,54 +47328,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -83138,54 +47355,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (AX), Y7 ADDQ $0x20, AX @@ -83196,54 +47382,23 @@ mulAvxTwo_9x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs VMOVDQU Y0, (R13) ADDQ $0x20, R13 @@ -83343,67 +47498,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -83414,67 +47530,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -83485,67 +47562,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -83556,67 +47594,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -83627,67 +47626,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -83698,67 +47658,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -83769,67 +47690,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -83840,67 +47722,28 @@ mulAvxTwo_9x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -83973,75 +47816,36 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -84052,67 +47856,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -84123,67 +47888,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -84194,67 +47920,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -84265,67 +47952,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -84336,67 +47984,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -84407,67 +48016,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -84478,67 +48048,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -84549,67 +48080,28 @@ mulAvxTwo_9x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -84717,80 +48209,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -84801,80 +48246,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -84885,80 +48283,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -84969,80 +48320,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -85053,80 +48357,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -85137,80 +48394,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -85221,80 +48431,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -85305,80 +48468,33 @@ mulAvxTwo_9x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -85453,90 +48569,43 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -85547,80 +48616,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -85631,80 +48653,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -85715,80 +48690,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -85799,80 +48727,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -85883,80 +48764,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -85967,80 +48801,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -86051,80 +48838,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -86135,80 +48875,33 @@ mulAvxTwo_9x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -86323,93 +49016,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -86420,93 +49058,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -86517,93 +49100,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -86614,93 +49142,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -86711,93 +49184,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -86808,93 +49226,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -86905,93 +49268,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -87002,93 +49310,38 @@ mulAvxTwo_9x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -87165,105 +49418,50 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -87274,93 +49472,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -87371,93 +49514,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -87468,93 +49556,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -87565,93 +49598,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -87662,93 +49640,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -87759,93 +49682,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -87856,93 +49724,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -87953,93 +49766,38 @@ mulAvxTwo_9x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -88161,106 +49919,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -88271,106 +49966,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -88381,106 +50013,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -88491,106 +50060,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -88601,106 +50107,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -88711,106 +50154,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -88821,106 +50201,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -88931,106 +50248,43 @@ mulAvxTwo_9x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -89109,120 +50363,57 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -89233,106 +50424,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -89343,106 +50471,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -89453,106 +50518,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -89563,106 +50565,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -89673,106 +50612,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -89783,106 +50659,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -89893,106 +50706,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -90003,106 +50753,43 @@ mulAvxTwo_9x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -90231,119 +50918,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -90354,119 +50970,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -90477,119 +51022,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -90600,119 +51074,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -90723,119 +51126,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -90846,119 +51178,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -90969,119 +51230,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -91092,119 +51282,48 @@ mulAvxTwo_9x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -91285,135 +51404,64 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -91424,119 +51472,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -91547,119 +51524,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -91670,119 +51576,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -91793,119 +51628,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -91916,119 +51680,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -92039,119 +51732,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -92162,119 +51784,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -92285,119 +51836,48 @@ mulAvxTwo_9x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -92533,132 +52013,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -92669,132 +52070,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -92805,132 +52127,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -92941,132 +52184,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -93077,132 +52241,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -93213,132 +52298,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -93349,132 +52355,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -93485,132 +52412,53 @@ mulAvxTwo_9x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -93693,150 +52541,71 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R13), R15 VMOVDQU (R15)(R14*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R13), R15 VMOVDQU (R15)(R14*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R13), R15 VMOVDQU (R15)(R14*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R13), R15 VMOVDQU (R15)(R14*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R13), R15 VMOVDQU (R15)(R14*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R13), R15 VMOVDQU (R15)(R14*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R13), R15 VMOVDQU (R15)(R14*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R13), R15 VMOVDQU (R15)(R14*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R13), R15 VMOVDQU (R15)(R14*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -93847,132 +52616,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -93983,132 +52673,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -94119,132 +52730,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -94255,132 +52787,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -94391,132 +52844,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -94527,132 +52901,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -94663,132 +52958,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -94799,132 +53015,53 @@ mulAvxTwo_9x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R13), R15 VMOVDQU Y0, (R15)(R14*1) @@ -95023,15 +53160,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -95042,15 +53172,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -95061,15 +53184,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -95080,15 +53196,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -95099,15 +53208,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -95118,15 +53220,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -95137,15 +53232,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 ADDQ $0x20, R13 @@ -95156,15 +53244,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -95175,15 +53256,8 @@ mulAvxTwo_10x1_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -95277,23 +53351,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -95310,23 +53370,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -95343,23 +53389,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -95376,23 +53408,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -95409,23 +53427,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -95442,23 +53446,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -95475,23 +53465,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 @@ -95508,23 +53484,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -95541,23 +53503,9 @@ mulAvxTwo_10x1_64_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -95627,15 +53575,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 32(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 1 to 1 outputs VMOVDQU (SI), Y4 ADDQ $0x20, SI @@ -95646,15 +53587,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 96(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 2 to 1 outputs VMOVDQU (DI), Y4 ADDQ $0x20, DI @@ -95665,15 +53599,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 160(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 3 to 1 outputs VMOVDQU (R8), Y4 ADDQ $0x20, R8 @@ -95684,15 +53611,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 224(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 4 to 1 outputs VMOVDQU (R9), Y4 ADDQ $0x20, R9 @@ -95703,15 +53623,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 288(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 5 to 1 outputs VMOVDQU (R10), Y4 ADDQ $0x20, R10 @@ -95722,15 +53635,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 352(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 6 to 1 outputs VMOVDQU (R11), Y4 ADDQ $0x20, R11 @@ -95741,15 +53647,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 416(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 7 to 1 outputs VMOVDQU (R12), Y4 ADDQ $0x20, R12 @@ -95760,15 +53659,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 480(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 8 to 1 outputs VMOVDQU (R13), Y4 ADDQ $0x20, R13 @@ -95779,15 +53671,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 544(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Load and process 32 bytes from input 9 to 1 outputs VMOVDQU (DX), Y4 ADDQ $0x20, DX @@ -95798,15 +53683,8 @@ mulAvxTwo_10x1Xor_loop: VMOVDQU 608(CX), Y3 VPSHUFB Y4, Y2, Y2 VPSHUFB Y5, Y3, Y3 + XOR3WAY( $0x00, Y2, Y3, Y0) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y2, Y3, Y0 - -#else - VPXOR Y2, Y0, Y0 - VPXOR Y3, Y0, Y0 - -#endif // Store 1 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -95885,23 +53763,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 1 to 1 outputs VMOVDQU (SI), Y6 VMOVDQU 32(SI), Y5 @@ -95918,23 +53782,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 2 to 1 outputs VMOVDQU (DI), Y6 VMOVDQU 32(DI), Y5 @@ -95951,23 +53801,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 3 to 1 outputs VMOVDQU (R8), Y6 VMOVDQU 32(R8), Y5 @@ -95984,23 +53820,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 4 to 1 outputs VMOVDQU (R9), Y6 VMOVDQU 32(R9), Y5 @@ -96017,23 +53839,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 5 to 1 outputs VMOVDQU (R10), Y6 VMOVDQU 32(R10), Y5 @@ -96050,23 +53858,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 6 to 1 outputs VMOVDQU (R11), Y6 VMOVDQU 32(R11), Y5 @@ -96083,23 +53877,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 7 to 1 outputs VMOVDQU (R12), Y6 VMOVDQU 32(R12), Y5 @@ -96116,23 +53896,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 8 to 1 outputs VMOVDQU (R13), Y6 VMOVDQU 32(R13), Y5 @@ -96149,23 +53915,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Load and process 64 bytes from input 9 to 1 outputs VMOVDQU (DX), Y6 VMOVDQU 32(DX), Y5 @@ -96182,23 +53934,9 @@ mulAvxTwo_10x1_64Xor_loop: VPSHUFB Y6, Y3, Y3 VPSHUFB Y8, Y4, Y6 VPSHUFB Y7, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y0) + XOR3WAY( $0x00, Y5, Y6, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif // Store 1 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -96286,28 +54024,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -96318,28 +54041,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -96350,28 +54058,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -96382,28 +54075,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -96414,28 +54092,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -96446,28 +54109,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -96478,28 +54126,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 ADDQ $0x20, R13 @@ -96510,28 +54143,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -96542,28 +54160,13 @@ mulAvxTwo_10x2_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R15) ADDQ $0x20, R15 @@ -96669,46 +54272,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -96725,46 +54299,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -96781,46 +54326,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -96837,46 +54353,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -96893,46 +54380,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -96949,46 +54407,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -97005,46 +54434,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 @@ -97061,46 +54461,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -97117,46 +54488,17 @@ mulAvxTwo_10x2_64_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) @@ -97231,29 +54573,14 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 32(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU (R14), Y1 VMOVDQU 64(CX), Y3 VMOVDQU 96(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 1 to 2 outputs VMOVDQU (SI), Y5 ADDQ $0x20, SI @@ -97264,28 +54591,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 160(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 192(CX), Y3 VMOVDQU 224(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 2 to 2 outputs VMOVDQU (DI), Y5 ADDQ $0x20, DI @@ -97296,28 +54608,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 288(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 320(CX), Y3 VMOVDQU 352(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 3 to 2 outputs VMOVDQU (R8), Y5 ADDQ $0x20, R8 @@ -97328,28 +54625,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 416(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 448(CX), Y3 VMOVDQU 480(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 4 to 2 outputs VMOVDQU (R9), Y5 ADDQ $0x20, R9 @@ -97360,28 +54642,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 544(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 576(CX), Y3 VMOVDQU 608(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 5 to 2 outputs VMOVDQU (R10), Y5 ADDQ $0x20, R10 @@ -97392,28 +54659,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 672(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 704(CX), Y3 VMOVDQU 736(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 6 to 2 outputs VMOVDQU (R11), Y5 ADDQ $0x20, R11 @@ -97424,28 +54676,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 800(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 832(CX), Y3 VMOVDQU 864(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 7 to 2 outputs VMOVDQU (R12), Y5 ADDQ $0x20, R12 @@ -97456,28 +54693,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 928(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 960(CX), Y3 VMOVDQU 992(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 8 to 2 outputs VMOVDQU (R13), Y5 ADDQ $0x20, R13 @@ -97488,28 +54710,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1056(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1088(CX), Y3 VMOVDQU 1120(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Load and process 32 bytes from input 9 to 2 outputs VMOVDQU (DX), Y5 ADDQ $0x20, DX @@ -97520,28 +54727,13 @@ mulAvxTwo_10x2Xor_loop: VMOVDQU 1184(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y0 - -#else - VPXOR Y3, Y0, Y0 - VPXOR Y4, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y3, Y4, Y0) VMOVDQU 1216(CX), Y3 VMOVDQU 1248(CX), Y4 VPSHUFB Y5, Y3, Y3 VPSHUFB Y6, Y4, Y4 + XOR3WAY( $0x00, Y3, Y4, Y1) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y3, Y4, Y1 - -#else - VPXOR Y3, Y1, Y1 - VPXOR Y4, Y1, Y1 - -#endif // Store 2 outputs VMOVDQU Y0, (R15) ADDQ $0x20, R15 @@ -97626,46 +54818,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 1 to 2 outputs VMOVDQU (SI), Y9 VMOVDQU 32(SI), Y11 @@ -97682,46 +54845,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 2 to 2 outputs VMOVDQU (DI), Y9 VMOVDQU 32(DI), Y11 @@ -97738,46 +54872,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 3 to 2 outputs VMOVDQU (R8), Y9 VMOVDQU 32(R8), Y11 @@ -97794,46 +54899,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 4 to 2 outputs VMOVDQU (R9), Y9 VMOVDQU 32(R9), Y11 @@ -97850,46 +54926,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 5 to 2 outputs VMOVDQU (R10), Y9 VMOVDQU 32(R10), Y11 @@ -97906,46 +54953,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 6 to 2 outputs VMOVDQU (R11), Y9 VMOVDQU 32(R11), Y11 @@ -97962,46 +54980,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 7 to 2 outputs VMOVDQU (R12), Y9 VMOVDQU 32(R12), Y11 @@ -98018,46 +55007,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 8 to 2 outputs VMOVDQU (R13), Y9 VMOVDQU 32(R13), Y11 @@ -98074,46 +55034,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Load and process 64 bytes from input 9 to 2 outputs VMOVDQU (DX), Y9 VMOVDQU 32(DX), Y11 @@ -98130,46 +55061,17 @@ mulAvxTwo_10x2_64Xor_loop: VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y11, Y5, Y7 VPSHUFB Y9, Y5, Y5 VPSHUFB Y12, Y6, Y8 VPSHUFB Y10, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y2) + XOR3WAY( $0x00, Y7, Y8, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif // Store 2 outputs VMOVDQU Y0, (R15) VMOVDQU Y1, 32(R15) @@ -98269,41 +55171,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -98314,41 +55193,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -98359,41 +55215,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -98404,41 +55237,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -98449,41 +55259,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -98494,41 +55281,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -98539,41 +55303,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -98584,41 +55325,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 ADDQ $0x20, AX @@ -98629,41 +55347,18 @@ mulAvxTwo_10x3_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -98785,69 +55480,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -98864,69 +55515,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -98943,69 +55550,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -99022,69 +55585,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -99101,69 +55620,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -99180,69 +55655,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -99259,69 +55690,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -99338,69 +55725,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 @@ -99417,69 +55760,25 @@ mulAvxTwo_10x3_64_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -99561,43 +55860,20 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 32(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU (R15), Y1 VMOVDQU 64(CX), Y4 VMOVDQU 96(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU (R13), Y2 VMOVDQU 128(CX), Y4 VMOVDQU 160(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 1 to 3 outputs VMOVDQU (BX), Y6 ADDQ $0x20, BX @@ -99608,41 +55884,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 224(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 256(CX), Y4 VMOVDQU 288(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 320(CX), Y4 VMOVDQU 352(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 2 to 3 outputs VMOVDQU (SI), Y6 ADDQ $0x20, SI @@ -99653,41 +55906,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 416(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 448(CX), Y4 VMOVDQU 480(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 512(CX), Y4 VMOVDQU 544(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 3 to 3 outputs VMOVDQU (DI), Y6 ADDQ $0x20, DI @@ -99698,41 +55928,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 608(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 640(CX), Y4 VMOVDQU 672(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 704(CX), Y4 VMOVDQU 736(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 4 to 3 outputs VMOVDQU (R8), Y6 ADDQ $0x20, R8 @@ -99743,41 +55950,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 800(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 832(CX), Y4 VMOVDQU 864(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 896(CX), Y4 VMOVDQU 928(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 5 to 3 outputs VMOVDQU (R9), Y6 ADDQ $0x20, R9 @@ -99788,41 +55972,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 992(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1024(CX), Y4 VMOVDQU 1056(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1088(CX), Y4 VMOVDQU 1120(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 6 to 3 outputs VMOVDQU (R10), Y6 ADDQ $0x20, R10 @@ -99833,41 +55994,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1184(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1216(CX), Y4 VMOVDQU 1248(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1280(CX), Y4 VMOVDQU 1312(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 7 to 3 outputs VMOVDQU (R11), Y6 ADDQ $0x20, R11 @@ -99878,41 +56016,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1376(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1408(CX), Y4 VMOVDQU 1440(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1472(CX), Y4 VMOVDQU 1504(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 8 to 3 outputs VMOVDQU (R12), Y6 ADDQ $0x20, R12 @@ -99923,41 +56038,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1568(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1600(CX), Y4 VMOVDQU 1632(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1664(CX), Y4 VMOVDQU 1696(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Load and process 32 bytes from input 9 to 3 outputs VMOVDQU (AX), Y6 ADDQ $0x20, AX @@ -99968,41 +56060,18 @@ mulAvxTwo_10x3Xor_loop: VMOVDQU 1760(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y0 - -#else - VPXOR Y4, Y0, Y0 - VPXOR Y5, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y0) VMOVDQU 1792(CX), Y4 VMOVDQU 1824(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y1 - -#else - VPXOR Y4, Y1, Y1 - VPXOR Y5, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y4, Y5, Y1) VMOVDQU 1856(CX), Y4 VMOVDQU 1888(CX), Y5 VPSHUFB Y6, Y4, Y4 VPSHUFB Y7, Y5, Y5 + XOR3WAY( $0x00, Y4, Y5, Y2) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y4, Y5, Y2 - -#else - VPXOR Y4, Y2, Y2 - VPXOR Y5, Y2, Y2 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) ADDQ $0x20, R14 @@ -100097,69 +56166,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 1 to 3 outputs VMOVDQU (BX), Y11 VMOVDQU 32(BX), Y13 @@ -100176,69 +56201,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 2 to 3 outputs VMOVDQU (SI), Y11 VMOVDQU 32(SI), Y13 @@ -100255,69 +56236,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 3 to 3 outputs VMOVDQU (DI), Y11 VMOVDQU 32(DI), Y13 @@ -100334,69 +56271,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 4 to 3 outputs VMOVDQU (R8), Y11 VMOVDQU 32(R8), Y13 @@ -100413,69 +56306,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 5 to 3 outputs VMOVDQU (R9), Y11 VMOVDQU 32(R9), Y13 @@ -100492,69 +56341,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 6 to 3 outputs VMOVDQU (R10), Y11 VMOVDQU 32(R10), Y13 @@ -100571,69 +56376,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 7 to 3 outputs VMOVDQU (R11), Y11 VMOVDQU 32(R11), Y13 @@ -100650,69 +56411,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 8 to 3 outputs VMOVDQU (R12), Y11 VMOVDQU 32(R12), Y13 @@ -100729,69 +56446,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Load and process 64 bytes from input 9 to 3 outputs VMOVDQU (AX), Y11 VMOVDQU 32(AX), Y13 @@ -100808,69 +56481,25 @@ mulAvxTwo_10x3_64Xor_loop: VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y13, Y7, Y9 VPSHUFB Y11, Y7, Y7 VPSHUFB Y14, Y8, Y10 VPSHUFB Y12, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y4) + XOR3WAY( $0x00, Y9, Y10, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif // Store 3 outputs VMOVDQU Y0, (R14) VMOVDQU Y1, 32(R14) @@ -100968,54 +56597,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -101026,54 +56624,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -101084,54 +56651,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -101142,54 +56678,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -101200,54 +56705,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -101258,54 +56732,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 @@ -101316,54 +56759,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 @@ -101374,54 +56786,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -101432,54 +56813,23 @@ mulAvxTwo_10x4_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -101552,60 +56902,29 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 32(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y5 VMOVDQU 96(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y5 VMOVDQU 160(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y5 VMOVDQU 224(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 1 to 4 outputs VMOVDQU (SI), Y7 ADDQ $0x20, SI @@ -101616,54 +56935,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 288(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 320(CX), Y5 VMOVDQU 352(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 384(CX), Y5 VMOVDQU 416(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 448(CX), Y5 VMOVDQU 480(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 2 to 4 outputs VMOVDQU (DI), Y7 ADDQ $0x20, DI @@ -101674,54 +56962,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 544(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 576(CX), Y5 VMOVDQU 608(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 640(CX), Y5 VMOVDQU 672(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 704(CX), Y5 VMOVDQU 736(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 3 to 4 outputs VMOVDQU (R8), Y7 ADDQ $0x20, R8 @@ -101732,54 +56989,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 800(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 832(CX), Y5 VMOVDQU 864(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 896(CX), Y5 VMOVDQU 928(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 960(CX), Y5 VMOVDQU 992(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 4 to 4 outputs VMOVDQU (R9), Y7 ADDQ $0x20, R9 @@ -101790,54 +57016,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1056(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1088(CX), Y5 VMOVDQU 1120(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1152(CX), Y5 VMOVDQU 1184(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1216(CX), Y5 VMOVDQU 1248(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 5 to 4 outputs VMOVDQU (R10), Y7 ADDQ $0x20, R10 @@ -101848,54 +57043,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1312(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1344(CX), Y5 VMOVDQU 1376(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1408(CX), Y5 VMOVDQU 1440(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1472(CX), Y5 VMOVDQU 1504(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 6 to 4 outputs VMOVDQU (R11), Y7 ADDQ $0x20, R11 @@ -101906,54 +57070,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1568(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1600(CX), Y5 VMOVDQU 1632(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1664(CX), Y5 VMOVDQU 1696(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1728(CX), Y5 VMOVDQU 1760(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 7 to 4 outputs VMOVDQU (R12), Y7 ADDQ $0x20, R12 @@ -101964,54 +57097,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 1824(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 1856(CX), Y5 VMOVDQU 1888(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 1920(CX), Y5 VMOVDQU 1952(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 1984(CX), Y5 VMOVDQU 2016(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 8 to 4 outputs VMOVDQU (R13), Y7 ADDQ $0x20, R13 @@ -102022,54 +57124,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2080(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2112(CX), Y5 VMOVDQU 2144(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2176(CX), Y5 VMOVDQU 2208(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2240(CX), Y5 VMOVDQU 2272(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Load and process 32 bytes from input 9 to 4 outputs VMOVDQU (DX), Y7 ADDQ $0x20, DX @@ -102080,54 +57151,23 @@ mulAvxTwo_10x4Xor_loop: VMOVDQU 2336(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y0 - -#else - VPXOR Y5, Y0, Y0 - VPXOR Y6, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y0) VMOVDQU 2368(CX), Y5 VMOVDQU 2400(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y1 - -#else - VPXOR Y5, Y1, Y1 - VPXOR Y6, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y1) VMOVDQU 2432(CX), Y5 VMOVDQU 2464(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y2 - -#else - VPXOR Y5, Y2, Y2 - VPXOR Y6, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y5, Y6, Y2) VMOVDQU 2496(CX), Y5 VMOVDQU 2528(CX), Y6 VPSHUFB Y7, Y5, Y5 VPSHUFB Y8, Y6, Y6 + XOR3WAY( $0x00, Y5, Y6, Y3) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y5, Y6, Y3 - -#else - VPXOR Y5, Y3, Y3 - VPXOR Y6, Y3, Y3 - -#endif // Store 4 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -102230,67 +57270,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -102301,67 +57302,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -102372,67 +57334,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -102443,67 +57366,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -102514,67 +57398,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -102585,67 +57430,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -102656,67 +57462,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 @@ -102727,67 +57494,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -102798,67 +57526,28 @@ mulAvxTwo_10x5_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -102933,75 +57622,36 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 32(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y6 VMOVDQU 96(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y6 VMOVDQU 160(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y6 VMOVDQU 224(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y6 VMOVDQU 288(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 1 to 5 outputs VMOVDQU (SI), Y8 ADDQ $0x20, SI @@ -103012,67 +57662,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 352(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 384(CX), Y6 VMOVDQU 416(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 448(CX), Y6 VMOVDQU 480(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 512(CX), Y6 VMOVDQU 544(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 576(CX), Y6 VMOVDQU 608(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 2 to 5 outputs VMOVDQU (DI), Y8 ADDQ $0x20, DI @@ -103083,67 +57694,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 672(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 704(CX), Y6 VMOVDQU 736(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 768(CX), Y6 VMOVDQU 800(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 832(CX), Y6 VMOVDQU 864(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 896(CX), Y6 VMOVDQU 928(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 3 to 5 outputs VMOVDQU (R8), Y8 ADDQ $0x20, R8 @@ -103154,67 +57726,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 992(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1024(CX), Y6 VMOVDQU 1056(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1088(CX), Y6 VMOVDQU 1120(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1152(CX), Y6 VMOVDQU 1184(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1216(CX), Y6 VMOVDQU 1248(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 4 to 5 outputs VMOVDQU (R9), Y8 ADDQ $0x20, R9 @@ -103225,67 +57758,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1312(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1344(CX), Y6 VMOVDQU 1376(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1408(CX), Y6 VMOVDQU 1440(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1472(CX), Y6 VMOVDQU 1504(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1536(CX), Y6 VMOVDQU 1568(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 5 to 5 outputs VMOVDQU (R10), Y8 ADDQ $0x20, R10 @@ -103296,67 +57790,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1632(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1664(CX), Y6 VMOVDQU 1696(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 1728(CX), Y6 VMOVDQU 1760(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 1792(CX), Y6 VMOVDQU 1824(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 1856(CX), Y6 VMOVDQU 1888(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 6 to 5 outputs VMOVDQU (R11), Y8 ADDQ $0x20, R11 @@ -103367,67 +57822,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 1952(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 1984(CX), Y6 VMOVDQU 2016(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2048(CX), Y6 VMOVDQU 2080(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2112(CX), Y6 VMOVDQU 2144(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2176(CX), Y6 VMOVDQU 2208(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 7 to 5 outputs VMOVDQU (R12), Y8 ADDQ $0x20, R12 @@ -103438,67 +57854,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2272(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2304(CX), Y6 VMOVDQU 2336(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2368(CX), Y6 VMOVDQU 2400(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2432(CX), Y6 VMOVDQU 2464(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2496(CX), Y6 VMOVDQU 2528(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 8 to 5 outputs VMOVDQU (R13), Y8 ADDQ $0x20, R13 @@ -103509,67 +57886,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2592(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2624(CX), Y6 VMOVDQU 2656(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 2688(CX), Y6 VMOVDQU 2720(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 2752(CX), Y6 VMOVDQU 2784(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 2816(CX), Y6 VMOVDQU 2848(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Load and process 32 bytes from input 9 to 5 outputs VMOVDQU (DX), Y8 ADDQ $0x20, DX @@ -103580,67 +57918,28 @@ mulAvxTwo_10x5Xor_loop: VMOVDQU 2912(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y0 - -#else - VPXOR Y6, Y0, Y0 - VPXOR Y7, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y0) VMOVDQU 2944(CX), Y6 VMOVDQU 2976(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y1 - -#else - VPXOR Y6, Y1, Y1 - VPXOR Y7, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y1) VMOVDQU 3008(CX), Y6 VMOVDQU 3040(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y2 - -#else - VPXOR Y6, Y2, Y2 - VPXOR Y7, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y2) VMOVDQU 3072(CX), Y6 VMOVDQU 3104(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y3 - -#else - VPXOR Y6, Y3, Y3 - VPXOR Y7, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y6, Y7, Y3) VMOVDQU 3136(CX), Y6 VMOVDQU 3168(CX), Y7 VPSHUFB Y8, Y6, Y6 VPSHUFB Y9, Y7, Y7 + XOR3WAY( $0x00, Y6, Y7, Y4) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y6, Y7, Y4 - -#else - VPXOR Y6, Y4, Y4 - VPXOR Y7, Y4, Y4 - -#endif // Store 5 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -103750,80 +58049,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -103834,80 +58086,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -103918,80 +58123,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -104002,80 +58160,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -104086,80 +58197,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -104170,80 +58234,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -104254,80 +58271,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 @@ -104338,80 +58308,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -104422,80 +58345,33 @@ mulAvxTwo_10x6_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -104572,90 +58448,43 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 32(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y7 VMOVDQU 96(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y7 VMOVDQU 160(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y7 VMOVDQU 224(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y7 VMOVDQU 288(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y7 VMOVDQU 352(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 1 to 6 outputs VMOVDQU (SI), Y9 ADDQ $0x20, SI @@ -104666,80 +58495,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 416(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 448(CX), Y7 VMOVDQU 480(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 512(CX), Y7 VMOVDQU 544(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 576(CX), Y7 VMOVDQU 608(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 640(CX), Y7 VMOVDQU 672(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 704(CX), Y7 VMOVDQU 736(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 2 to 6 outputs VMOVDQU (DI), Y9 ADDQ $0x20, DI @@ -104750,80 +58532,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 800(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 832(CX), Y7 VMOVDQU 864(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 896(CX), Y7 VMOVDQU 928(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 960(CX), Y7 VMOVDQU 992(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1024(CX), Y7 VMOVDQU 1056(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1088(CX), Y7 VMOVDQU 1120(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 3 to 6 outputs VMOVDQU (R8), Y9 ADDQ $0x20, R8 @@ -104834,80 +58569,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1184(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1216(CX), Y7 VMOVDQU 1248(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1280(CX), Y7 VMOVDQU 1312(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1344(CX), Y7 VMOVDQU 1376(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1408(CX), Y7 VMOVDQU 1440(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1472(CX), Y7 VMOVDQU 1504(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 4 to 6 outputs VMOVDQU (R9), Y9 ADDQ $0x20, R9 @@ -104918,80 +58606,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1568(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1600(CX), Y7 VMOVDQU 1632(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 1664(CX), Y7 VMOVDQU 1696(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 1728(CX), Y7 VMOVDQU 1760(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 1792(CX), Y7 VMOVDQU 1824(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 1856(CX), Y7 VMOVDQU 1888(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 5 to 6 outputs VMOVDQU (R10), Y9 ADDQ $0x20, R10 @@ -105002,80 +58643,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 1952(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 1984(CX), Y7 VMOVDQU 2016(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2048(CX), Y7 VMOVDQU 2080(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2112(CX), Y7 VMOVDQU 2144(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2176(CX), Y7 VMOVDQU 2208(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2240(CX), Y7 VMOVDQU 2272(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 6 to 6 outputs VMOVDQU (R11), Y9 ADDQ $0x20, R11 @@ -105086,80 +58680,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2336(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2368(CX), Y7 VMOVDQU 2400(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2432(CX), Y7 VMOVDQU 2464(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2496(CX), Y7 VMOVDQU 2528(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2560(CX), Y7 VMOVDQU 2592(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 2624(CX), Y7 VMOVDQU 2656(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 7 to 6 outputs VMOVDQU (R12), Y9 ADDQ $0x20, R12 @@ -105170,80 +58717,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 2720(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 2752(CX), Y7 VMOVDQU 2784(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 2816(CX), Y7 VMOVDQU 2848(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 2880(CX), Y7 VMOVDQU 2912(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 2944(CX), Y7 VMOVDQU 2976(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3008(CX), Y7 VMOVDQU 3040(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 8 to 6 outputs VMOVDQU (R13), Y9 ADDQ $0x20, R13 @@ -105254,80 +58754,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3104(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3136(CX), Y7 VMOVDQU 3168(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3200(CX), Y7 VMOVDQU 3232(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3264(CX), Y7 VMOVDQU 3296(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3328(CX), Y7 VMOVDQU 3360(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3392(CX), Y7 VMOVDQU 3424(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Load and process 32 bytes from input 9 to 6 outputs VMOVDQU (DX), Y9 ADDQ $0x20, DX @@ -105338,80 +58791,33 @@ mulAvxTwo_10x6Xor_loop: VMOVDQU 3488(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y0 - -#else - VPXOR Y7, Y0, Y0 - VPXOR Y8, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y0) VMOVDQU 3520(CX), Y7 VMOVDQU 3552(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y1 - -#else - VPXOR Y7, Y1, Y1 - VPXOR Y8, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y1) VMOVDQU 3584(CX), Y7 VMOVDQU 3616(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y2 - -#else - VPXOR Y7, Y2, Y2 - VPXOR Y8, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y2) VMOVDQU 3648(CX), Y7 VMOVDQU 3680(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y3 - -#else - VPXOR Y7, Y3, Y3 - VPXOR Y8, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y3) VMOVDQU 3712(CX), Y7 VMOVDQU 3744(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y4 - -#else - VPXOR Y7, Y4, Y4 - VPXOR Y8, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y7, Y8, Y4) VMOVDQU 3776(CX), Y7 VMOVDQU 3808(CX), Y8 VPSHUFB Y9, Y7, Y7 VPSHUFB Y10, Y8, Y8 + XOR3WAY( $0x00, Y7, Y8, Y5) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y7, Y8, Y5 - -#else - VPXOR Y7, Y5, Y5 - VPXOR Y8, Y5, Y5 - -#endif // Store 6 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -105528,93 +58934,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -105625,93 +58976,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -105722,93 +59018,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -105819,93 +59060,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -105916,93 +59102,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -106013,93 +59144,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -106110,93 +59186,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 @@ -106207,93 +59228,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -106304,93 +59270,38 @@ mulAvxTwo_10x7_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -106469,105 +59380,50 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 32(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y8 VMOVDQU 96(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y8 VMOVDQU 160(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y8 VMOVDQU 224(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y8 VMOVDQU 288(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y8 VMOVDQU 352(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y8 VMOVDQU 416(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 1 to 7 outputs VMOVDQU (SI), Y10 ADDQ $0x20, SI @@ -106578,93 +59434,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 480(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 512(CX), Y8 VMOVDQU 544(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 576(CX), Y8 VMOVDQU 608(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 640(CX), Y8 VMOVDQU 672(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 704(CX), Y8 VMOVDQU 736(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 768(CX), Y8 VMOVDQU 800(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 832(CX), Y8 VMOVDQU 864(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 2 to 7 outputs VMOVDQU (DI), Y10 ADDQ $0x20, DI @@ -106675,93 +59476,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 928(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 960(CX), Y8 VMOVDQU 992(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1024(CX), Y8 VMOVDQU 1056(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1088(CX), Y8 VMOVDQU 1120(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1152(CX), Y8 VMOVDQU 1184(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1216(CX), Y8 VMOVDQU 1248(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1280(CX), Y8 VMOVDQU 1312(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 3 to 7 outputs VMOVDQU (R8), Y10 ADDQ $0x20, R8 @@ -106772,93 +59518,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1376(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1408(CX), Y8 VMOVDQU 1440(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1472(CX), Y8 VMOVDQU 1504(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1536(CX), Y8 VMOVDQU 1568(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 1600(CX), Y8 VMOVDQU 1632(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 1664(CX), Y8 VMOVDQU 1696(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 1728(CX), Y8 VMOVDQU 1760(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 4 to 7 outputs VMOVDQU (R9), Y10 ADDQ $0x20, R9 @@ -106869,93 +59560,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 1824(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 1856(CX), Y8 VMOVDQU 1888(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 1920(CX), Y8 VMOVDQU 1952(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 1984(CX), Y8 VMOVDQU 2016(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2048(CX), Y8 VMOVDQU 2080(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2112(CX), Y8 VMOVDQU 2144(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2176(CX), Y8 VMOVDQU 2208(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 5 to 7 outputs VMOVDQU (R10), Y10 ADDQ $0x20, R10 @@ -106966,93 +59602,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2272(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2304(CX), Y8 VMOVDQU 2336(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2368(CX), Y8 VMOVDQU 2400(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2432(CX), Y8 VMOVDQU 2464(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2496(CX), Y8 VMOVDQU 2528(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 2560(CX), Y8 VMOVDQU 2592(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 2624(CX), Y8 VMOVDQU 2656(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 6 to 7 outputs VMOVDQU (R11), Y10 ADDQ $0x20, R11 @@ -107063,93 +59644,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 2720(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 2752(CX), Y8 VMOVDQU 2784(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 2816(CX), Y8 VMOVDQU 2848(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 2880(CX), Y8 VMOVDQU 2912(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 2944(CX), Y8 VMOVDQU 2976(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3008(CX), Y8 VMOVDQU 3040(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3072(CX), Y8 VMOVDQU 3104(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 7 to 7 outputs VMOVDQU (R12), Y10 ADDQ $0x20, R12 @@ -107160,93 +59686,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3168(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3200(CX), Y8 VMOVDQU 3232(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3264(CX), Y8 VMOVDQU 3296(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3328(CX), Y8 VMOVDQU 3360(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3392(CX), Y8 VMOVDQU 3424(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3456(CX), Y8 VMOVDQU 3488(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3520(CX), Y8 VMOVDQU 3552(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 8 to 7 outputs VMOVDQU (R13), Y10 ADDQ $0x20, R13 @@ -107257,93 +59728,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 3616(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 3648(CX), Y8 VMOVDQU 3680(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 3712(CX), Y8 VMOVDQU 3744(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 3776(CX), Y8 VMOVDQU 3808(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 3840(CX), Y8 VMOVDQU 3872(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 3904(CX), Y8 VMOVDQU 3936(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 3968(CX), Y8 VMOVDQU 4000(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Load and process 32 bytes from input 9 to 7 outputs VMOVDQU (DX), Y10 ADDQ $0x20, DX @@ -107354,93 +59770,38 @@ mulAvxTwo_10x7Xor_loop: VMOVDQU 4064(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y0 - -#else - VPXOR Y8, Y0, Y0 - VPXOR Y9, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y0) VMOVDQU 4096(CX), Y8 VMOVDQU 4128(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y1 - -#else - VPXOR Y8, Y1, Y1 - VPXOR Y9, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y1) VMOVDQU 4160(CX), Y8 VMOVDQU 4192(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y2 - -#else - VPXOR Y8, Y2, Y2 - VPXOR Y9, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y2) VMOVDQU 4224(CX), Y8 VMOVDQU 4256(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y3 - -#else - VPXOR Y8, Y3, Y3 - VPXOR Y9, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y3) VMOVDQU 4288(CX), Y8 VMOVDQU 4320(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y4 - -#else - VPXOR Y8, Y4, Y4 - VPXOR Y9, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y4) VMOVDQU 4352(CX), Y8 VMOVDQU 4384(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y5 - -#else - VPXOR Y8, Y5, Y5 - VPXOR Y9, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y8, Y9, Y5) VMOVDQU 4416(CX), Y8 VMOVDQU 4448(CX), Y9 VPSHUFB Y10, Y8, Y8 VPSHUFB Y11, Y9, Y9 + XOR3WAY( $0x00, Y8, Y9, Y6) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y8, Y9, Y6 - -#else - VPXOR Y8, Y6, Y6 - VPXOR Y9, Y6, Y6 - -#endif // Store 7 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -107564,106 +59925,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -107674,106 +59972,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -107784,106 +60019,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -107894,106 +60066,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -108004,106 +60113,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -108114,106 +60160,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -108224,106 +60207,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 @@ -108334,106 +60254,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -108444,106 +60301,43 @@ mulAvxTwo_10x8_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -108624,120 +60418,57 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 32(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y9 VMOVDQU 96(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y9 VMOVDQU 160(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y9 VMOVDQU 224(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y9 VMOVDQU 288(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y9 VMOVDQU 352(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y9 VMOVDQU 416(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y9 VMOVDQU 480(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 1 to 8 outputs VMOVDQU (SI), Y11 ADDQ $0x20, SI @@ -108748,106 +60479,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 544(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 576(CX), Y9 VMOVDQU 608(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 640(CX), Y9 VMOVDQU 672(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 704(CX), Y9 VMOVDQU 736(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 768(CX), Y9 VMOVDQU 800(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 832(CX), Y9 VMOVDQU 864(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 896(CX), Y9 VMOVDQU 928(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 960(CX), Y9 VMOVDQU 992(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 2 to 8 outputs VMOVDQU (DI), Y11 ADDQ $0x20, DI @@ -108858,106 +60526,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1056(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1088(CX), Y9 VMOVDQU 1120(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1152(CX), Y9 VMOVDQU 1184(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1216(CX), Y9 VMOVDQU 1248(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1280(CX), Y9 VMOVDQU 1312(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1344(CX), Y9 VMOVDQU 1376(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1408(CX), Y9 VMOVDQU 1440(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1472(CX), Y9 VMOVDQU 1504(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 3 to 8 outputs VMOVDQU (R8), Y11 ADDQ $0x20, R8 @@ -108968,106 +60573,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 1568(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 1600(CX), Y9 VMOVDQU 1632(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 1664(CX), Y9 VMOVDQU 1696(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 1728(CX), Y9 VMOVDQU 1760(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 1792(CX), Y9 VMOVDQU 1824(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 1856(CX), Y9 VMOVDQU 1888(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 1920(CX), Y9 VMOVDQU 1952(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 1984(CX), Y9 VMOVDQU 2016(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 4 to 8 outputs VMOVDQU (R9), Y11 ADDQ $0x20, R9 @@ -109078,106 +60620,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2080(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2112(CX), Y9 VMOVDQU 2144(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2176(CX), Y9 VMOVDQU 2208(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2240(CX), Y9 VMOVDQU 2272(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2304(CX), Y9 VMOVDQU 2336(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2368(CX), Y9 VMOVDQU 2400(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2432(CX), Y9 VMOVDQU 2464(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 2496(CX), Y9 VMOVDQU 2528(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 5 to 8 outputs VMOVDQU (R10), Y11 ADDQ $0x20, R10 @@ -109188,106 +60667,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 2592(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 2624(CX), Y9 VMOVDQU 2656(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 2688(CX), Y9 VMOVDQU 2720(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 2752(CX), Y9 VMOVDQU 2784(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 2816(CX), Y9 VMOVDQU 2848(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 2880(CX), Y9 VMOVDQU 2912(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 2944(CX), Y9 VMOVDQU 2976(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3008(CX), Y9 VMOVDQU 3040(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 6 to 8 outputs VMOVDQU (R11), Y11 ADDQ $0x20, R11 @@ -109298,106 +60714,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3104(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3136(CX), Y9 VMOVDQU 3168(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3200(CX), Y9 VMOVDQU 3232(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3264(CX), Y9 VMOVDQU 3296(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3328(CX), Y9 VMOVDQU 3360(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3392(CX), Y9 VMOVDQU 3424(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3456(CX), Y9 VMOVDQU 3488(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 3520(CX), Y9 VMOVDQU 3552(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 7 to 8 outputs VMOVDQU (R12), Y11 ADDQ $0x20, R12 @@ -109408,106 +60761,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 3616(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 3648(CX), Y9 VMOVDQU 3680(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 3712(CX), Y9 VMOVDQU 3744(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 3776(CX), Y9 VMOVDQU 3808(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 3840(CX), Y9 VMOVDQU 3872(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 3904(CX), Y9 VMOVDQU 3936(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 3968(CX), Y9 VMOVDQU 4000(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4032(CX), Y9 VMOVDQU 4064(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 8 to 8 outputs VMOVDQU (R13), Y11 ADDQ $0x20, R13 @@ -109518,106 +60808,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4128(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4160(CX), Y9 VMOVDQU 4192(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4224(CX), Y9 VMOVDQU 4256(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4288(CX), Y9 VMOVDQU 4320(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4352(CX), Y9 VMOVDQU 4384(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4416(CX), Y9 VMOVDQU 4448(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4480(CX), Y9 VMOVDQU 4512(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 4544(CX), Y9 VMOVDQU 4576(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Load and process 32 bytes from input 9 to 8 outputs VMOVDQU (DX), Y11 ADDQ $0x20, DX @@ -109628,106 +60855,43 @@ mulAvxTwo_10x8Xor_loop: VMOVDQU 4640(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y0 - -#else - VPXOR Y9, Y0, Y0 - VPXOR Y10, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y0) VMOVDQU 4672(CX), Y9 VMOVDQU 4704(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y1 - -#else - VPXOR Y9, Y1, Y1 - VPXOR Y10, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y1) VMOVDQU 4736(CX), Y9 VMOVDQU 4768(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y2 - -#else - VPXOR Y9, Y2, Y2 - VPXOR Y10, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y2) VMOVDQU 4800(CX), Y9 VMOVDQU 4832(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y3 - -#else - VPXOR Y9, Y3, Y3 - VPXOR Y10, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y3) VMOVDQU 4864(CX), Y9 VMOVDQU 4896(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y4 - -#else - VPXOR Y9, Y4, Y4 - VPXOR Y10, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y4) VMOVDQU 4928(CX), Y9 VMOVDQU 4960(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y5 - -#else - VPXOR Y9, Y5, Y5 - VPXOR Y10, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y5) VMOVDQU 4992(CX), Y9 VMOVDQU 5024(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y6 - -#else - VPXOR Y9, Y6, Y6 - VPXOR Y10, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y9, Y10, Y6) VMOVDQU 5056(CX), Y9 VMOVDQU 5088(CX), Y10 VPSHUFB Y11, Y9, Y9 VPSHUFB Y12, Y10, Y10 + XOR3WAY( $0x00, Y9, Y10, Y7) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y9, Y10, Y7 - -#else - VPXOR Y9, Y7, Y7 - VPXOR Y10, Y7, Y7 - -#endif // Store 8 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -109858,119 +61022,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -109981,119 +61074,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -110104,119 +61126,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -110227,119 +61178,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -110350,119 +61230,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -110473,119 +61282,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -110596,119 +61334,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 @@ -110719,119 +61386,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -110842,119 +61438,48 @@ mulAvxTwo_10x9_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -111037,135 +61562,64 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 32(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y10 VMOVDQU 96(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y10 VMOVDQU 160(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y10 VMOVDQU 224(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y10 VMOVDQU 288(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y10 VMOVDQU 352(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y10 VMOVDQU 416(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y10 VMOVDQU 480(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y10 VMOVDQU 544(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 1 to 9 outputs VMOVDQU (SI), Y12 ADDQ $0x20, SI @@ -111176,119 +61630,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 608(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 640(CX), Y10 VMOVDQU 672(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 704(CX), Y10 VMOVDQU 736(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 768(CX), Y10 VMOVDQU 800(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 832(CX), Y10 VMOVDQU 864(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 896(CX), Y10 VMOVDQU 928(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 960(CX), Y10 VMOVDQU 992(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1024(CX), Y10 VMOVDQU 1056(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1088(CX), Y10 VMOVDQU 1120(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 2 to 9 outputs VMOVDQU (DI), Y12 ADDQ $0x20, DI @@ -111299,119 +61682,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1184(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1216(CX), Y10 VMOVDQU 1248(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1280(CX), Y10 VMOVDQU 1312(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1344(CX), Y10 VMOVDQU 1376(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1408(CX), Y10 VMOVDQU 1440(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 1472(CX), Y10 VMOVDQU 1504(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 1536(CX), Y10 VMOVDQU 1568(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 1600(CX), Y10 VMOVDQU 1632(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 1664(CX), Y10 VMOVDQU 1696(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 3 to 9 outputs VMOVDQU (R8), Y12 ADDQ $0x20, R8 @@ -111422,119 +61734,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 1760(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 1792(CX), Y10 VMOVDQU 1824(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 1856(CX), Y10 VMOVDQU 1888(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 1920(CX), Y10 VMOVDQU 1952(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 1984(CX), Y10 VMOVDQU 2016(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2048(CX), Y10 VMOVDQU 2080(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2112(CX), Y10 VMOVDQU 2144(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2176(CX), Y10 VMOVDQU 2208(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2240(CX), Y10 VMOVDQU 2272(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 4 to 9 outputs VMOVDQU (R9), Y12 ADDQ $0x20, R9 @@ -111545,119 +61786,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2336(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2368(CX), Y10 VMOVDQU 2400(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 2432(CX), Y10 VMOVDQU 2464(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 2496(CX), Y10 VMOVDQU 2528(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 2560(CX), Y10 VMOVDQU 2592(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 2624(CX), Y10 VMOVDQU 2656(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 2688(CX), Y10 VMOVDQU 2720(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 2752(CX), Y10 VMOVDQU 2784(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 2816(CX), Y10 VMOVDQU 2848(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 5 to 9 outputs VMOVDQU (R10), Y12 ADDQ $0x20, R10 @@ -111668,119 +61838,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 2912(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 2944(CX), Y10 VMOVDQU 2976(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3008(CX), Y10 VMOVDQU 3040(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3072(CX), Y10 VMOVDQU 3104(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3136(CX), Y10 VMOVDQU 3168(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3200(CX), Y10 VMOVDQU 3232(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3264(CX), Y10 VMOVDQU 3296(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3328(CX), Y10 VMOVDQU 3360(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3392(CX), Y10 VMOVDQU 3424(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 6 to 9 outputs VMOVDQU (R11), Y12 ADDQ $0x20, R11 @@ -111791,119 +61890,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 3488(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 3520(CX), Y10 VMOVDQU 3552(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 3584(CX), Y10 VMOVDQU 3616(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 3648(CX), Y10 VMOVDQU 3680(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 3712(CX), Y10 VMOVDQU 3744(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 3776(CX), Y10 VMOVDQU 3808(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 3840(CX), Y10 VMOVDQU 3872(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 3904(CX), Y10 VMOVDQU 3936(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 3968(CX), Y10 VMOVDQU 4000(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 7 to 9 outputs VMOVDQU (R12), Y12 ADDQ $0x20, R12 @@ -111914,119 +61942,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4064(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4096(CX), Y10 VMOVDQU 4128(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4160(CX), Y10 VMOVDQU 4192(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4224(CX), Y10 VMOVDQU 4256(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4288(CX), Y10 VMOVDQU 4320(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4352(CX), Y10 VMOVDQU 4384(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4416(CX), Y10 VMOVDQU 4448(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 4480(CX), Y10 VMOVDQU 4512(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 4544(CX), Y10 VMOVDQU 4576(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 8 to 9 outputs VMOVDQU (R13), Y12 ADDQ $0x20, R13 @@ -112037,119 +61994,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 4640(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 4672(CX), Y10 VMOVDQU 4704(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 4736(CX), Y10 VMOVDQU 4768(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 4800(CX), Y10 VMOVDQU 4832(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 4864(CX), Y10 VMOVDQU 4896(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 4928(CX), Y10 VMOVDQU 4960(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 4992(CX), Y10 VMOVDQU 5024(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5056(CX), Y10 VMOVDQU 5088(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5120(CX), Y10 VMOVDQU 5152(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Load and process 32 bytes from input 9 to 9 outputs VMOVDQU (DX), Y12 ADDQ $0x20, DX @@ -112160,119 +62046,48 @@ mulAvxTwo_10x9Xor_loop: VMOVDQU 5216(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y0 - -#else - VPXOR Y10, Y0, Y0 - VPXOR Y11, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y0) VMOVDQU 5248(CX), Y10 VMOVDQU 5280(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y1 - -#else - VPXOR Y10, Y1, Y1 - VPXOR Y11, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y1) VMOVDQU 5312(CX), Y10 VMOVDQU 5344(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y2 - -#else - VPXOR Y10, Y2, Y2 - VPXOR Y11, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y2) VMOVDQU 5376(CX), Y10 VMOVDQU 5408(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y3 - -#else - VPXOR Y10, Y3, Y3 - VPXOR Y11, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y3) VMOVDQU 5440(CX), Y10 VMOVDQU 5472(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y4 - -#else - VPXOR Y10, Y4, Y4 - VPXOR Y11, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y4) VMOVDQU 5504(CX), Y10 VMOVDQU 5536(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y5 - -#else - VPXOR Y10, Y5, Y5 - VPXOR Y11, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y5) VMOVDQU 5568(CX), Y10 VMOVDQU 5600(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y6 - -#else - VPXOR Y10, Y6, Y6 - VPXOR Y11, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y6) VMOVDQU 5632(CX), Y10 VMOVDQU 5664(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y7 - -#else - VPXOR Y10, Y7, Y7 - VPXOR Y11, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y10, Y11, Y7) VMOVDQU 5696(CX), Y10 VMOVDQU 5728(CX), Y11 VPSHUFB Y12, Y10, Y10 VPSHUFB Y13, Y11, Y11 + XOR3WAY( $0x00, Y10, Y11, Y8) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y10, Y11, Y8 - -#else - VPXOR Y10, Y8, Y8 - VPXOR Y11, Y8, Y8 - -#endif // Store 9 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -112410,132 +62225,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -112546,132 +62282,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -112682,132 +62339,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -112818,132 +62396,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -112954,132 +62453,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -113090,132 +62510,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -113226,132 +62567,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 @@ -113362,132 +62624,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -113498,132 +62681,53 @@ mulAvxTwo_10x10_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1) @@ -113708,150 +62812,71 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 32(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) MOVQ 24(R14), BP VMOVDQU (BP)(R15*1), Y1 VMOVDQU 64(CX), Y11 VMOVDQU 96(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) MOVQ 48(R14), BP VMOVDQU (BP)(R15*1), Y2 VMOVDQU 128(CX), Y11 VMOVDQU 160(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) MOVQ 72(R14), BP VMOVDQU (BP)(R15*1), Y3 VMOVDQU 192(CX), Y11 VMOVDQU 224(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) MOVQ 96(R14), BP VMOVDQU (BP)(R15*1), Y4 VMOVDQU 256(CX), Y11 VMOVDQU 288(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) MOVQ 120(R14), BP VMOVDQU (BP)(R15*1), Y5 VMOVDQU 320(CX), Y11 VMOVDQU 352(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) MOVQ 144(R14), BP VMOVDQU (BP)(R15*1), Y6 VMOVDQU 384(CX), Y11 VMOVDQU 416(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) MOVQ 168(R14), BP VMOVDQU (BP)(R15*1), Y7 VMOVDQU 448(CX), Y11 VMOVDQU 480(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) MOVQ 192(R14), BP VMOVDQU (BP)(R15*1), Y8 VMOVDQU 512(CX), Y11 VMOVDQU 544(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) MOVQ 216(R14), BP VMOVDQU (BP)(R15*1), Y9 VMOVDQU 576(CX), Y11 VMOVDQU 608(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 1 to 10 outputs VMOVDQU (SI), Y13 ADDQ $0x20, SI @@ -113862,132 +62887,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 672(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 704(CX), Y11 VMOVDQU 736(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 768(CX), Y11 VMOVDQU 800(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 832(CX), Y11 VMOVDQU 864(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 896(CX), Y11 VMOVDQU 928(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 960(CX), Y11 VMOVDQU 992(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1024(CX), Y11 VMOVDQU 1056(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1088(CX), Y11 VMOVDQU 1120(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1152(CX), Y11 VMOVDQU 1184(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1216(CX), Y11 VMOVDQU 1248(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 2 to 10 outputs VMOVDQU (DI), Y13 ADDQ $0x20, DI @@ -113998,132 +62944,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1312(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1344(CX), Y11 VMOVDQU 1376(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 1408(CX), Y11 VMOVDQU 1440(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 1472(CX), Y11 VMOVDQU 1504(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 1536(CX), Y11 VMOVDQU 1568(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 1600(CX), Y11 VMOVDQU 1632(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 1664(CX), Y11 VMOVDQU 1696(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 1728(CX), Y11 VMOVDQU 1760(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 1792(CX), Y11 VMOVDQU 1824(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 1856(CX), Y11 VMOVDQU 1888(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 3 to 10 outputs VMOVDQU (R8), Y13 ADDQ $0x20, R8 @@ -114134,132 +63001,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 1952(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 1984(CX), Y11 VMOVDQU 2016(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2048(CX), Y11 VMOVDQU 2080(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2112(CX), Y11 VMOVDQU 2144(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2176(CX), Y11 VMOVDQU 2208(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2240(CX), Y11 VMOVDQU 2272(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2304(CX), Y11 VMOVDQU 2336(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 2368(CX), Y11 VMOVDQU 2400(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 2432(CX), Y11 VMOVDQU 2464(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 2496(CX), Y11 VMOVDQU 2528(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 4 to 10 outputs VMOVDQU (R9), Y13 ADDQ $0x20, R9 @@ -114270,132 +63058,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 2592(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 2624(CX), Y11 VMOVDQU 2656(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 2688(CX), Y11 VMOVDQU 2720(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 2752(CX), Y11 VMOVDQU 2784(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 2816(CX), Y11 VMOVDQU 2848(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 2880(CX), Y11 VMOVDQU 2912(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 2944(CX), Y11 VMOVDQU 2976(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3008(CX), Y11 VMOVDQU 3040(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3072(CX), Y11 VMOVDQU 3104(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3136(CX), Y11 VMOVDQU 3168(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 5 to 10 outputs VMOVDQU (R10), Y13 ADDQ $0x20, R10 @@ -114406,132 +63115,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3232(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3264(CX), Y11 VMOVDQU 3296(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3328(CX), Y11 VMOVDQU 3360(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 3392(CX), Y11 VMOVDQU 3424(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 3456(CX), Y11 VMOVDQU 3488(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 3520(CX), Y11 VMOVDQU 3552(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 3584(CX), Y11 VMOVDQU 3616(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 3648(CX), Y11 VMOVDQU 3680(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 3712(CX), Y11 VMOVDQU 3744(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 3776(CX), Y11 VMOVDQU 3808(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 6 to 10 outputs VMOVDQU (R11), Y13 ADDQ $0x20, R11 @@ -114542,132 +63172,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 3872(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 3904(CX), Y11 VMOVDQU 3936(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 3968(CX), Y11 VMOVDQU 4000(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4032(CX), Y11 VMOVDQU 4064(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4096(CX), Y11 VMOVDQU 4128(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4160(CX), Y11 VMOVDQU 4192(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4224(CX), Y11 VMOVDQU 4256(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4288(CX), Y11 VMOVDQU 4320(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4352(CX), Y11 VMOVDQU 4384(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 4416(CX), Y11 VMOVDQU 4448(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 7 to 10 outputs VMOVDQU (R12), Y13 ADDQ $0x20, R12 @@ -114678,132 +63229,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 4512(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 4544(CX), Y11 VMOVDQU 4576(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 4608(CX), Y11 VMOVDQU 4640(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 4672(CX), Y11 VMOVDQU 4704(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 4736(CX), Y11 VMOVDQU 4768(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 4800(CX), Y11 VMOVDQU 4832(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 4864(CX), Y11 VMOVDQU 4896(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 4928(CX), Y11 VMOVDQU 4960(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 4992(CX), Y11 VMOVDQU 5024(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5056(CX), Y11 VMOVDQU 5088(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 8 to 10 outputs VMOVDQU (R13), Y13 ADDQ $0x20, R13 @@ -114814,132 +63286,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5152(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5184(CX), Y11 VMOVDQU 5216(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5248(CX), Y11 VMOVDQU 5280(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5312(CX), Y11 VMOVDQU 5344(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 5376(CX), Y11 VMOVDQU 5408(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 5440(CX), Y11 VMOVDQU 5472(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 5504(CX), Y11 VMOVDQU 5536(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 5568(CX), Y11 VMOVDQU 5600(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 5632(CX), Y11 VMOVDQU 5664(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 5696(CX), Y11 VMOVDQU 5728(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Load and process 32 bytes from input 9 to 10 outputs VMOVDQU (DX), Y13 ADDQ $0x20, DX @@ -114950,132 +63343,53 @@ mulAvxTwo_10x10Xor_loop: VMOVDQU 5792(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y0 - -#else - VPXOR Y11, Y0, Y0 - VPXOR Y12, Y0, Y0 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y0) VMOVDQU 5824(CX), Y11 VMOVDQU 5856(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y1 - -#else - VPXOR Y11, Y1, Y1 - VPXOR Y12, Y1, Y1 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y1) VMOVDQU 5888(CX), Y11 VMOVDQU 5920(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y2 - -#else - VPXOR Y11, Y2, Y2 - VPXOR Y12, Y2, Y2 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y2) VMOVDQU 5952(CX), Y11 VMOVDQU 5984(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y3 - -#else - VPXOR Y11, Y3, Y3 - VPXOR Y12, Y3, Y3 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y3) VMOVDQU 6016(CX), Y11 VMOVDQU 6048(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y4 - -#else - VPXOR Y11, Y4, Y4 - VPXOR Y12, Y4, Y4 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y4) VMOVDQU 6080(CX), Y11 VMOVDQU 6112(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y5 - -#else - VPXOR Y11, Y5, Y5 - VPXOR Y12, Y5, Y5 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y5) VMOVDQU 6144(CX), Y11 VMOVDQU 6176(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y6 - -#else - VPXOR Y11, Y6, Y6 - VPXOR Y12, Y6, Y6 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y6) VMOVDQU 6208(CX), Y11 VMOVDQU 6240(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y7 - -#else - VPXOR Y11, Y7, Y7 - VPXOR Y12, Y7, Y7 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y7) VMOVDQU 6272(CX), Y11 VMOVDQU 6304(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 - -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y8 - -#else - VPXOR Y11, Y8, Y8 - VPXOR Y12, Y8, Y8 - -#endif + XOR3WAY( $0x00, Y11, Y12, Y8) VMOVDQU 6336(CX), Y11 VMOVDQU 6368(CX), Y12 VPSHUFB Y13, Y11, Y11 VPSHUFB Y14, Y12, Y12 + XOR3WAY( $0x00, Y11, Y12, Y9) -#ifdef GOAMD64_v4 - VPTERNLOGD $0x96, Y11, Y12, Y9 - -#else - VPXOR Y11, Y9, Y9 - VPXOR Y12, Y9, Y9 - -#endif // Store 10 outputs MOVQ (R14), BP VMOVDQU Y0, (BP)(R15*1)